Coverage for nova/compute/manager.py: 93%
4979 statements
coverage.py v7.6.12, created at 2025-04-24 11:16 +0000
1# Copyright 2010 United States Government as represented by the
2# Administrator of the National Aeronautics and Space Administration.
3# Copyright 2011 Justin Santa Barbara
4# All Rights Reserved.
5#
6# Licensed under the Apache License, Version 2.0 (the "License"); you may
7# not use this file except in compliance with the License. You may obtain
8# a copy of the License at
9#
10# http://www.apache.org/licenses/LICENSE-2.0
11#
12# Unless required by applicable law or agreed to in writing, software
13# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
14# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
15# License for the specific language governing permissions and limitations
16# under the License.
18"""Handles all processes relating to instances (guest vms).
20The :py:class:`ComputeManager` class is a :py:class:`nova.manager.Manager` that
21handles RPC calls relating to creating instances. It is responsible for
22building a disk image, launching it via the underlying virtualization driver,
23responding to calls to check its state, attaching persistent storage, and
24terminating it.
26"""
28import base64
29import binascii
30import contextlib
31import copy
32import functools
33import inspect
34import math
35import sys
36import time
37import traceback
38import typing as ty
40from cinderclient import exceptions as cinder_exception
41from cursive import exception as cursive_exception
42import eventlet.event
43from eventlet import greenthread
44import eventlet.semaphore
45import eventlet.timeout
46import futurist
47from keystoneauth1 import exceptions as keystone_exception
48from openstack import exceptions as sdk_exc
49import os_traits
50from oslo_log import log as logging
51import oslo_messaging as messaging
52from oslo_serialization import jsonutils
53from oslo_service import loopingcall
54from oslo_service import periodic_task
55from oslo_utils import excutils
56from oslo_utils import strutils
57from oslo_utils import timeutils
58from oslo_utils import units
60from nova.accelerator import cyborg
61from nova import block_device
62from nova.compute import api as compute
63from nova.compute import build_results
64from nova.compute import claims
65from nova.compute import power_state
66from nova.compute import resource_tracker
67from nova.compute import rpcapi as compute_rpcapi
68from nova.compute import task_states
69from nova.compute import utils as compute_utils
70from nova.compute.utils import wrap_instance_event
71from nova.compute import vm_states
72from nova import conductor
73import nova.conf
74import nova.context
75from nova import exception
76from nova import exception_wrapper
77from nova.i18n import _
78from nova.image import glance
79from nova import manager
80from nova.network import model as network_model
81from nova.network import neutron
82from nova import objects
83from nova.objects import base as obj_base
84from nova.objects import external_event as external_event_obj
85from nova.objects import fields
86from nova.objects import instance as obj_instance
87from nova.objects import migrate_data as migrate_data_obj
88from nova.objects import service as service_obj
89from nova.pci import request as pci_req_module
90from nova.pci import whitelist
91from nova import safe_utils
92from nova.scheduler.client import query
93from nova.scheduler.client import report
94from nova.scheduler import utils as scheduler_utils
95from nova.share import manila
96from nova import utils
97from nova.virt import block_device as driver_block_device
98from nova.virt import configdrive
99from nova.virt import driver
100from nova.virt import event as virtevent
101from nova.virt import hardware
102import nova.virt.node
103from nova.virt import storage_users
104from nova.virt import virtapi
105from nova.volume import cinder
107CONF = nova.conf.CONF
109LOG = logging.getLogger(__name__)
111wrap_exception = functools.partial(
112 exception_wrapper.wrap_exception, service='compute', binary='nova-compute')
115@contextlib.contextmanager
116def errors_out_migration_ctxt(migration):
117 """Context manager to error out migration on failure."""
119 try:
120 yield
121 except Exception:
122 with excutils.save_and_reraise_exception():
123 if migration:
124 # We may have been passed None for our migration if we're
125 # receiving from an older client. The migration will be
126 # errored via the legacy path.
127 migration.status = 'error'
128 try:
129 migration.save()
130 except Exception:
131 LOG.debug(
132 'Error setting migration status for instance %s.',
133 migration.instance_uuid, exc_info=True)
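# Illustrative usage sketch (the wrapped call below is a placeholder, not a
# method of this module): migration-phase work is run inside the context
# manager so any failure marks the migration as errored before re-raising.
#
#     with errors_out_migration_ctxt(migration):
#         do_migration_phase_work(context, instance, migration)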
136@utils.expects_func_args('migration')
137def errors_out_migration(function):
138 """Decorator to error out migration on failure."""
140 @functools.wraps(function)
141 def decorated_function(self, context, *args, **kwargs):
142 wrapped_func = safe_utils.get_wrapped_function(function)
143 keyed_args = inspect.getcallargs(wrapped_func, self, context,
144 *args, **kwargs)
145 migration = keyed_args['migration']
146 with errors_out_migration_ctxt(migration):
147 return function(self, context, *args, **kwargs)
149 return decorated_function
152@utils.expects_func_args('instance')
153def reverts_task_state(function):
154 """Decorator to revert task_state on failure."""
156 @functools.wraps(function)
157 def decorated_function(self, context, *args, **kwargs):
158 try:
159 return function(self, context, *args, **kwargs)
160 except exception.UnexpectedTaskStateError as e:
161 # Note(maoy): unexpected task state means the current
162 # task is preempted. Do not clear task state in this
163 # case.
164 with excutils.save_and_reraise_exception():
165 LOG.info("Task possibly preempted: %s",
166 e.format_message())
167 except Exception:
168 with excutils.save_and_reraise_exception():
169 wrapped_func = safe_utils.get_wrapped_function(function)
170 keyed_args = inspect.getcallargs(wrapped_func, self, context,
171 *args, **kwargs)
172 # NOTE(mriedem): 'instance' must be in keyed_args because we
173 # have utils.expects_func_args('instance') decorating this
174 # method.
175 instance = keyed_args['instance']
176 original_task_state = instance.task_state
177 try:
178 self._instance_update(context, instance, task_state=None)
179 LOG.info("Successfully reverted task state from %s on "
180 "failure for instance.",
181 original_task_state, instance=instance)
182 except exception.InstanceNotFound:
183 # We might delete an instance that failed to build shortly
 184 # after it errored out; this is an expected case and we
185 # should not trace on it.
186 pass
187 except Exception as e:
188 LOG.warning("Failed to revert task state for instance. "
189 "Error: %s", e, instance=instance)
191 return decorated_function
194@utils.expects_func_args('instance')
195def wrap_instance_fault(function):
196 """Wraps a method to catch exceptions related to instances.
 198 This decorator wraps a method to catch any instance-related exceptions
 199 that may be thrown. It then logs an instance fault in the DB.
200 """
202 @functools.wraps(function)
203 def decorated_function(self, context, *args, **kwargs):
204 try:
205 return function(self, context, *args, **kwargs)
206 except exception.InstanceNotFound:
207 raise
208 except Exception as e:
209 # NOTE(gtt): If argument 'instance' is in args rather than kwargs,
210 # we will get a KeyError exception which will cover up the real
211 # exception. So, we update kwargs with the values from args first.
212 # then, we can get 'instance' from kwargs easily.
213 kwargs.update(dict(zip(function.__code__.co_varnames[2:], args)))
215 with excutils.save_and_reraise_exception():
216 compute_utils.add_instance_fault_from_exc(context,
217 kwargs['instance'], e, sys.exc_info())
219 return decorated_function
222@utils.expects_func_args('image_id', 'instance')
223def delete_image_on_error(function):
224 """Used for snapshot related method to ensure the image created in
225 compute.api is deleted when an error occurs.
226 """
228 @functools.wraps(function)
229 def decorated_function(self, context, image_id, instance,
230 *args, **kwargs):
231 try:
232 return function(self, context, image_id, instance,
233 *args, **kwargs)
234 except Exception:
235 with excutils.save_and_reraise_exception():
236 compute_utils.delete_image(
237 context, instance, self.image_api, image_id,
238 log_exc_info=True)
240 return decorated_function
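# Illustrative sketch of how the decorators above are typically stacked on a
# manager RPC method; the method name and body are placeholders only.
#
#     @wrap_exception()
#     @reverts_task_state
#     @wrap_instance_fault
#     def example_rpc_method(self, context, instance):
#         ...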
243# Each collection of events is a dict of eventlet Events keyed by a tuple of
244# event name and associated tag
245_InstanceEvents = ty.Dict[ty.Tuple[str, str], eventlet.event.Event]
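# For illustration, the InstanceEvents class below keys this alias by
# instance UUID, e.g. (values are placeholders):
#
#     {instance_uuid: {('network-vif-plugged', port_id): eventlet.event.Event()}}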
248class InstanceEvents(object):
249 def __init__(self):
250 self._events: ty.Optional[ty.Dict[str, _InstanceEvents]] = {}
252 @staticmethod
253 def _lock_name(instance) -> str:
254 return '%s-%s' % (instance.uuid, 'events')
256 def prepare_for_instance_event(
257 self,
258 instance: 'objects.Instance',
259 name: str,
260 tag: str,
261 ) -> eventlet.event.Event:
262 """Prepare to receive an event for an instance.
264 This will register an event for the given instance that we will
265 wait on later. This should be called before initiating whatever
266 action will trigger the event. The resulting eventlet.event.Event
267 object should be wait()'d on to ensure completion.
269 :param instance: the instance for which the event will be generated
270 :param name: the name of the event we're expecting
271 :param tag: the tag associated with the event we're expecting
272 :returns: an event object that should be wait()'d on
273 """
274 @utils.synchronized(self._lock_name(instance))
275 def _create_or_get_event():
276 if self._events is None:
277 # NOTE(danms): We really should have a more specific error
278 # here, but this is what we use for our default error case
279 raise exception.NovaException(
280 'In shutdown, no new events can be scheduled')
282 instance_events = self._events.setdefault(instance.uuid, {})
283 return instance_events.setdefault((name, tag),
284 eventlet.event.Event())
285 LOG.debug('Preparing to wait for external event %(name)s-%(tag)s',
286 {'name': name, 'tag': tag}, instance=instance)
287 return _create_or_get_event()
289 def pop_instance_event(self, instance, event):
290 """Remove a pending event from the wait list.
292 This will remove a pending event from the wait list so that it
293 can be used to signal the waiters to wake up.
295 :param instance: the instance for which the event was generated
296 :param event: the nova.objects.external_event.InstanceExternalEvent
297 that describes the event
298 :returns: the eventlet.event.Event object on which the waiters
299 are blocked
300 """
301 no_events_sentinel = object()
302 no_matching_event_sentinel = object()
304 @utils.synchronized(self._lock_name(instance))
305 def _pop_event():
306 if self._events is None:
307 LOG.debug('Unexpected attempt to pop events during shutdown',
308 instance=instance)
309 return no_events_sentinel
310 events = self._events.get(instance.uuid)
 311 if not events:  [311 ↛ 312: condition on line 311 was never true]
312 return no_events_sentinel
313 _event = events.pop((event.name, event.tag), None)
 314 if not events:  [314 ↛ 316: condition on line 314 was always true]
315 del self._events[instance.uuid]
 316 if _event is None:  [316 ↛ 317: condition on line 316 was never true]
317 return no_matching_event_sentinel
318 return _event
320 result = _pop_event()
321 if result is no_events_sentinel:
322 LOG.debug('No waiting events found dispatching %(event)s',
323 {'event': event.key},
324 instance=instance)
325 return None
 326 elif result is no_matching_event_sentinel:  [326 ↛ 327: condition on line 326 was never true]
327 LOG.debug(
328 'No event matching %(event)s in %(events)s',
329 {
330 'event': event.key,
331 # mypy can't identify the none check in _pop_event
332 'events': self._events.get( # type: ignore
333 instance.uuid, {}).keys(),
334 },
335 instance=instance,
336 )
337 return None
338 else:
339 return result
341 def clear_events_for_instance(self, instance):
342 """Remove all pending events for an instance.
344 This will remove all events currently pending for an instance
345 and return them (indexed by event name).
347 :param instance: the instance for which events should be purged
348 :returns: a dictionary of {event_name: eventlet.event.Event}
349 """
350 @utils.synchronized(self._lock_name(instance))
351 def _clear_events():
352 if self._events is None:
353 LOG.debug('Unexpected attempt to clear events during shutdown',
354 instance=instance)
355 return dict()
356 # NOTE(danms): We have historically returned the raw internal
 357 # format here, which is {event.key: [events, ...]} so just
358 # trivially convert it here.
359 return {'%s-%s' % k: e
360 for k, e in self._events.pop(instance.uuid, {}).items()}
361 return _clear_events()
363 def cancel_all_events(self):
364 if self._events is None:
365 LOG.debug('Unexpected attempt to cancel events during shutdown.')
366 return
367 our_events = self._events
368 # NOTE(danms): Block new events
369 self._events = None
371 for instance_uuid, events in our_events.items():
372 for (name, tag), eventlet_event in events.items():
373 LOG.debug('Canceling in-flight event %(name)s-%(tag)s for '
374 'instance %(instance_uuid)s',
375 {'name': name,
376 'tag': tag,
377 'instance_uuid': instance_uuid})
378 event = objects.InstanceExternalEvent(
379 instance_uuid=instance_uuid,
380 name=name, status='failed',
381 tag=tag, data={})
382 eventlet_event.send(event)
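# Minimal sketch of the prepare/pop flow (event name and tag are example
# values): a waiter registers interest before triggering the action, and the
# external-event handler later pops and sends on the same Event object.
#
#     ev = self.instance_events.prepare_for_instance_event(
#         instance, 'network-vif-plugged', port_id)
#     trigger_action_that_emits_the_event()   # placeholder
#     ev.wait()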
385class ComputeVirtAPI(virtapi.VirtAPI):
386 def __init__(self, compute):
387 super(ComputeVirtAPI, self).__init__()
388 self._compute = compute
389 self.reportclient = compute.reportclient
391 class ExitEarly(Exception):
392 def __init__(self, events):
393 super(Exception, self).__init__()
394 self.events = events
396 self._exit_early_exc = ExitEarly
398 def exit_wait_early(self, events):
399 """Exit a wait_for_instance_event() immediately and avoid
400 waiting for some events.
 402 :param events: A list of (name, tag) tuples for events that we should
403 skip waiting for during a wait_for_instance_event().
404 """
405 raise self._exit_early_exc(events=events)
407 def _default_error_callback(self, event_name, instance):
408 raise exception.NovaException(_('Instance event failed'))
410 class _InstanceEvent:
411 EXPECTED = "expected"
412 WAITING = "waiting"
413 RECEIVED = "received"
414 RECEIVED_EARLY = "received early"
415 TIMED_OUT = "timed out"
416 RECEIVED_NOT_PROCESSED = "received but not processed"
418 def __init__(self, name: str, event: eventlet.event.Event) -> None:
419 self.name = name
420 self.event = event
421 self.status = self.EXPECTED
422 self.wait_time = None
424 def mark_as_received_early(self) -> None:
425 self.status = self.RECEIVED_EARLY
427 def is_received_early(self) -> bool:
428 return self.status == self.RECEIVED_EARLY
430 def _update_status_no_wait(self):
431 if self.status == self.EXPECTED and self.event.ready():
432 self.status = self.RECEIVED_NOT_PROCESSED
434 def wait(self) -> 'objects.InstanceExternalEvent':
435 self.status = self.WAITING
436 try:
437 with timeutils.StopWatch() as sw:
438 instance_event = self.event.wait()
439 except eventlet.timeout.Timeout:
440 self.status = self.TIMED_OUT
441 self.wait_time = sw.elapsed()
443 raise
445 self.status = self.RECEIVED
446 self.wait_time = sw.elapsed()
447 return instance_event
449 def __str__(self) -> str:
450 self._update_status_no_wait()
451 if self.status == self.EXPECTED:
452 return f"{self.name}: expected but not received"
453 if self.status == self.RECEIVED:
454 return (
455 f"{self.name}: received after waiting "
456 f"{self.wait_time:.2f} seconds")
457 if self.status == self.TIMED_OUT:
458 return (
459 f"{self.name}: timed out after "
460 f"{self.wait_time:.2f} seconds")
461 return f"{self.name}: {self.status}"
463 @staticmethod
464 def _wait_for_instance_events(
465 instance: 'objects.Instance',
466 events: dict,
467 error_callback: ty.Callable,
468 ) -> None:
469 for event_name, event in events.items():
470 if event.is_received_early():
471 continue
472 else:
473 actual_event = event.wait()
474 if actual_event.status == 'completed':
475 continue
476 # If we get here, we have an event that was not completed,
477 # nor skipped via exit_wait_early(). Decide whether to
478 # keep waiting by calling the error_callback() hook.
479 decision = error_callback(event_name, instance)
 480 if decision is False:  [480 ↛ 481: condition on line 480 was never true]
481 break
483 @contextlib.contextmanager
484 def wait_for_instance_event(self, instance, event_names, deadline=300,
485 error_callback=None):
486 """Plan to wait for some events, run some code, then wait.
488 This context manager will first create plans to wait for the
489 provided event_names, yield, and then wait for all the scheduled
490 events to complete.
492 Note that this uses an eventlet.timeout.Timeout to bound the
493 operation, so callers should be prepared to catch that
494 failure and handle that situation appropriately.
496 If the event is not received by the specified timeout deadline,
497 eventlet.timeout.Timeout is raised.
499 If the event is received but did not have a 'completed'
500 status, a NovaException is raised. If an error_callback is
501 provided, instead of raising an exception as detailed above
502 for the failure case, the callback will be called with the
503 event_name and instance, and can return True to continue
504 waiting for the rest of the events, False to stop processing,
505 or raise an exception which will bubble up to the waiter.
507 If the inner code wishes to abort waiting for one or more
508 events because it knows some state to be finished or condition
509 to be satisfied, it can use VirtAPI.exit_wait_early() with a
510 list of event (name,tag) items to avoid waiting for those
511 events upon context exit. Note that exit_wait_early() exits
512 the context immediately and should be used to signal that all
513 work has been completed and provide the unified list of events
514 that need not be waited for. Waiting for the remaining events
515 will begin immediately upon early exit as if the context was
516 exited normally.
518 :param instance: The instance for which an event is expected
519 :param event_names: A list of event names. Each element is a
520 tuple of strings to indicate (name, tag),
521 where name is required, but tag may be None.
522 :param deadline: Maximum number of seconds we should wait for all
523 of the specified events to arrive.
524 :param error_callback: A function to be called if an event arrives
526 """
528 if error_callback is None:
529 error_callback = self._default_error_callback
530 events = {}
531 for event_name in event_names:
532 name, tag = event_name
533 event_name = objects.InstanceExternalEvent.make_key(name, tag)
534 try:
535 event = (
536 self._compute.instance_events.prepare_for_instance_event(
537 instance, name, tag))
538 events[event_name] = self._InstanceEvent(event_name, event)
539 except exception.NovaException:
540 error_callback(event_name, instance)
541 # NOTE(danms): Don't wait for any of the events. They
542 # should all be canceled and fired immediately below,
543 # but don't stick around if not.
544 deadline = 0
545 try:
546 yield
547 except self._exit_early_exc as e:
548 early_events = set([objects.InstanceExternalEvent.make_key(n, t)
549 for n, t in e.events])
 551 # If there are expected events that were received early, mark them,
552 # so they won't be waited for later
553 for early_event_name in early_events:
 554 if early_event_name in events:  [554 ↛ 553: condition on line 554 was always true]
555 events[early_event_name].mark_as_received_early()
557 sw = timeutils.StopWatch()
558 sw.start()
559 try:
560 with eventlet.timeout.Timeout(deadline):
561 self._wait_for_instance_events(
562 instance, events, error_callback)
563 except eventlet.timeout.Timeout:
564 LOG.warning(
565 'Timeout waiting for %(events)s for instance with '
566 'vm_state %(vm_state)s and task_state %(task_state)s. '
567 'Event states are: %(event_states)s',
568 {
569 'events': list(events.keys()),
570 'vm_state': instance.vm_state,
571 'task_state': instance.task_state,
572 'event_states':
573 ', '.join([str(event) for event in events.values()]),
574 },
575 instance=instance)
577 raise
579 LOG.debug('Instance event wait completed in %i seconds for %s',
580 sw.elapsed(),
581 ','.join(x[0] for x in event_names),
582 instance=instance)
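# Minimal usage sketch for wait_for_instance_event(); the event tuple and the
# triggering call are placeholders. Waiting for the registered events starts
# when the with-block exits (or when exit_wait_early() is raised inside it).
#
#     with self.virtapi.wait_for_instance_event(
#             instance, [('network-vif-plugged', port_id)], deadline=300):
#         trigger_action_that_emits_the_event()   # placeholder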
584 def update_compute_provider_status(self, context, rp_uuid, enabled):
585 """Used to add/remove the COMPUTE_STATUS_DISABLED trait on the provider
587 :param context: nova auth RequestContext
588 :param rp_uuid: UUID of a compute node resource provider in Placement
589 :param enabled: True if the node is enabled in which case the trait
590 would be removed, False if the node is disabled in which case
591 the trait would be added.
592 :raises: ResourceProviderTraitRetrievalFailed
593 :raises: ResourceProviderUpdateConflict
594 :raises: ResourceProviderUpdateFailed
595 :raises: TraitRetrievalFailed
596 :raises: keystoneauth1.exceptions.ClientException
597 """
598 trait_name = os_traits.COMPUTE_STATUS_DISABLED
599 # Get the current traits (and generation) for the provider.
600 # TODO(mriedem): Leverage the ProviderTree cache in get_provider_traits
601 trait_info = self.reportclient.get_provider_traits(context, rp_uuid)
602 # If the host is enabled, remove the trait (if set), else add
603 # the trait if it doesn't already exist.
604 original_traits = trait_info.traits
605 new_traits = None
606 if enabled and trait_name in original_traits:
607 new_traits = original_traits - {trait_name}
608 LOG.debug('Removing trait %s from compute node resource '
609 'provider %s in placement.', trait_name, rp_uuid)
610 elif not enabled and trait_name not in original_traits:
611 new_traits = original_traits | {trait_name}
612 LOG.debug('Adding trait %s to compute node resource '
613 'provider %s in placement.', trait_name, rp_uuid)
615 if new_traits is not None:
616 self.reportclient.set_traits_for_provider(
617 context, rp_uuid, new_traits, generation=trait_info.generation)
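# Illustrative call (rp_uuid is a compute node provider UUID): disabling a
# compute service adds the COMPUTE_STATUS_DISABLED trait to its provider,
# while re-enabling the service removes the trait.
#
#     virtapi.update_compute_provider_status(context, rp_uuid, enabled=False)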
620class ComputeManager(manager.Manager):
621 """Manages the running instances from creation to destruction."""
623 target = messaging.Target(version='6.4')
625 def __init__(self, compute_driver=None, *args, **kwargs):
626 """Load configuration options and connect to the hypervisor."""
627 # We want the ComputeManager, ResourceTracker and ComputeVirtAPI all
628 # using the same instance of SchedulerReportClient which has the
629 # ProviderTree cache for this compute service.
630 # NOTE(danms): We do not use the global placement client
631 # singleton here, because the above-mentioned stack of objects
632 # maintain local state in the client. Thus, keeping our own
633 # private object for that stack avoids any potential conflict
634 # with other users in our process outside of the above.
635 self.reportclient = report.SchedulerReportClient()
636 self.virtapi = ComputeVirtAPI(self)
637 self.network_api = neutron.API()
638 self.volume_api = cinder.API()
639 self.manila_api = manila.API()
640 self.image_api = glance.API()
641 self._last_bw_usage_poll = 0.0
642 self.compute_api = compute.API()
643 self.compute_rpcapi = compute_rpcapi.ComputeAPI()
644 self.compute_task_api = conductor.ComputeTaskAPI()
645 self.query_client = query.SchedulerQueryClient()
646 self.instance_events = InstanceEvents()
647 self._sync_power_pool = eventlet.GreenPool(
648 size=CONF.sync_power_state_pool_size)
649 self._syncs_in_progress = {}
650 self.send_instance_updates = (
651 CONF.filter_scheduler.track_instance_changes)
652 if CONF.max_concurrent_builds != 0:
653 self._build_semaphore = eventlet.semaphore.Semaphore(
654 CONF.max_concurrent_builds)
655 else:
656 self._build_semaphore = compute_utils.UnlimitedSemaphore()
657 if CONF.max_concurrent_snapshots > 0:
658 self._snapshot_semaphore = eventlet.semaphore.Semaphore(
659 CONF.max_concurrent_snapshots)
660 else:
661 self._snapshot_semaphore = compute_utils.UnlimitedSemaphore()
662 if CONF.max_concurrent_live_migrations > 0:
663 self._live_migration_executor = futurist.GreenThreadPoolExecutor(
664 max_workers=CONF.max_concurrent_live_migrations)
665 else:
666 # CONF.max_concurrent_live_migrations is 0 (unlimited)
667 self._live_migration_executor = futurist.GreenThreadPoolExecutor()
668 # This is a dict, keyed by instance uuid, to a two-item tuple of
669 # migration object and Future for the queued live migration.
670 self._waiting_live_migrations = {}
672 super(ComputeManager, self).__init__(service_name="compute",
673 *args, **kwargs)
675 # TODO(sbauza): Remove this call once we delete the V5Proxy class
676 self.additional_endpoints.append(_ComputeV5Proxy(self))
678 # NOTE(russellb) Load the driver last. It may call back into the
679 # compute manager via the virtapi, so we want it to be fully
680 # initialized before that happens.
681 self.service_ref = None
682 self.driver = driver.load_compute_driver(self.virtapi, compute_driver)
683 self.rt = resource_tracker.ResourceTracker(
684 self.host, self.driver, reportclient=self.reportclient)
686 def reset(self):
687 LOG.info('Reloading compute RPC API')
688 compute_rpcapi.reset_globals()
689 self.compute_rpcapi = compute_rpcapi.ComputeAPI()
690 self.reportclient.clear_provider_cache()
692 def _update_resource_tracker(self, context, instance):
693 """Let the resource tracker know that an instance has changed state."""
695 if instance.host == self.host:
696 self.rt.update_usage(context, instance, instance.node)
698 def _instance_update(self, context, instance, **kwargs):
699 """Update an instance in the database using kwargs as value."""
701 for k, v in kwargs.items():
702 setattr(instance, k, v)
703 instance.save()
704 self._update_resource_tracker(context, instance)
706 def _nil_out_instance_obj_host_and_node(self, instance):
707 # NOTE(jwcroppe): We don't do instance.save() here for performance
708 # reasons; a call to this is expected to be immediately followed by
709 # another call that does instance.save(), thus avoiding two writes
710 # to the database layer.
711 instance.host = None
712 instance.node = None
713 instance.compute_id = None
714 # ResourceTracker._set_instance_host_and_node also sets launched_on
715 # to the same value as host and is really only ever used by legacy
716 # nova-network code, but we should also null it out to avoid confusion
717 # if there is an instance in the database with no host set but
718 # launched_on is set. Note that we do not care about using launched_on
719 # as some kind of debug helper if diagnosing a build failure, that is
720 # what instance action events are for.
721 instance.launched_on = None
722 # If the instance is not on a host, it's not in an aggregate and
723 # therefore is not in an availability zone.
724 instance.availability_zone = None
726 def _set_instance_obj_error_state(self, instance, clean_task_state=False):
727 try:
728 instance.vm_state = vm_states.ERROR
729 if clean_task_state:
730 instance.task_state = None
731 instance.save()
732 except exception.InstanceNotFound:
733 LOG.debug('Instance has been destroyed from under us while '
734 'trying to set it to ERROR', instance=instance)
736 def _get_instances_on_driver(self, context, filters=None):
737 """Return a list of instance records for the instances found
738 on the hypervisor which satisfy the specified filters. If filters=None
739 return a list of instance records for all the instances found on the
740 hypervisor.
741 """
742 if not filters:
743 filters = {}
744 try:
745 driver_uuids = self.driver.list_instance_uuids()
746 if len(driver_uuids) == 0:
747 # Short circuit, don't waste a DB call
748 return objects.InstanceList()
749 filters['uuid'] = driver_uuids
750 local_instances = objects.InstanceList.get_by_filters(
751 context, filters, use_slave=True)
752 return local_instances
753 except NotImplementedError:
754 pass
756 # The driver doesn't support uuids listing, so we'll have
757 # to brute force.
758 driver_instances = self.driver.list_instances()
759 # NOTE(mjozefcz): In this case we need to apply host filter.
760 # Without this all instance data would be fetched from db.
761 filters['host'] = self.host
762 instances = objects.InstanceList.get_by_filters(context, filters,
763 use_slave=True)
764 name_map = {instance.name: instance for instance in instances}
765 local_instances = []
766 for driver_instance in driver_instances:
767 instance = name_map.get(driver_instance)
 768 if not instance:  [768 ↛ 769: condition on line 768 was never true]
769 continue
770 local_instances.append(instance)
771 return local_instances
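# Example filter usage (illustrative): restrict the lookup to non-deleted
# instances reported by the hypervisor.
#
#     instances = self._get_instances_on_driver(context, {'deleted': False})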
773 def _destroy_evacuated_instances(self, context, node_cache):
774 """Destroys evacuated instances.
776 While nova-compute was down, the instances running on it could be
777 evacuated to another host. This method looks for evacuation migration
778 records where this is the source host and which were either started
779 (accepted), in-progress (pre-migrating) or migrated (done). From those
780 migration records, local instances reported by the hypervisor are
781 compared to the instances for the migration records and those local
782 guests are destroyed, along with instance allocation records in
783 Placement for this node.
784 Then allocations are removed from Placement for every instance that is
 785 evacuated from this host, regardless of whether the instance is reported by
786 hypervisor or not.
788 :param context: The request context
789 :param node_cache: A dict of ComputeNode objects keyed by the UUID of
790 the compute node
791 :return: A dict keyed by instance uuid mapped to Migration objects
792 for instances that were migrated away from this host
793 """
794 filters = {
795 'source_compute': self.host,
796 # NOTE(mriedem): Migration records that have been accepted are
797 # included in case the source node comes back up while instances
798 # are being evacuated to another host. We don't want the same
799 # instance being reported from multiple hosts.
800 # NOTE(lyarwood): pre-migrating is also included here as the
801 # source compute can come back online shortly after the RT
802 # claims on the destination that in-turn moves the migration to
803 # pre-migrating. If the evacuate fails on the destination host,
804 # the user can rebuild the instance (in ERROR state) on the source
805 # host.
806 'status': ['accepted', 'pre-migrating', 'done'],
807 'migration_type': fields.MigrationType.EVACUATION,
808 }
809 with utils.temporary_mutation(context, read_deleted='yes'):
810 evacuations = objects.MigrationList.get_by_filters(context,
811 filters)
812 if not evacuations:
813 return {}
814 evacuations = {mig.instance_uuid: mig for mig in evacuations}
816 # TODO(mriedem): We could optimize by pre-loading the joined fields
817 # we know we'll use, like info_cache and flavor.
818 local_instances = self._get_instances_on_driver(context)
819 evacuated_local_instances = {inst.uuid: inst
820 for inst in local_instances
821 if inst.uuid in evacuations}
823 for instance in evacuated_local_instances.values():
824 LOG.info('Destroying instance as it has been evacuated from '
825 'this host but still exists in the hypervisor',
826 instance=instance)
827 try:
828 network_info = self.network_api.get_instance_nw_info(
829 context, instance)
830 bdi = self._get_instance_block_device_info(context,
831 instance)
832 evac = evacuations[instance.uuid]
833 destroy_disks = not (self._is_instance_storage_shared(
834 context, instance, host=evac.dest_compute))
835 except exception.InstanceNotFound:
836 network_info = network_model.NetworkInfo()
837 bdi = {}
838 LOG.info('Instance has been marked deleted already, '
839 'removing it from the hypervisor.',
840 instance=instance)
841 # always destroy disks if the instance was deleted
842 destroy_disks = True
843 self.driver.destroy(context, instance,
844 network_info,
845 bdi, destroy_disks)
847 hostname_to_cn_uuid = {
848 cn.hypervisor_hostname: cn.uuid
849 for cn in node_cache.values()}
851 for instance_uuid, migration in evacuations.items():
852 try:
853 if instance_uuid in evacuated_local_instances:
854 # Avoid the db call if we already have the instance loaded
855 # above
856 instance = evacuated_local_instances[instance_uuid]
857 else:
858 instance = objects.Instance.get_by_uuid(
859 context, instance_uuid)
860 except exception.InstanceNotFound:
 861 # The instance was already deleted, so we expect that every
862 # allocation of that instance has already been cleaned up
863 continue
865 LOG.info('Cleaning up allocations of the instance as it has been '
866 'evacuated from this host',
867 instance=instance)
868 if migration.source_node not in hostname_to_cn_uuid:
869 LOG.error("Failed to clean allocation of evacuated "
870 "instance as the source node %s is not found",
871 migration.source_node, instance=instance)
872 continue
873 cn_uuid = hostname_to_cn_uuid[migration.source_node]
875 # If the instance was deleted in the interim, assume its
876 # allocations were properly cleaned up (either by its hosting
877 # compute service or the API).
 878 if (not instance.deleted and  [878 ↛ 882: condition on line 878 was never true]
879 not self.reportclient.
880 remove_provider_tree_from_instance_allocation(
881 context, instance.uuid, cn_uuid)):
882 LOG.error("Failed to clean allocation of evacuated instance "
883 "on the source node %s",
884 cn_uuid, instance=instance)
886 migration.status = 'completed'
887 migration.save()
888 return evacuations
890 def _is_instance_storage_shared(self, context, instance, host=None):
891 shared_storage = True
892 data = None
893 try:
894 data = self.driver.check_instance_shared_storage_local(context,
895 instance)
 896 if data:  [896 ↛ 910: condition on line 896 was always true]
897 shared_storage = (self.compute_rpcapi.
898 check_instance_shared_storage(context,
899 data, instance=instance, host=host))
900 except NotImplementedError:
901 LOG.debug('Hypervisor driver does not support '
902 'instance shared storage check, '
903 'assuming it\'s not on shared storage',
904 instance=instance)
905 shared_storage = False
906 except Exception:
907 LOG.exception('Failed to check if instance shared',
908 instance=instance)
909 finally:
910 if data:
911 self.driver.check_instance_shared_storage_cleanup(context,
912 data)
913 return shared_storage
915 def _complete_partial_deletion(self, context, instance):
916 """Complete deletion for instances in DELETED status but not marked as
917 deleted in the DB
918 """
919 instance.destroy()
920 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
921 context, instance.uuid)
922 self._complete_deletion(context,
923 instance)
924 self._notify_about_instance_usage(context, instance, "delete.end")
925 compute_utils.notify_about_instance_action(context, instance,
926 self.host, action=fields.NotificationAction.DELETE,
927 phase=fields.NotificationPhase.END, bdms=bdms)
929 def _complete_deletion(self, context, instance):
930 self._update_resource_tracker(context, instance)
932 # If we're configured to do deferred deletes, don't force deletion of
933 # allocations if there's a conflict.
934 force = False if CONF.reclaim_instance_interval > 0 else True
936 self.reportclient.delete_allocation_for_instance(context,
937 instance.uuid,
938 force=force)
940 self._clean_instance_console_tokens(context, instance)
941 self._delete_scheduler_instance_info(context, instance.uuid)
943 def _validate_pinning_configuration(self, instances):
944 if not self.driver.capabilities.get('supports_pcpus', False):
945 return
947 for instance in instances:
948 # ignore deleted instances
949 if instance.deleted:
950 continue
952 # if this is an unpinned instance and the host only has
953 # 'cpu_dedicated_set' configured, we need to tell the operator to
954 # correct their configuration
955 if not instance.numa_topology or (
956 instance.numa_topology.cpu_policy in (
957 None, fields.CPUAllocationPolicy.SHARED
958 )
959 ):
960 # we don't need to check 'vcpu_pin_set' since it can't coexist
961 # alongside 'cpu_dedicated_set'
962 if (CONF.compute.cpu_dedicated_set and
963 not CONF.compute.cpu_shared_set):
964 msg = _("This host has unpinned instances but has no CPUs "
965 "set aside for this purpose; configure '[compute] "
966 "cpu_shared_set' instead of, or in addition to, "
967 "'[compute] cpu_dedicated_set'")
968 raise exception.InvalidConfiguration(msg)
970 continue
972 # ditto for pinned instances if only 'cpu_shared_set' is configured
973 if (CONF.compute.cpu_shared_set and
974 not CONF.compute.cpu_dedicated_set and
975 not CONF.vcpu_pin_set):
976 msg = _("This host has pinned instances but has no CPUs "
977 "set aside for this purpose; configure '[compute] "
978 "cpu_dedicated_set' instead of, or in addition to, "
979 "'[compute] cpu_shared_set'.")
980 raise exception.InvalidConfiguration(msg)
982 # if this is a mixed instance with both pinned and unpinned CPUs,
983 # the host must have both 'cpu_dedicated_set' and 'cpu_shared_set'
 984 # configured. Check if 'cpu_shared_set' is set.
985 if (instance.numa_topology.cpu_policy ==
986 fields.CPUAllocationPolicy.MIXED and
987 not CONF.compute.cpu_shared_set):
988 msg = _("This host has mixed instance requesting both pinned "
989 "and unpinned CPUs but hasn't set aside unpinned CPUs "
990 "for this purpose; Configure "
991 "'[compute] cpu_shared_set'.")
992 raise exception.InvalidConfiguration(msg)
994 # for mixed instance check if 'cpu_dedicated_set' is set.
 995 if (instance.numa_topology.cpu_policy ==  [995 ↛ 998: condition on line 995 was never true]
996 fields.CPUAllocationPolicy.MIXED and
997 not CONF.compute.cpu_dedicated_set):
998 msg = _("This host has mixed instance requesting both pinned "
999 "and unpinned CPUs but hasn't set aside pinned CPUs "
1000 "for this purpose; Configure "
1001 "'[compute] cpu_dedicated_set'")
1002 raise exception.InvalidConfiguration(msg)
1004 # also check to make sure the operator hasn't accidentally
1005 # dropped some cores that instances are currently using
1006 available_dedicated_cpus = (hardware.get_vcpu_pin_set() or
1007 hardware.get_cpu_dedicated_set())
1008 pinned_cpus = instance.numa_topology.cpu_pinning
1009 if available_dedicated_cpus and (
1010 pinned_cpus - available_dedicated_cpus):
1011 # we can't raise an exception because of bug #1289064,
1012 # which meant we didn't recalculate CPU pinning information
1013 # when we live migrated a pinned instance
1014 LOG.warning(
1015 "Instance is pinned to host CPUs %(cpus)s "
1016 "but one or more of these CPUs are not included in "
1017 "either '[compute] cpu_dedicated_set' or "
1018 "'vcpu_pin_set'; you should update these "
1019 "configuration options to include the missing CPUs "
1020 "or rebuild or cold migrate this instance.",
1021 {'cpus': list(pinned_cpus)},
1022 instance=instance)
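# Illustrative nova.conf layout satisfying the checks above on a host that
# runs both pinned and unpinned (or mixed) instances; the CPU ranges are
# examples only.
#
#     [compute]
#     cpu_shared_set = 0-3
#     cpu_dedicated_set = 4-15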
1024 def _validate_vtpm_configuration(self, instances):
1025 if self.driver.capabilities.get('supports_vtpm', False):
1026 return
1028 for instance in instances:
 1029 if instance.deleted:  [1029 ↛ 1030: condition on line 1029 was never true]
1030 continue
1032 # NOTE(stephenfin): We don't have an attribute on the instance to
1033 # check for this, so we need to inspect the flavor/image metadata
1034 if hardware.get_vtpm_constraint(
1035 instance.flavor, instance.image_meta,
1036 ):
1037 msg = _(
1038 'This host has instances with the vTPM feature enabled, '
1039 'but the host is not correctly configured; enable '
1040 'vTPM support.'
1041 )
1042 raise exception.InvalidConfiguration(msg)
1044 def _reset_live_migration(self, context, instance):
1045 migration = None
1046 try:
1047 migration = objects.Migration.get_by_instance_and_status(
1048 context, instance.uuid, 'running')
 1049 if migration:  [1049 ↛ 1055: condition on line 1049 was always true]
1050 self.live_migration_abort(context, instance, migration.id)
1051 except Exception:
1052 LOG.exception('Failed to abort live-migration',
1053 instance=instance)
1054 finally:
 1055 if migration:  [1055 ↛ 1057: condition on line 1055 was always true]
1056 self._set_migration_status(migration, 'error')
1057 LOG.info('Instance found in migrating state during '
1058 'startup. Resetting task_state',
1059 instance=instance)
1060 instance.task_state = None
1061 instance.save(expected_task_state=[task_states.MIGRATING])
1063 def _init_instance(self, context, instance):
1064 """Initialize this instance during service init."""
1066 # NOTE(danms): If the instance appears to not be owned by this
1067 # host, it may have been evacuated away, but skipped by the
1068 # evacuation cleanup code due to configuration. Thus, if that
1069 # is a possibility, don't touch the instance in any way, but
1070 # log the concern. This will help avoid potential issues on
1071 # startup due to misconfiguration.
1072 if instance.host != self.host:
1073 LOG.warning('Instance %(uuid)s appears to not be owned '
1074 'by this host, but by %(host)s. Startup '
1075 'processing is being skipped.',
1076 {'uuid': instance.uuid,
1077 'host': instance.host})
1078 return
 1080 # Instances that are shut down or in an error state cannot be
1081 # initialized and are not attempted to be recovered. The exception
1082 # to this are instances that are in RESIZE_MIGRATING or DELETING,
1083 # which are dealt with further down.
 1084 if (instance.vm_state == vm_states.SOFT_DELETED or  [1084 ↛ 1088: condition on line 1084 was never true]
1085 (instance.vm_state == vm_states.ERROR and
1086 instance.task_state not in
1087 (task_states.RESIZE_MIGRATING, task_states.DELETING))):
1088 LOG.debug("Instance is in %s state.",
1089 instance.vm_state, instance=instance)
1090 return
1092 if instance.vm_state == vm_states.DELETED:
1093 try:
1094 self._complete_partial_deletion(context, instance)
1095 except Exception:
 1096 # we don't want an exception to block init_host
1097 LOG.exception('Failed to complete a deletion',
1098 instance=instance)
1099 return
1101 if (instance.vm_state == vm_states.BUILDING or
1102 instance.task_state in [task_states.SCHEDULING,
1103 task_states.BLOCK_DEVICE_MAPPING,
1104 task_states.NETWORKING,
1105 task_states.SPAWNING]):
1106 # NOTE(dave-mcnally) compute stopped before instance was fully
1107 # spawned so set to ERROR state. This is safe to do as the state
1108 # may be set by the api but the host is not so if we get here the
1109 # instance has already been scheduled to this particular host.
1110 LOG.debug("Instance failed to spawn correctly, "
1111 "setting to ERROR state", instance=instance)
1112 self._set_instance_obj_error_state(instance, clean_task_state=True)
1113 return
1115 if (instance.vm_state in [vm_states.ACTIVE, vm_states.STOPPED] and
1116 instance.task_state in [task_states.REBUILDING,
1117 task_states.REBUILD_BLOCK_DEVICE_MAPPING,
1118 task_states.REBUILD_SPAWNING]):
1119 # NOTE(jichenjc) compute stopped before instance was fully
1120 # spawned so set to ERROR state. This is consistent to BUILD
1121 LOG.debug("Instance failed to rebuild correctly, "
1122 "setting to ERROR state", instance=instance)
1123 self._set_instance_obj_error_state(instance, clean_task_state=True)
1124 return
1126 if (instance.vm_state != vm_states.ERROR and
1127 instance.task_state in [task_states.IMAGE_SNAPSHOT_PENDING,
1128 task_states.IMAGE_PENDING_UPLOAD,
1129 task_states.IMAGE_UPLOADING,
1130 task_states.IMAGE_SNAPSHOT]):
1131 LOG.debug("Instance in transitional state %s at start-up "
1132 "clearing task state",
1133 instance.task_state, instance=instance)
1134 instance.task_state = None
1135 instance.save()
1137 if (instance.vm_state != vm_states.ERROR and
1138 instance.task_state in [task_states.RESIZE_PREP]):
1139 LOG.debug("Instance in transitional state %s at start-up "
1140 "clearing task state",
1141 instance['task_state'], instance=instance)
1142 instance.task_state = None
1143 instance.save()
1145 if instance.task_state == task_states.DELETING:
1146 try:
1147 LOG.info('Service started deleting the instance during '
1148 'the previous run, but did not finish. Restarting'
1149 ' the deletion now.', instance=instance)
1150 instance.obj_load_attr('metadata')
1151 instance.obj_load_attr('system_metadata')
1152 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
1153 context, instance.uuid)
1154 self._delete_instance(context, instance, bdms)
1155 except Exception:
 1156 # we don't want an exception to block init_host
1157 LOG.exception('Failed to complete a deletion',
1158 instance=instance)
1159 self._set_instance_obj_error_state(instance)
1160 return
1162 current_power_state = self._get_power_state(instance)
1163 try_reboot, reboot_type = self._retry_reboot(
1164 instance, current_power_state)
1166 # NOTE(amorin)
1167 # If the instance is in power_state_SHUTDOWN, we will try_reboot
1168 if try_reboot:
1169 LOG.debug("Instance in transitional state (%(task_state)s) at "
1170 "start-up and power state is (%(power_state)s), "
1171 "triggering reboot",
1172 {'task_state': instance.task_state,
1173 'power_state': current_power_state},
1174 instance=instance)
1176 # NOTE(mikal): if the instance was doing a soft reboot that got as
1177 # far as shutting down the instance but not as far as starting it
1178 # again, then we've just become a hard reboot. That means the
1179 # task state for the instance needs to change so that we're in one
1180 # of the expected task states for a hard reboot.
1181 if (instance.task_state in task_states.soft_reboot_states and
1182 reboot_type == 'HARD'):
1183 instance.task_state = task_states.REBOOT_PENDING_HARD
1184 instance.save()
1186 self.reboot_instance(context, instance, block_device_info=None,
1187 reboot_type=reboot_type)
1188 return
1190 # NOTE(plestang): an instance might be in power_state.RUNNING with a
1191 # transient state when a host is brutally shutdown or rebooted while a
1192 # reboot/pause/unpause is scheduled on client side
1193 elif (current_power_state == power_state.RUNNING and
1194 instance.task_state in [task_states.REBOOT_STARTED,
1195 task_states.REBOOT_STARTED_HARD,
1196 task_states.REBOOTING_HARD,
1197 task_states.REBOOTING,
1198 task_states.PAUSING,
1199 task_states.UNPAUSING]):
1200 LOG.warning("Instance in transitional state "
1201 "(%(task_state)s) at start-up and power state "
1202 "is (%(power_state)s), clearing task state",
1203 {'task_state': instance.task_state,
1204 'power_state': current_power_state},
1205 instance=instance)
1206 instance.task_state = None
1207 instance.vm_state = vm_states.ACTIVE
1208 instance.save()
1209 elif (current_power_state == power_state.PAUSED and
1210 instance.task_state == task_states.UNPAUSING):
1211 LOG.warning("Instance in transitional state "
1212 "(%(task_state)s) at start-up and power state "
1213 "is (%(power_state)s), clearing task state "
1214 "and unpausing the instance",
1215 {'task_state': instance.task_state,
1216 'power_state': current_power_state},
1217 instance=instance)
1218 try:
1219 self.unpause_instance(context, instance)
1220 except NotImplementedError:
 1221 # Some virt drivers don't support pause and unpause
1222 pass
1223 except Exception:
1224 LOG.exception('Failed to unpause instance', instance=instance)
1225 return
1227 if instance.task_state == task_states.POWERING_OFF:
1228 try:
1229 LOG.debug("Instance in transitional state %s at start-up "
1230 "retrying stop request",
1231 instance.task_state, instance=instance)
1232 self.stop_instance(context, instance, True)
1233 except Exception:
 1234 # we don't want an exception to block init_host
1235 LOG.exception('Failed to stop instance', instance=instance)
1236 return
1238 if instance.task_state == task_states.POWERING_ON:
1239 try:
1240 LOG.debug("Instance in transitional state %s at start-up "
1241 "retrying start request",
1242 instance.task_state, instance=instance)
1243 self.start_instance(context, instance)
1244 except Exception:
 1245 # we don't want an exception to block init_host
1246 LOG.exception('Failed to start instance', instance=instance)
1247 return
1249 net_info = instance.get_network_info()
1250 try:
1251 self.driver.plug_vifs(instance, net_info)
1252 except NotImplementedError as e:
1253 LOG.debug(e, instance=instance)
1254 except exception.VirtualInterfacePlugException:
1255 # NOTE(mriedem): If we get here, it could be because the vif_type
1256 # in the cache is "binding_failed" or "unbound".
1257 # The periodic task _heal_instance_info_cache checks for this
1258 # condition. It should fix this by binding the ports again when
1259 # it gets to this instance.
1260 LOG.exception('Virtual interface plugging failed for instance. '
1261 'The port binding:host_id may need to be manually '
1262 'updated.', instance=instance)
1263 self._set_instance_obj_error_state(instance)
1264 return
1265 except exception.PciDeviceNotFoundById:
1266 # This is bug 1981813 where the bound port vnic_type has changed
1267 # from direct to macvtap. Nova does not support that and it
1268 # already printed an ERROR when the change is detected during
1269 # _heal_instance_info_cache. Now we print an ERROR again and skip
1270 # plugging the vifs but let the service startup continue to init
1271 # the other instances
1272 LOG.exception(
1273 'Virtual interface plugging failed for instance. Probably the '
1274 'vnic_type of the bound port has been changed. Nova does not '
1275 'support such change.',
1276 instance=instance
1277 )
1278 return
1280 if instance.task_state == task_states.RESIZE_MIGRATING:
1281 # We crashed during resize/migration, so roll back for safety
1282 try:
1283 # NOTE(mriedem): check old_vm_state for STOPPED here, if it's
1284 # not in system_metadata we default to True for backwards
1285 # compatibility
1286 power_on = (instance.system_metadata.get('old_vm_state') !=
1287 vm_states.STOPPED)
1289 block_dev_info = self._get_instance_block_device_info(context,
1290 instance)
1292 migration = objects.Migration.get_by_id_and_instance(
1293 context, instance.migration_context.migration_id,
1294 instance.uuid)
1295 self.driver.finish_revert_migration(context, instance,
1296 net_info, migration, block_dev_info, power_on)
1298 except Exception:
1299 LOG.exception('Failed to revert crashed migration',
1300 instance=instance)
1301 finally:
1302 LOG.info('Instance found in migrating state during '
1303 'startup. Resetting task_state',
1304 instance=instance)
1305 instance.task_state = None
1306 instance.save()
1307 if instance.task_state == task_states.MIGRATING:
1308 # Live migration did not complete, but instance is on this
1309 # host. Abort ongoing migration if still running and reset state.
1310 self._reset_live_migration(context, instance)
1312 db_state = instance.power_state
1313 drv_state = self._get_power_state(instance)
1314 expect_running = (db_state == power_state.RUNNING and
1315 drv_state != db_state)
1317 LOG.debug('Current state is %(drv_state)s, state in DB is '
1318 '%(db_state)s.',
1319 {'drv_state': drv_state, 'db_state': db_state},
1320 instance=instance)
1322 if expect_running and CONF.resume_guests_state_on_host_boot:
1323 self._resume_guests_state(context, instance, net_info)
1325 def _resume_guests_state(self, context, instance, net_info):
1326 LOG.info('Rebooting instance after nova-compute restart.',
1327 instance=instance)
1328 block_device_info = \
1329 self._get_instance_block_device_info(context, instance)
1331 share_info = self._get_share_info(context, instance)
1332 self._mount_all_shares(context, instance, share_info)
1334 try:
1335 self.driver.resume_state_on_host_boot(
1336 context, instance, net_info, share_info, block_device_info)
1337 except NotImplementedError:
1338 LOG.warning('Hypervisor driver does not support '
1339 'resume guests', instance=instance)
1340 except Exception:
1341 # NOTE(vish): The instance failed to resume, so we set the
1342 # instance to error and attempt to continue.
1343 LOG.warning('Failed to resume instance',
1344 instance=instance)
1345 self._set_instance_obj_error_state(instance)
1347 def _retry_reboot(self, instance, current_power_state):
1348 current_task_state = instance.task_state
1349 retry_reboot = False
1350 reboot_type = compute_utils.get_reboot_type(current_task_state,
1351 current_power_state)
1353 pending_soft = (
1354 current_task_state == task_states.REBOOT_PENDING and
1355 instance.vm_state in vm_states.ALLOW_SOFT_REBOOT)
1356 pending_hard = (
1357 current_task_state == task_states.REBOOT_PENDING_HARD and
1358 instance.vm_state in vm_states.ALLOW_HARD_REBOOT)
1359 started_not_running = (current_task_state in
1360 [task_states.REBOOTING,
1361 task_states.REBOOTING_HARD,
1362 task_states.REBOOT_STARTED,
1363 task_states.REBOOT_STARTED_HARD] and
1364 current_power_state != power_state.RUNNING)
1366 if pending_soft or pending_hard or started_not_running:
1367 retry_reboot = True
1369 return retry_reboot, reboot_type
1371 def handle_lifecycle_event(self, event):
1372 LOG.info("VM %(state)s (Lifecycle Event)",
1373 {'state': event.get_name()},
1374 instance_uuid=event.get_instance_uuid())
1375 context = nova.context.get_admin_context(read_deleted='yes')
1376 vm_power_state = None
1377 event_transition = event.get_transition()
1378 if event_transition == virtevent.EVENT_LIFECYCLE_STOPPED:
1379 vm_power_state = power_state.SHUTDOWN
1380 elif event_transition == virtevent.EVENT_LIFECYCLE_STARTED:
1381 vm_power_state = power_state.RUNNING
1382 elif event_transition in (
1383 virtevent.EVENT_LIFECYCLE_PAUSED,
1384 virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED,
1385 virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED):
1386 vm_power_state = power_state.PAUSED
1387 elif event_transition == virtevent.EVENT_LIFECYCLE_RESUMED:
1388 vm_power_state = power_state.RUNNING
1389 elif event_transition == virtevent.EVENT_LIFECYCLE_SUSPENDED:
1390 vm_power_state = power_state.SUSPENDED
1391 else:
1392 LOG.warning("Unexpected lifecycle event: %d", event_transition)
1394 migrate_finish_statuses = {
1395 # This happens on the source node and indicates live migration
1396 # entered post-copy mode.
1397 virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED: 'running (post-copy)',
1398 # Suspended for offline migration.
1399 virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED: 'running'
1400 }
1402 expected_attrs = []
1403 if event_transition in migrate_finish_statuses:
1404 # Join on info_cache since that's needed in migrate_instance_start.
1405 expected_attrs.append('info_cache')
1406 instance = objects.Instance.get_by_uuid(context,
1407 event.get_instance_uuid(),
1408 expected_attrs=expected_attrs)
1410 # Note(lpetrut): The event may be delayed, thus not reflecting
1411 # the current instance power state. In that case, ignore the event.
1412 current_power_state = self._get_power_state(instance)
1413 if current_power_state == vm_power_state:
1414 LOG.debug('Synchronizing instance power state after lifecycle '
1415 'event "%(event)s"; current vm_state: %(vm_state)s, '
1416 'current task_state: %(task_state)s, current DB '
1417 'power_state: %(db_power_state)s, VM power_state: '
1418 '%(vm_power_state)s',
1419 {'event': event.get_name(),
1420 'vm_state': instance.vm_state,
1421 'task_state': instance.task_state,
1422 'db_power_state': instance.power_state,
1423 'vm_power_state': vm_power_state},
1424 instance_uuid=instance.uuid)
1425 self._sync_instance_power_state(context,
1426 instance,
1427 vm_power_state)
1429 # The following checks are for live migration. We want to activate
1430 # the port binding for the destination host before the live migration
1431 # is resumed on the destination host in order to reduce network
1432 # downtime. Otherwise the ports are bound to the destination host
1433 # in post_live_migration_at_destination.
1434 # TODO(danms): Explore options for using a different live migration
1435 # specific callback for this instead of piggy-backing on the
1436 # handle_lifecycle_event callback.
1437 if (instance.task_state == task_states.MIGRATING and
1438 event_transition in migrate_finish_statuses):
1439 status = migrate_finish_statuses[event_transition]
1440 try:
1441 migration = objects.Migration.get_by_instance_and_status(
1442 context, instance.uuid, status)
1443 LOG.debug('Binding ports to destination host: %s',
1444 migration.dest_compute, instance=instance)
1445 # For neutron, migrate_instance_start will activate the
1446 # destination host port bindings, if there are any created by
1447 # conductor before live migration started.
1448 self.network_api.migrate_instance_start(
1449 context, instance, migration)
1450 except exception.MigrationNotFoundByStatus:
1451 LOG.warning("Unable to find migration record with status "
1452 "'%s' for instance. Port binding will happen in "
1453 "post live migration.", status, instance=instance)
1455 def handle_events(self, event):
1456 if isinstance(event, virtevent.LifecycleEvent):  # 1456 ↛ 1463: line 1456 didn't jump to line 1463 because the condition on line 1456 was always true
1457 try:
1458 self.handle_lifecycle_event(event)
1459 except exception.InstanceNotFound:
1460 LOG.debug("Event %s arrived for non-existent instance. The "
1461 "instance was probably deleted.", event)
1462 else:
1463 LOG.debug("Ignoring event %s", event)
1465 def init_virt_events(self):
1466 if CONF.workarounds.handle_virt_lifecycle_events:
1467 self.driver.register_event_listener(self.handle_events)
1468 else:
1469 # NOTE(mriedem): If the _sync_power_states periodic task is
1470 # disabled we should emit a warning in the logs.
1471 if CONF.sync_power_state_interval < 0:  # 1471 ↛ 1472: line 1471 didn't jump to line 1472 because the condition on line 1471 was never true
1472 LOG.warning('Instance lifecycle events from the compute '
1473 'driver have been disabled. Note that lifecycle '
1474 'changes to an instance outside of the compute '
1475 'service will not be synchronized '
1476 'automatically since the _sync_power_states '
1477 'periodic task is also disabled.')
1478 else:
1479 LOG.info('Instance lifecycle events from the compute '
1480 'driver have been disabled. Note that lifecycle '
1481 'changes to an instance outside of the compute '
1482 'service will only be synchronized by the '
1483 '_sync_power_states periodic task.')
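# NOTE(editor): a minimal sketch (not part of this module) of how a driver
# event flows through the listener registered above; the instance uuid is
# made up and `manager` stands in for a ComputeManager instance.
#
#     event = virtevent.LifecycleEvent(
#         'c1b2-example-uuid', virtevent.EVENT_LIFECYCLE_STOPPED)
#     manager.handle_events(event)
#     # -> handle_lifecycle_event() logs the event and, if the hypervisor
#     #    confirms the guest is down, syncs the DB power_state to SHUTDOWN.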
1485 def _get_nodes(self, context):
1486 """Queried the ComputeNode objects from the DB that are reported by the
1487 hypervisor.
1489 :param context: the request context
1490 :return: a dict of ComputeNode objects keyed by the UUID of the given
1491 node.
1492 """
1493 try:
1494 node_ids = self.driver.get_nodenames_by_uuid()
1495 except exception.VirtDriverNotReady:
1496 LOG.warning(
1497 "Virt driver is not ready. If this is the first time this "
1498 "service is starting on this host, then you can ignore "
1499 "this warning.")
1500 return {}
1502 nodes = objects.ComputeNodeList.get_all_by_uuids(context,
1503 list(node_ids.keys()))
1504 if not nodes:
1505 # NOTE(danms): This should only happen if the compute_id is
1506 # pre-provisioned on a host that has never started.
1507 LOG.warning('Compute nodes %s for host %s were not found in the '
1508 'database. If this is the first time this service is '
1509 'starting on this host, then you can ignore this '
1510 'warning.',
1511 list(node_ids.keys()), self.host)
1512 return {}
1514 for node in nodes:
1515 if node.hypervisor_hostname != node_ids.get(node.uuid):
1516 raise exception.InvalidConfiguration(
1517 ('My compute node %s has hypervisor_hostname %s '
1518 'but virt driver reports it should be %s. Possible '
1519 'rename detected, refusing to start!') % (
1520 node.uuid, node.hypervisor_hostname,
1521 node_ids.get(node.uuid)))
1523 return {n.uuid: n for n in nodes}
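# NOTE(editor): a minimal sketch (not part of this module), assuming a
# single-node libvirt host, of the shapes flowing through _get_nodes():
#
#     node_ids = {'3f1e-example-uuid': 'compute-1.example.org'}  # from driver
#     nodes = {'3f1e-example-uuid': ComputeNode(
#         hypervisor_hostname='compute-1.example.org')}          # from the DB
#     # Any mismatch between the driver-reported hostname and the DB record is
#     # treated as a probable rename and aborts startup (InvalidConfiguration).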
1525 def _ensure_existing_node_identity(self, service_ref):
1526 """If we are upgrading from an older service version, we need
1527 to write our node identity uuid (if not already done) based on
1528 nodes assigned to us in the database.
1529 """
1530 if 'ironic' in CONF.compute_driver.lower():
1531 # We do not persist a single local node identity for
1532 # ironic
1533 return
1535 if service_ref.version >= service_obj.NODE_IDENTITY_VERSION:
1536 # Already new enough, nothing to do here, but make sure that we
1537 # have a UUID file already, as this is not our first time starting.
1538 if nova.virt.node.read_local_node_uuid() is None:
1539 raise exception.InvalidConfiguration(
1540 ('No local node identity found, but this is not our '
1541 'first startup on this host. Refusing to start after '
1542 'potentially having lost that state!'))
1543 return
1545 if nova.virt.node.read_local_node_uuid():
1546 # We already have a local node identity, no migration needed
1547 return
1549 context = nova.context.get_admin_context()
1550 db_nodes = objects.ComputeNodeList.get_all_by_host(context, self.host)
1551 if not db_nodes:
1552 # This means we have no nodes in the database (that we
1553 # know of) and thus have no existing UUID to record. That
1554 # likely indicates a rename or misconfiguration, so refuse to start.
1555 raise exception.InvalidConfiguration(
1556 ('Upgrading from service version %i but found no '
1557 'nodes in the database for host %s to persist '
1558 'locally; Possible rename detected, '
1559 'refusing to start!') % (
1560 service_ref.version, self.host))
1562 if len(db_nodes) > 1:
1563 # If this happens we can't do the right thing, so raise an
1564 # exception to abort host startup
1565 LOG.warning('Multiple nodes found in the database for host %s; '
1566 'unable to persist local node identity automatically', self.host)
1567 raise exception.InvalidConfiguration(
1568 'Multiple nodes found in database, manual node uuid '
1569 'configuration required')
1571 nova.virt.node.write_local_node_uuid(db_nodes[0].uuid)
1573 def _check_for_host_rename(self, nodes_by_uuid):
1574 if 'ironic' in CONF.compute_driver.lower():
1575 # Ironic (currently) rebalances nodes at various times, and as
1576 # such, nodes being discovered as assigned to this host with a
1577 # different hostname is not surprising. Skip this check for
1578 # ironic.
1579 return
1580 for node in nodes_by_uuid.values():
1581 if node.host != self.host:
1582 raise exception.InvalidConfiguration(
1583 'My node %s has host %r but my host is %r; '
1584 'Possible rename detected, refusing to start!' % (
1585 node.uuid, node.host, self.host))
1586 LOG.debug('Verified node %s matches my host %s',
1587 node.uuid, self.host)
1589 def _sanity_check_new_host(self):
1590 instances_on_hv = self.driver.list_instance_uuids()
1591 if len(instances_on_hv) > 0:
1592 # This means we have instances on our hypervisor, but we think
1593 # we are a new host (i.e. we created a new service record). That
1594 # likely means we're pointed at an empty database or the wrong
1595 # cell.
1596 raise exception.InvalidConfiguration(
1597 'My hypervisor has existing instances, but I appear to be '
1598 'a new service in this database. Possible database '
1599 'configuration error, refusing to start!')
1601 def init_host(self, service_ref):
1602 """Initialization for a standalone compute service."""
1604 if service_ref:
1605 # If we are an existing service, check to see if we need
1606 # to record a locally-persistent node identity because
1607 # we have upgraded from a previous version.
1608 self._ensure_existing_node_identity(service_ref)
1609 else:
1610 # If we are a new service (in the database), make sure we have no
1611 # instances on our hypervisor as we would expect.
1612 self._sanity_check_new_host()
1614 if CONF.pci.device_spec:
1615 # Simply loading the PCI passthrough spec will do a bunch of
1616 # validation that would otherwise wait until the PciDevTracker is
1617 # constructed when updating available resources for the compute
1618 # node(s) in the resource tracker, effectively killing that task.
1619 # So load up the spec when starting the compute service to
1620 # flush any invalid configuration early, so we can kill the service
1621 # if the configuration is wrong.
1622 whitelist.Whitelist(CONF.pci.device_spec)
1624 nova.conf.neutron.register_dynamic_opts(CONF)
1625 # Even if only libvirt uses them, make it available for all drivers
1626 nova.conf.devices.register_dynamic_opts(CONF)
1628 # Override the number of concurrent disk operations allowed if the
1629 # user has specified a limit.
1630 if CONF.compute.max_concurrent_disk_ops != 0:  # 1630 ↛ 1631: line 1630 didn't jump to line 1631 because the condition on line 1630 was never true
1631 compute_utils.disk_ops_semaphore = \
1632 eventlet.semaphore.BoundedSemaphore(
1633 CONF.compute.max_concurrent_disk_ops)
1635 if CONF.compute.max_disk_devices_to_attach == 0:
1636 msg = _('[compute]max_disk_devices_to_attach has been set to 0, '
1637 'which will prevent instances from being able to boot. '
1638 'Set -1 for unlimited or set >= 1 to limit the maximum '
1639 'number of disk devices.')
1640 raise exception.InvalidConfiguration(msg)
1642 self.driver.init_host(host=self.host)
1644 # NOTE(gibi): At this point the compute_nodes of the resource tracker
1645 # has not been populated yet so we cannot rely on the resource tracker
1646 # here.
1647 context = nova.context.get_admin_context()
1648 nodes_by_uuid = self._get_nodes(context)
1650 # NOTE(danms): Check for a possible host rename and abort
1651 # startup before we start mucking with instances we think are
1652 # ours.
1653 self._check_for_host_rename(nodes_by_uuid)
1655 instances = objects.InstanceList.get_by_host(
1656 context, self.host,
1657 expected_attrs=['info_cache', 'metadata', 'numa_topology'])
1659 self.init_virt_events()
1661 self._validate_pinning_configuration(instances)
1662 self._validate_vtpm_configuration(instances)
1664 # NOTE(gibi): If the slow start time of the ironic and vcenter virt
1665 # drivers becomes problematic here, then we should consider adding a
1666 # config option or a driver flag to tell us whether to run
1667 # _destroy_evacuated_instances and
1668 # _error_out_instances_whose_build_was_interrupted in the
1669 # background on startup.
1670 try:
1671 # checking that instance was not already evacuated to other host
1672 evacuated_instances = self._destroy_evacuated_instances(
1673 context, nodes_by_uuid)
1675 # Initialise instances on the host that are not evacuating
1676 for instance in instances:
1677 if instance.uuid not in evacuated_instances:
1678 self._init_instance(context, instance)
1680 # NOTE(gibi): collect all the instance uuids that were in some way
1681 # already handled above, either by _init_instance or by
1682 # _destroy_evacuated_instances. This way we can limit the scope of
1683 # the _error_out_instances_whose_build_was_interrupted call to look
1684 # only for instances that have allocations on this node and were not
1685 # handled by the above calls.
1686 already_handled = {instance.uuid for instance in instances}.union(
1687 evacuated_instances)
1688 self._error_out_instances_whose_build_was_interrupted(
1689 context, already_handled, nodes_by_uuid.keys())
1691 finally:
1692 if instances:
1693 # We only send the instance info to the scheduler on startup
1694 # if there is anything to send, otherwise this host might
1695 # not be mapped yet in a cell and the scheduler may have
1696 # issues dealing with the information. Later changes to
1697 # instances on this host will update the scheduler, or the
1698 # _sync_scheduler_instance_info periodic task will.
1699 self._update_scheduler_instance_info(context, instances)
1701 def _error_out_instances_whose_build_was_interrupted(
1702 self, context, already_handled_instances, node_uuids):
1703 """If there are instances in BUILDING state that are not
1704 assigned to this host but have allocations in placement towards
1705 this compute that means the nova-compute service was
1706 restarted while those instances waited for the resource claim
1707 to finish and the _set_instance_host_and_node() to update the
1708 instance.host field. We need to push them to ERROR state here to
1709 prevent keeping them in BUILDING state forever.
1711 :param context: The request context
1712 :param already_handled_instances: The set of instance UUIDs that the
1713 host initialization process already handled in some way.
1714 :param node_uuids: The list of compute node uuids handled by this
1715 service
1716 """
1718 # Strategy:
1719 # 1) Get the allocations from placement for our compute node(s)
1720 # 2) Remove the already handled instances from the consumer list;
1721 # they are either already initialized or need to be skipped.
1722 # 3) Check which remaining consumer is an instance in BUILDING state
1723 # and push it to ERROR state.
1725 LOG.info(
1726 "Looking for unclaimed instances stuck in BUILDING status for "
1727 "nodes managed by this host")
1728 for cn_uuid in node_uuids:
1729 try:
1730 f = self.reportclient.get_allocations_for_resource_provider
1731 allocations = f(context, cn_uuid).allocations
1732 except (exception.ResourceProviderAllocationRetrievalFailed,
1733 keystone_exception.ClientException) as e:
1734 LOG.error(
1735 "Could not retrieve compute node resource provider %s and "
1736 "therefore unable to error out any instances stuck in "
1737 "BUILDING state. Error: %s", cn_uuid, str(e))
1738 continue
1740 not_handled_consumers = (set(allocations) -
1741 already_handled_instances)
1743 if not not_handled_consumers:
1744 continue
1746 filters = {
1747 'vm_state': vm_states.BUILDING,
1748 'uuid': not_handled_consumers
1749 }
1751 instances = objects.InstanceList.get_by_filters(
1752 context, filters, expected_attrs=[])
1754 for instance in instances:
1755 LOG.debug(
1756 "Instance spawn was interrupted before instance_claim, "
1757 "setting instance to ERROR state", instance=instance)
1758 self._set_instance_obj_error_state(
1759 instance, clean_task_state=True)
1761 def cleanup_host(self):
1762 self.driver.register_event_listener(None)
1763 self.instance_events.cancel_all_events()
1764 self.driver.cleanup_host(host=self.host)
1765 self._cleanup_live_migrations_in_pool()
1767 def _cleanup_live_migrations_in_pool(self):
1768 # Shutdown the pool so we don't get new requests.
1769 self._live_migration_executor.shutdown(wait=False)
1770 # For any queued migrations, cancel the migration and update
1771 # its status.
1772 for migration, future in self._waiting_live_migrations.values():
1773 # If we got here before the Future was submitted then we need
1774 # to move on since there isn't anything we can do.
1775 if future is None:
1776 continue
1777 if future.cancel():  # 1777 ↛ 1782: line 1777 didn't jump to line 1782 because the condition on line 1777 was always true
1778 self._set_migration_status(migration, 'cancelled')
1779 LOG.info('Successfully cancelled queued live migration.',
1780 instance_uuid=migration.instance_uuid)
1781 else:
1782 LOG.warning('Unable to cancel live migration.',
1783 instance_uuid=migration.instance_uuid)
1784 self._waiting_live_migrations.clear()
1786 def pre_start_hook(self, service_ref):
1787 """After the service is initialized, but before we fully bring
1788 the service up by listening on RPC queues, make sure to update
1789 our available resources (and indirectly our available nodes).
1790 """
1791 self.service_ref = service_ref
1792 self.rt.set_service_ref(service_ref)
1793 self.update_available_resource(nova.context.get_admin_context(),
1794 startup=True)
1796 def _get_power_state(self, instance):
1797 """Retrieve the power state for the given instance."""
1798 LOG.debug('Checking state', instance=instance)
1799 try:
1800 return self.driver.get_info(instance, use_cache=False).state
1801 except exception.InstanceNotFound:
1802 return power_state.NOSTATE
1804 def _await_block_device_map_created(self, context, vol_id):
1805 # TODO(yamahata): creating volume simultaneously
1806 # reduces creation time?
1807 # TODO(yamahata): eliminate dumb polling
1808 start = time.time()
1809 retries = CONF.block_device_allocate_retries
1810 # (1) if the configured value is 0, one attempt should be made
1811 # (2) if the configured value is > 0, then the total number of
1812 # attempts is (retries + 1)
1813 attempts = 1
1814 if retries >= 1:
1815 attempts = retries + 1
1816 for attempt in range(1, attempts + 1):
1817 volume = self.volume_api.get(context, vol_id)
1818 volume_status = volume['status']
1819 if volume_status not in ['creating', 'downloading']:
1820 if volume_status == 'available':
1821 return attempt
1822 LOG.warning("Volume id: %(vol_id)s finished being "
1823 "created but its status is %(vol_status)s.",
1824 {'vol_id': vol_id,
1825 'vol_status': volume_status})
1826 break
1827 greenthread.sleep(CONF.block_device_allocate_retries_interval)
1828 raise exception.VolumeNotCreated(volume_id=vol_id,
1829 seconds=int(time.time() - start),
1830 attempts=attempt,
1831 volume_status=volume_status)
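# NOTE(editor): a minimal sketch (not part of this module) of the retry
# accounting used above:
#
#     retries = CONF.block_device_allocate_retries
#     attempts = 1 if retries < 1 else retries + 1
#     # retries = 0 -> 1 attempt (a single poll of the volume status)
#     # retries = 3 -> 4 attempts, each separated by
#     #                CONF.block_device_allocate_retries_interval seconds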
1833 def _decode_files(self, injected_files):
1834 """Base64 decode the list of files to inject."""
1835 if not injected_files:
1836 return []
1838 def _decode(f):
1839 path, contents = f
1840 # Py3 raises binascii.Error instead of TypeError as in Py27
1841 try:
1842 decoded = base64.b64decode(contents)
1843 return path, decoded
1844 except (TypeError, binascii.Error):
1845 raise exception.Base64Exception(path=path)
1847 return [_decode(f) for f in injected_files]
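# NOTE(editor): a minimal sketch (not part of this module) of the expected
# input and output of _decode_files(); the path and payload are made up.
#
#     injected_files = [('/etc/motd', base64.b64encode(b'hello'))]
#     self._decode_files(injected_files)
#     # -> [('/etc/motd', b'hello')]
#     # A payload that is not valid base64 raises exception.Base64Exception
#     # with the offending path.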
1849 def _validate_instance_group_policy(self, context, instance,
1850 scheduler_hints=None):
1852 if CONF.workarounds.disable_group_policy_check_upcall:
1853 return
1855 # NOTE(russellb) Instance group policy is enforced by the scheduler.
1856 # However, there is a race condition with the enforcement of
1857 # the policy. Since more than one instance may be scheduled at the
1858 # same time, it's possible that more than one instance with an
1859 # anti-affinity policy may end up here. It's also possible that
1860 # multiple instances with an affinity policy could end up on different
1861 # hosts. This is a validation step to make sure that starting the
1862 # instance here doesn't violate the policy.
1863 if scheduler_hints is not None:
1864 # only go through here if scheduler_hints is provided,
1865 # even if it is empty.
1866 group_hint = scheduler_hints.get('group')
1867 if not group_hint:
1868 return
1869 else:
1870 # The RequestSpec stores scheduler_hints as key=list pairs
1871 # so we need to check the type on the value and pull the
1872 # single entry out. The API request schema validates that
1873 # the 'group' hint is a single value.
1874 if isinstance(group_hint, list):
1875 group_hint = group_hint[0]
1876 try:
1877 group = objects.InstanceGroup.get_by_hint(
1878 context, group_hint
1879 )
1880 except exception.InstanceGroupNotFound:
1881 return
1882 else:
1883 # TODO(ganso): a call to DB can be saved by adding request_spec
1884 # to rpcapi payload of live_migration, pre_live_migration and
1885 # check_can_live_migrate_destination
1886 try:
1887 group = objects.InstanceGroup.get_by_instance_uuid(
1888 context, instance.uuid
1889 )
1890 except exception.InstanceGroupNotFound:
1891 return
1893 @utils.synchronized(group['uuid'])
1894 def _do_validation(context, instance, group):
1895 if group.policy and 'anti-affinity' == group.policy:
1897 # instances on host
1898 instances_uuids = objects.InstanceList.get_uuids_by_host(
1899 context, self.host)
1900 ins_on_host = set(instances_uuids)
1902 # instance param is just for logging, the nodename obtained is
1903 # not actually related to the instance at all
1904 nodename = self._get_nodename(instance)
1906 # instances being migrated to host
1907 migrations = (
1908 objects.MigrationList.get_in_progress_by_host_and_node(
1909 context, self.host, nodename))
1910 migration_vm_uuids = {mig.instance_uuid for mig in migrations}
1912 total_instances = migration_vm_uuids | ins_on_host
1914 # refresh group to get updated members within locked block
1915 group = objects.InstanceGroup.get_by_uuid(context,
1916 group['uuid'])
1917 members = set(group.members)
1918 # Determine the set of instance group members on this host
1919 # which are not the instance in question. This is used to
1920 # determine how many other members from the same anti-affinity
1921 # group can be on this host.
1922 members_on_host = (total_instances & members -
1923 set([instance.uuid]))
1924 rules = group.rules
1925 if rules and 'max_server_per_host' in rules:  # 1925 ↛ 1928: line 1925 didn't jump to line 1928 because the condition on line 1925 was always true
1926 max_server = rules['max_server_per_host']
1927 else:
1928 max_server = 1
1929 if len(members_on_host) >= max_server:
1930 raise exception.GroupAffinityViolation(
1931 instance_uuid=instance.uuid, policy='Anti-affinity')
1933 # NOTE(ganso): The check for affinity below does not work and it
1934 # can easily be violated because the lock happens in different
1935 # compute hosts.
1936 # The only fix seems to be a DB lock to perform the check whenever
1937 # setting the host field to an instance.
1938 elif group.policy and 'affinity' == group.policy:  # 1938 ↛ 1939: line 1938 didn't jump to line 1939 because the condition on line 1938 was never true
1939 group_hosts = group.get_hosts(exclude=[instance.uuid])
1940 if group_hosts and self.host not in group_hosts:
1941 raise exception.GroupAffinityViolation(
1942 instance_uuid=instance.uuid, policy='Affinity')
1944 _do_validation(context, instance, group)
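# NOTE(editor): a minimal sketch (not part of this module) of the
# anti-affinity counting performed above, with made-up UUIDs where 'd' is the
# instance being validated:
#
#     ins_on_host = {'a', 'b'}            # instances already on this host
#     migration_vm_uuids = {'c'}          # instances migrating to this host
#     members = {'a', 'c', 'd'}           # refreshed group membership
#     members_on_host = (ins_on_host | migration_vm_uuids) & (members - {'d'})
#     # len(members_on_host) == 2; with the default max_server_per_host of 1
#     # this raises GroupAffinityViolation.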
1946 def _log_original_error(self, exc_info, instance_uuid):
1947 LOG.error('Error: %s', exc_info[1], instance_uuid=instance_uuid,
1948 exc_info=exc_info)
1950 @periodic_task.periodic_task
1951 def _check_instance_build_time(self, context):
1952 """Ensure that instances are not stuck in build."""
1953 timeout = CONF.instance_build_timeout
1954 if timeout == 0:
1955 return
1957 filters = {'vm_state': vm_states.BUILDING,
1958 'host': self.host}
1960 building_insts = objects.InstanceList.get_by_filters(context,
1961 filters, expected_attrs=[], use_slave=True)
1963 for instance in building_insts:
1964 if timeutils.is_older_than(instance.created_at, timeout):
1965 self._set_instance_obj_error_state(instance)
1966 LOG.warning("Instance build timed out. Set to error "
1967 "state.", instance=instance)
1969 def _check_instance_exists(self, instance):
1970 """Ensure an instance with the same name is not already present."""
1971 if self.driver.instance_exists(instance):
1972 raise exception.InstanceExists(name=instance.name)
1974 def _allocate_network_async(self, context, instance, requested_networks,
1975 security_groups, resource_provider_mapping,
1976 network_arqs):
1977 """Method used to allocate networks in the background.
1979 Broken out for testing.
1980 """
1981 # First check to see if we're specifically not supposed to allocate
1982 # networks; if so, we can exit early.
1983 if requested_networks and requested_networks.no_allocate:
1984 LOG.debug("Not allocating networking since 'none' was specified.",
1985 instance=instance)
1986 return network_model.NetworkInfo([])
1988 LOG.debug("Allocating IP information in the background.",
1989 instance=instance)
1990 retries = CONF.network_allocate_retries
1991 attempts = retries + 1
1992 retry_time = 1
1993 bind_host_id = self.driver.network_binding_host_id(context, instance)
1994 for attempt in range(1, attempts + 1):  # 1994 ↛ exit: line 1994 didn't return from function '_allocate_network_async' because the loop on line 1994 didn't complete
1995 try:
1996 nwinfo = self.network_api.allocate_for_instance(
1997 context, instance,
1998 requested_networks=requested_networks,
1999 security_groups=security_groups,
2000 bind_host_id=bind_host_id,
2001 resource_provider_mapping=resource_provider_mapping,
2002 network_arqs=network_arqs)
2003 LOG.debug('Instance network_info: |%s|', nwinfo,
2004 instance=instance)
2005 instance.system_metadata['network_allocated'] = 'True'
2006 # NOTE(JoshNang) do not save the instance here, as it can cause
2007 # races. The caller shares a reference to instance and waits
2008 # for this async greenthread to finish before calling
2009 # instance.save().
2010 return nwinfo
2011 except Exception as e:
2012 log_info = {'attempt': attempt,
2013 'attempts': attempts}
2014 if attempt == attempts:
2015 LOG.exception('Instance failed network setup '
2016 'after %(attempts)d attempt(s)',
2017 log_info)
2018 raise e
2019 LOG.warning('Instance failed network setup '
2020 '(attempt %(attempt)d of %(attempts)d)',
2021 log_info, instance=instance)
2022 time.sleep(retry_time)
2023 retry_time *= 2
2024 if retry_time > 30:
2025 retry_time = 30
2026 # Not reached.
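# NOTE(editor): a minimal sketch (not part of this module) of the backoff
# schedule used above, assuming CONF.network_allocate_retries = 5:
#
#     retries = 5
#     attempts = retries + 1              # 6 attempts in total
#     retry_time, delays = 1, []
#     for _failed_attempt in range(attempts - 1):
#         delays.append(retry_time)
#         retry_time = min(retry_time * 2, 30)
#     # delays between attempts: [1, 2, 4, 8, 16]; the doubling caps at 30s.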
2028 def _build_networks_for_instance(self, context, instance,
2029 requested_networks, security_groups, resource_provider_mapping,
2030 network_arqs):
2032 # If we're here from a reschedule, the network may already be allocated.
2033 if strutils.bool_from_string(
2034 instance.system_metadata.get('network_allocated', 'False')):
2035 # NOTE(alex_xu): The network_allocated is True means the network
2036 # resource already allocated at previous scheduling, and the
2037 # network setup is cleanup at previous. After rescheduling, the
2038 # network resource need setup on the new host.
2039 self.network_api.setup_instance_network_on_host(
2040 context, instance, instance.host)
2041 return self.network_api.get_instance_nw_info(context, instance)
2043 network_info = self._allocate_network(context, instance,
2044 requested_networks, security_groups,
2045 resource_provider_mapping,
2046 network_arqs)
2048 return network_info
2050 def _allocate_network(self, context, instance, requested_networks,
2051 security_groups, resource_provider_mapping,
2052 network_arqs):
2053 """Start network allocation asynchronously. Return an instance
2054 of NetworkInfoAsyncWrapper that can be used to retrieve the
2055 allocated networks when the operation has finished.
2056 """
2057 # NOTE(comstud): Since we're allocating networks asynchronously,
2058 # this task state has little meaning, as we won't be in this
2059 # state for very long.
2060 instance.vm_state = vm_states.BUILDING
2061 instance.task_state = task_states.NETWORKING
2062 instance.save(expected_task_state=[None])
2064 return network_model.NetworkInfoAsyncWrapper(
2065 self._allocate_network_async, context, instance,
2066 requested_networks, security_groups, resource_provider_mapping,
2067 network_arqs)
2069 def _default_root_device_name(self, instance, image_meta, root_bdm):
2070 """Gets a default root device name from the driver.
2072 :param nova.objects.Instance instance:
2073 The instance for which to get the root device name.
2074 :param nova.objects.ImageMeta image_meta:
2075 The metadata of the image of the instance.
2076 :param nova.objects.BlockDeviceMapping root_bdm:
2077 The description of the root device.
2078 :returns: str -- The default root device name.
2079 :raises: InternalError, TooManyDiskDevices
2080 """
2081 try:
2082 return self.driver.default_root_device_name(instance,
2083 image_meta,
2084 root_bdm)
2085 except NotImplementedError:
2086 return compute_utils.get_next_device_name(instance, [])
2088 def _default_device_names_for_instance(self, instance,
2089 root_device_name,
2090 *block_device_lists):
2091 """Default the missing device names in the BDM from the driver.
2093 :param nova.objects.Instance instance:
2094 The instance for which to get default device names.
2095 :param str root_device_name: The root device name.
2096 :param list block_device_lists: List of block device mappings.
2097 :returns: None
2098 :raises: InternalError, TooManyDiskDevices
2099 """
2100 try:
2101 self.driver.default_device_names_for_instance(instance,
2102 root_device_name,
2103 *block_device_lists)
2104 except NotImplementedError:
2105 compute_utils.default_device_names_for_instance(
2106 instance, root_device_name, *block_device_lists)
2108 def _get_device_name_for_instance(self, instance, bdms, block_device_obj):
2109 """Get the next device name from the driver, based on the BDM.
2111 :param nova.objects.Instance instance:
2112 The instance whose volume is requesting a device name.
2113 :param nova.objects.BlockDeviceMappingList bdms:
2114 The block device mappings for the instance.
2115 :param nova.objects.BlockDeviceMapping block_device_obj:
2116 A block device mapping containing info about the requested block
2117 device.
2118 :returns: The next device name.
2119 :raises: InternalError, TooManyDiskDevices
2120 """
2121 # NOTE(ndipanov): Copy obj to avoid changing the original
2122 block_device_obj = block_device_obj.obj_clone()
2123 try:
2124 return self.driver.get_device_name_for_instance(
2125 instance, bdms, block_device_obj)
2126 except NotImplementedError:
2127 return compute_utils.get_device_name_for_instance(
2128 instance, bdms, block_device_obj.get("device_name"))
2130 def _default_block_device_names(self, instance, image_meta, block_devices):
2131 """Verify that all the devices have the device_name set. If not,
2132 provide a default name.
2134 It also ensures that there is a root_device_name and that it is set
2135 to the first block device in the boot sequence (boot_index=0).
2136 """
2137 root_bdm = block_device.get_root_bdm(block_devices)
2138 if not root_bdm:
2139 return
2141 # Get the root_device_name from the root BDM or the instance
2142 root_device_name = None
2143 update_root_bdm = False
2145 if root_bdm.device_name:
2146 root_device_name = root_bdm.device_name
2147 instance.root_device_name = root_device_name
2148 elif instance.root_device_name:
2149 root_device_name = instance.root_device_name
2150 root_bdm.device_name = root_device_name
2151 update_root_bdm = True
2152 else:
2153 root_device_name = self._default_root_device_name(instance,
2154 image_meta,
2155 root_bdm)
2157 instance.root_device_name = root_device_name
2158 root_bdm.device_name = root_device_name
2159 update_root_bdm = True
2161 if update_root_bdm:
2162 root_bdm.save()
2164 ephemerals = []
2165 swap = []
2166 block_device_mapping = []
2167 image = []
2169 for device in block_devices:
2170 if block_device.new_format_is_ephemeral(device):
2171 ephemerals.append(device)
2173 if block_device.new_format_is_swap(device):
2174 swap.append(device)
2176 if driver_block_device.is_block_device_mapping(device):
2177 block_device_mapping.append(device)
2179 if driver_block_device.is_local_image(device):  # 2179 ↛ 2180: line 2179 didn't jump to line 2180 because the condition on line 2179 was never true
2180 image.append(device)
2182 self._default_device_names_for_instance(instance,
2183 root_device_name,
2184 image,
2185 ephemerals,
2186 swap,
2187 block_device_mapping)
2189 def _add_missing_dev_names(self, bdms, instance):
2190 for bdm in bdms:
2191 if bdm.device_name is not None:
2192 continue
2194 device_name = self._get_device_name_for_instance(instance,
2195 bdms, bdm)
2196 values = {'device_name': device_name}
2197 bdm.update(values)
2198 bdm.save()
2200 def _prep_block_device(self, context, instance, bdms):
2201 """Set up the block device for an instance with error logging."""
2202 try:
2203 self._add_missing_dev_names(bdms, instance)
2204 block_device_info = driver.get_block_device_info(instance, bdms)
2205 mapping = driver.block_device_info_get_mapping(block_device_info)
2206 driver_block_device.attach_block_devices(
2207 mapping, context, instance, self.volume_api, self.driver,
2208 wait_func=self._await_block_device_map_created)
2210 return block_device_info
2212 except exception.OverQuota as e:
2213 LOG.warning('Failed to create block device for instance due'
2214 ' to exceeding volume related resource quota.'
2215 ' Error: %s', e.message, instance=instance)
2216 raise
2218 except Exception as ex:
2219 LOG.exception('Instance failed block device setup',
2220 instance=instance)
2221 # InvalidBDM will eventually result in a BuildAbortException when
2222 # booting from volume, and will be recorded as an instance fault.
2223 # Maintain the original exception message which most likely has
2224 # useful details which the standard InvalidBDM error message lacks.
2225 raise exception.InvalidBDM(str(ex))
2227 def _update_instance_after_spawn(self, instance,
2228 vm_state=vm_states.ACTIVE):
2229 instance.power_state = self._get_power_state(instance)
2230 instance.vm_state = vm_state
2231 instance.task_state = None
2232 # NOTE(sean-k-mooney): configdrive.update_instance checks
2233 # instance.launched_at to determine if it is the first or
2234 # subsequent spawn of an instance. We need to call update_instance
2235 # first before setting instance.launched_at or instance.config_drive
2236 # will never be set to true based on the value of force_config_drive.
2237 # As a result the config drive will be lost on a hard reboot of the
2238 # instance even when force_config_drive=true. see bug #1835822.
2239 configdrive.update_instance(instance)
2240 instance.launched_at = timeutils.utcnow()
2242 def _update_scheduler_instance_info(self, context, instance):
2243 """Sends an InstanceList with created or updated Instance objects to
2244 the Scheduler client.
2246 In the case of init_host, the value passed will already be an
2247 InstanceList. Other calls will send individual Instance objects that
2248 have been created or resized. In this case, we create an InstanceList
2249 object containing that Instance.
2250 """
2251 if not self.send_instance_updates:
2252 return
2253 if isinstance(instance, obj_instance.Instance):
2254 instance = objects.InstanceList(objects=[instance])
2255 context = context.elevated()
2256 self.query_client.update_instance_info(context, self.host,
2257 instance)
2259 def _delete_scheduler_instance_info(self, context, instance_uuid):
2260 """Sends the uuid of the deleted Instance to the Scheduler client."""
2261 if not self.send_instance_updates:
2262 return
2263 context = context.elevated()
2264 self.query_client.delete_instance_info(context, self.host,
2265 instance_uuid)
2267 @periodic_task.periodic_task(spacing=CONF.scheduler_instance_sync_interval)
2268 def _sync_scheduler_instance_info(self, context):
2269 if not self.send_instance_updates:
2270 return
2271 context = context.elevated()
2272 instances = objects.InstanceList.get_by_host(context, self.host,
2273 expected_attrs=[],
2274 use_slave=True)
2275 uuids = [instance.uuid for instance in instances]
2276 self.query_client.sync_instance_info(context, self.host, uuids)
2278 def _notify_about_instance_usage(self, context, instance, event_suffix,
2279 network_info=None, extra_usage_info=None,
2280 fault=None, best_effort=False):
2281 compute_utils.notify_about_instance_usage(
2282 self.notifier, context, instance, event_suffix,
2283 network_info=network_info,
2284 extra_usage_info=extra_usage_info, fault=fault,
2285 best_effort=best_effort)
2287 def _deallocate_network(self, context, instance,
2288 requested_networks=None):
2289 # If we were told not to allocate networks let's save ourselves
2290 # the trouble of calling the network API.
2291 if requested_networks and requested_networks.no_allocate:
2292 LOG.debug("Skipping network deallocation for instance since "
2293 "networking was not requested.", instance=instance)
2294 return
2296 LOG.debug('Deallocating network for instance', instance=instance)
2297 with timeutils.StopWatch() as timer:
2298 self.network_api.deallocate_for_instance(
2299 context, instance, requested_networks=requested_networks)
2300 # nova-network does an rpc call so we're OK tracking time spent here
2301 LOG.info('Took %0.2f seconds to deallocate network for instance.',
2302 timer.elapsed(), instance=instance)
2304 def _get_instance_block_device_info(self, context, instance,
2305 refresh_conn_info=False,
2306 bdms=None):
2307 """Transform block devices to the driver block_device format."""
2309 if bdms is None:
2310 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
2311 context, instance.uuid)
2312 block_device_info = driver.get_block_device_info(instance, bdms)
2314 if not refresh_conn_info:
2315 # if the block_device_mapping has no value in connection_info
2316 # (returned as None), don't include in the mapping
2317 block_device_info['block_device_mapping'] = [
2318 bdm for bdm in driver.block_device_info_get_mapping(
2319 block_device_info)
2320 if bdm.get('connection_info')]
2321 else:
2322 driver_block_device.refresh_conn_infos(
2323 driver.block_device_info_get_mapping(block_device_info),
2324 context, instance, self.volume_api, self.driver)
2326 return block_device_info
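# NOTE(editor): a minimal sketch (not part of this module) of the
# refresh_conn_info=False filtering above, using a made-up mapping:
#
#     mapping = [
#         {'mount_device': '/dev/vda', 'connection_info': {'driver': 'rbd'}},
#         {'mount_device': '/dev/vdb', 'connection_info': None},
#     ]
#     filtered = [bdm for bdm in mapping if bdm.get('connection_info')]
#     # -> only the /dev/vda entry survives; entries without connection_info
#     #    are dropped from block_device_info rather than refreshed.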
2328 def _build_failed(self, node):
2329 if CONF.compute.consecutive_build_service_disable_threshold:
2330 # NOTE(danms): Update our counter, but wait for the next
2331 # update_available_resource() periodic to flush it to the DB
2332 self.rt.build_failed(node)
2334 def _build_succeeded(self, node):
2335 self.rt.build_succeeded(node)
2337 @wrap_exception()
2338 @reverts_task_state
2339 @wrap_instance_fault
2340 def build_and_run_instance(self, context, instance, image, request_spec,
2341 filter_properties, accel_uuids, admin_password=None,
2342 injected_files=None, requested_networks=None,
2343 security_groups=None, block_device_mapping=None,
2344 node=None, limits=None, host_list=None):
2346 @utils.synchronized(instance.uuid)
2347 def _locked_do_build_and_run_instance(*args, **kwargs):
2348 # NOTE(danms): We grab the semaphore with the instance uuid
2349 # locked because we could wait in line to build this instance
2350 # for a while and we want to make sure that nothing else tries
2351 # to do anything with this instance while we wait.
2352 with self._build_semaphore:
2353 try:
2354 result = self._do_build_and_run_instance(*args, **kwargs)
2355 except Exception:
2356 # NOTE(mriedem): This should really only happen if
2357 # _decode_files in _do_build_and_run_instance fails, and
2358 # that's before a guest is spawned so it's OK to remove
2359 # allocations for the instance for this node from Placement
2360 # below as there is no guest consuming resources anyway.
2361 # The _decode_files case could be handled more specifically
2362 # but that's left for another day.
2363 result = build_results.FAILED
2364 raise
2365 finally:
2366 if result == build_results.FAILED:
2367 # Remove the allocation records from Placement for the
2368 # instance if the build failed. The instance.host is
2369 # likely set to None in _do_build_and_run_instance
2370 # which means if the user deletes the instance, it
2371 # will be deleted in the API, not the compute service.
2372 # Setting the instance.host to None in
2373 # _do_build_and_run_instance means that the
2374 # ResourceTracker will no longer consider this instance
2375 # to be claiming resources against it, so we want to
2376 # reflect that same thing in Placement. No need to
2377 # call this for a reschedule, as the allocations will
2378 # have already been removed in
2379 # self._do_build_and_run_instance().
2380 self.reportclient.delete_allocation_for_instance(
2381 context, instance.uuid, force=True)
2383 if result in (build_results.FAILED_BY_POLICY,
2384 build_results.RESCHEDULED_BY_POLICY):
2385 return
2386 if result in (build_results.FAILED,
2387 build_results.RESCHEDULED):
2388 self._build_failed(node)
2389 else:
2390 self._build_succeeded(node)
2392 # NOTE(danms): We spawn here to return the RPC worker thread back to
2393 # the pool. Since what follows could take a really long time, we don't
2394 # want to tie up RPC workers.
2395 utils.spawn_n(_locked_do_build_and_run_instance,
2396 context, instance, image, request_spec,
2397 filter_properties, admin_password, injected_files,
2398 requested_networks, security_groups,
2399 block_device_mapping, node, limits, host_list,
2400 accel_uuids)
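# NOTE(editor): a minimal sketch (not part of this module) of the concurrency
# pattern used above: a per-instance lock nested inside the host-wide build
# semaphore, all run off the RPC worker thread.
#
#     @utils.synchronized(instance.uuid)      # serialize work on this instance
#     def _locked(*args, **kwargs):
#         with self._build_semaphore:         # bound concurrent builds per host
#             return self._do_build_and_run_instance(*args, **kwargs)
#
#     utils.spawn_n(_locked, ...)             # return the RPC worker immediately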
2402 def _check_device_tagging(self, requested_networks, block_device_mapping):
2403 tagging_requested = False
2404 if requested_networks:
2405 for net in requested_networks:
2406 if 'tag' in net and net.tag is not None:
2407 tagging_requested = True
2408 break
2409 if block_device_mapping and not tagging_requested:
2410 for bdm in block_device_mapping:
2411 if 'tag' in bdm and bdm.tag is not None:
2412 tagging_requested = True
2413 break
2414 if (tagging_requested and
2415 not self.driver.capabilities.get('supports_device_tagging',
2416 False)):
2417 raise exception.BuildAbortException('Attempt to boot guest with '
2418 'tagged devices on host that '
2419 'does not support tagging.')
2421 def _check_trusted_certs(self, instance):
2422 if (instance.trusted_certs and
2423 not self.driver.capabilities.get('supports_trusted_certs',
2424 False)):
2425 raise exception.BuildAbortException(
2426 'Trusted image certificates provided on host that does not '
2427 'support certificate validation.')
2429 @wrap_exception()
2430 @reverts_task_state
2431 @wrap_instance_event(prefix='compute')
2432 @wrap_instance_fault
2433 def _do_build_and_run_instance(self, context, instance, image,
2434 request_spec, filter_properties, admin_password, injected_files,
2435 requested_networks, security_groups, block_device_mapping,
2436 node=None, limits=None, host_list=None, accel_uuids=None):
2438 try:
2439 LOG.debug('Starting instance...', instance=instance)
2440 instance.vm_state = vm_states.BUILDING
2441 instance.task_state = None
2442 instance.save(expected_task_state=
2443 (task_states.SCHEDULING, None))
2444 except exception.InstanceNotFound:
2445 msg = 'Instance disappeared before build.'
2446 LOG.debug(msg, instance=instance)
2447 return build_results.FAILED
2448 except exception.UnexpectedTaskStateError as e:
2449 LOG.debug(e.format_message(), instance=instance)
2450 return build_results.FAILED
2452 # b64 decode the files to inject:
2453 decoded_files = self._decode_files(injected_files)
2455 if limits is None:
2456 limits = {}
2458 if node is None:
2459 node = self._get_nodename(instance, refresh=True)
2461 try:
2462 with timeutils.StopWatch() as timer:
2463 self._build_and_run_instance(context, instance, image,
2464 decoded_files, admin_password, requested_networks,
2465 security_groups, block_device_mapping, node, limits,
2466 filter_properties, request_spec, accel_uuids)
2467 LOG.info('Took %0.2f seconds to build instance.',
2468 timer.elapsed(), instance=instance)
2469 return build_results.ACTIVE
2470 except exception.RescheduledException as e:
2471 retry = filter_properties.get('retry')
2472 if not retry:
2473 # no retry information, do not reschedule.
2474 LOG.debug("Retry info not present, will not reschedule",
2475 instance=instance)
2476 self._cleanup_allocated_networks(context, instance,
2477 requested_networks)
2478 compute_utils.add_instance_fault_from_exc(context,
2479 instance, e, sys.exc_info(),
2480 fault_message=e.kwargs['reason'])
2481 self._nil_out_instance_obj_host_and_node(instance)
2482 self._set_instance_obj_error_state(instance,
2483 clean_task_state=True)
2484 if isinstance(e, exception.RescheduledByPolicyException):
2485 return build_results.FAILED_BY_POLICY
2486 return build_results.FAILED
2487 LOG.debug(e.format_message(), instance=instance)
2488 # This will be used for logging the exception
2489 retry['exc'] = traceback.format_exception(*sys.exc_info())
2490 # This will be used for setting the instance fault message
2491 retry['exc_reason'] = e.kwargs['reason']
2493 self._cleanup_allocated_networks(context, instance,
2494 requested_networks)
2496 self._nil_out_instance_obj_host_and_node(instance)
2497 instance.task_state = task_states.SCHEDULING
2498 instance.save()
2499 # The instance will have already claimed resources from this host
2500 # before this build was attempted. Now that it has failed, we need
2501 # to unclaim those resources before casting to the conductor, so
2502 # that if there are alternate hosts available for a retry, it can
2503 # claim resources on that new host for the instance.
2504 self.reportclient.delete_allocation_for_instance(
2505 context, instance.uuid, force=True)
2507 self.compute_task_api.build_instances(context, [instance],
2508 image, filter_properties, admin_password,
2509 injected_files, requested_networks, security_groups,
2510 block_device_mapping, request_spec=request_spec,
2511 host_lists=[host_list])
2513 if isinstance(e, exception.RescheduledByPolicyException):
2514 return build_results.RESCHEDULED_BY_POLICY
2516 return build_results.RESCHEDULED
2517 except (exception.InstanceNotFound,
2518 exception.UnexpectedDeletingTaskStateError):
2519 msg = 'Instance disappeared during build.'
2520 LOG.debug(msg, instance=instance)
2521 self._cleanup_allocated_networks(context, instance,
2522 requested_networks)
2523 return build_results.FAILED
2524 except Exception as e:
2525 if isinstance(e, exception.BuildAbortException):
2526 LOG.error(e.format_message(), instance=instance)
2527 else:
2528 # Should not reach here.
2529 LOG.exception('Unexpected build failure, not rescheduling '
2530 'build.', instance=instance)
2531 self._cleanup_allocated_networks(context, instance,
2532 requested_networks)
2533 self._cleanup_volumes(context, instance,
2534 block_device_mapping, raise_exc=False)
2535 compute_utils.add_instance_fault_from_exc(context, instance,
2536 e, sys.exc_info())
2537 self._nil_out_instance_obj_host_and_node(instance)
2538 self._set_instance_obj_error_state(instance, clean_task_state=True)
2539 return build_results.FAILED
2541 @staticmethod
2542 def _get_scheduler_hints(filter_properties, request_spec=None):
2543 """Helper method to get scheduler hints.
2545 This method prefers to get the hints out of the request spec, but that
2546 might not be provided. Conductor will pass request_spec down to the
2547 first compute chosen for a build but older computes will not pass
2548 the request_spec to conductor's build_instances method for a
2549 reschedule, so if we're on a host via a retry, request_spec may not
2550 be provided, so we need to fall back to the filter_properties
2551 to get scheduler hints.
2552 """
2553 hints = {}
2554 if request_spec is not None and 'scheduler_hints' in request_spec:
2555 hints = request_spec.scheduler_hints
2556 if not hints:
2557 hints = filter_properties.get('scheduler_hints') or {}
2558 return hints
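# NOTE(editor): a minimal sketch (not part of this module) of the hint
# precedence implemented above, with made-up values:
#
#     request_spec.scheduler_hints = {'group': ['9a3d-example-uuid']}
#     filter_properties = {'scheduler_hints': {'group': 'stale-value'}}
#     self._get_scheduler_hints(filter_properties, request_spec)
#     # -> {'group': ['9a3d-example-uuid']}; the RequestSpec wins. Only when
#     #    the RequestSpec is missing or has empty hints do the
#     #    filter_properties hints apply.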
2560 @staticmethod
2561 def _get_request_group_mapping(request_spec):
2562 """Return request group resource - provider mapping. This is currently
2563 used for Neutron ports that have resource request due to the port
2564 having QoS minimum bandwidth policy rule attached.
2566 :param request_spec: A RequestSpec object or None
2567 :returns: A dict keyed by RequestGroup requester_id, currently Neutron
2568 port_id, to resource provider UUID that provides resource for that
2569 RequestGroup. Or None if the request_spec was None.
2570 """
2571 # TODO(sbauza): Remove this conditional once we only support
2572 # RPC API 6.0
2573 if request_spec:
2574 return request_spec.get_request_group_mapping()
2575 else:
2576 return None
2578 def _build_and_run_instance(self, context, instance, image, injected_files,
2579 admin_password, requested_networks, security_groups,
2580 block_device_mapping, node, limits, filter_properties,
2581 request_spec=None, accel_uuids=None):
2583 image_name = image.get('name')
2584 self._notify_about_instance_usage(context, instance, 'create.start',
2585 extra_usage_info={'image_name': image_name})
2586 compute_utils.notify_about_instance_create(
2587 context, instance, self.host,
2588 phase=fields.NotificationPhase.START,
2589 bdms=block_device_mapping)
2591 # NOTE(mikal): cache the keystone roles associated with the instance
2592 # at boot time for later reference
2593 instance.system_metadata.update(
2594 {'boot_roles': ','.join(context.roles)})
2596 self._check_device_tagging(requested_networks, block_device_mapping)
2597 self._check_trusted_certs(instance)
2599 provider_mapping = self._get_request_group_mapping(request_spec)
2601 if provider_mapping:
2602 try:
2603 compute_utils.update_pci_request_with_placement_allocations(
2604 context,
2605 self.reportclient,
2606 instance.pci_requests.requests,
2607 provider_mapping,
2608 )
2609 except (exception.AmbiguousResourceProviderForPCIRequest,
2610 exception.UnexpectedResourceProviderNameForPCIRequest
2611 ) as e:
2612 raise exception.BuildAbortException(
2613 reason=str(e), instance_uuid=instance.uuid)
2615 # TODO(Luyao) cut over to get_allocs_for_consumer
2616 allocs = self.reportclient.get_allocations_for_consumer(
2617 context, instance.uuid)
2619 try:
2620 scheduler_hints = self._get_scheduler_hints(filter_properties,
2621 request_spec)
2622 with self.rt.instance_claim(context, instance, node, allocs,
2623 limits):
2624 # NOTE(russellb) It's important that this validation be done
2625 # *after* the resource tracker instance claim, as that is where
2626 # the host is set on the instance.
2627 self._validate_instance_group_policy(context, instance,
2628 scheduler_hints)
2629 image_meta = objects.ImageMeta.from_dict(image)
2631 with self._build_resources(context, instance,
2632 requested_networks, security_groups, image_meta,
2633 block_device_mapping, provider_mapping,
2634 accel_uuids) as resources:
2635 instance.vm_state = vm_states.BUILDING
2636 instance.task_state = task_states.SPAWNING
2637 # NOTE(JoshNang) This also saves the changes to the
2638 # instance from _allocate_network_async, as they aren't
2639 # saved in that function to prevent races.
2640 instance.save(expected_task_state=
2641 task_states.BLOCK_DEVICE_MAPPING)
2642 block_device_info = resources['block_device_info']
2643 network_info = resources['network_info']
2644 accel_info = resources['accel_info']
2645 LOG.debug('Start spawning the instance on the hypervisor.',
2646 instance=instance)
2647 with timeutils.StopWatch() as timer:
2648 self.driver.spawn(context, instance, image_meta,
2649 injected_files, admin_password,
2650 allocs, network_info=network_info,
2651 block_device_info=block_device_info,
2652 accel_info=accel_info)
2653 LOG.info('Took %0.2f seconds to spawn the instance on '
2654 'the hypervisor.', timer.elapsed(),
2655 instance=instance)
2656 except (exception.InstanceNotFound,
2657 exception.UnexpectedDeletingTaskStateError) as e:
2658 with excutils.save_and_reraise_exception():
2659 self._notify_about_instance_usage(context, instance,
2660 'create.error', fault=e)
2661 compute_utils.notify_about_instance_create(
2662 context, instance, self.host,
2663 phase=fields.NotificationPhase.ERROR, exception=e,
2664 bdms=block_device_mapping)
2665 except exception.ComputeResourcesUnavailable as e:
2666 LOG.debug(e.format_message(), instance=instance)
2667 self._notify_about_instance_usage(context, instance,
2668 'create.error', fault=e)
2669 compute_utils.notify_about_instance_create(
2670 context, instance, self.host,
2671 phase=fields.NotificationPhase.ERROR, exception=e,
2672 bdms=block_device_mapping)
2673 raise exception.RescheduledException(
2674 instance_uuid=instance.uuid, reason=e.format_message())
2675 except exception.BuildAbortException as e:
2676 with excutils.save_and_reraise_exception():
2677 LOG.debug(e.format_message(), instance=instance)
2678 self._notify_about_instance_usage(context, instance,
2679 'create.error', fault=e)
2680 compute_utils.notify_about_instance_create(
2681 context, instance, self.host,
2682 phase=fields.NotificationPhase.ERROR, exception=e,
2683 bdms=block_device_mapping)
2684 except exception.NoMoreFixedIps as e:
2685 LOG.warning('No more fixed IP to be allocated',
2686 instance=instance)
2687 self._notify_about_instance_usage(context, instance,
2688 'create.error', fault=e)
2689 compute_utils.notify_about_instance_create(
2690 context, instance, self.host,
2691 phase=fields.NotificationPhase.ERROR, exception=e,
2692 bdms=block_device_mapping)
2693 msg = _('Failed to allocate the network(s) with error %s, '
2694 'not rescheduling.') % e.format_message()
2695 raise exception.BuildAbortException(instance_uuid=instance.uuid,
2696 reason=msg)
2697 except (exception.ExternalNetworkAttachForbidden,
2698 exception.VirtualInterfaceCreateException,
2699 exception.VirtualInterfaceMacAddressException,
2700 exception.FixedIpInvalidOnHost,
2701 exception.UnableToAutoAllocateNetwork,
2702 exception.NetworksWithQoSPolicyNotSupported) as e:
2703 LOG.exception('Failed to allocate network(s)',
2704 instance=instance)
2705 self._notify_about_instance_usage(context, instance,
2706 'create.error', fault=e)
2707 compute_utils.notify_about_instance_create(
2708 context, instance, self.host,
2709 phase=fields.NotificationPhase.ERROR, exception=e,
2710 bdms=block_device_mapping)
2711 msg = _('Failed to allocate the network(s), not rescheduling.')
2712 raise exception.BuildAbortException(instance_uuid=instance.uuid,
2713 reason=msg)
2714 except (exception.FlavorDiskTooSmall,
2715 exception.FlavorMemoryTooSmall,
2716 exception.ImageNotActive,
2717 exception.ImageUnacceptable,
2718 exception.InvalidDiskInfo,
2719 exception.InvalidDiskFormat,
2720 cursive_exception.SignatureVerificationError,
2721 exception.CertificateValidationFailed,
2722 exception.VolumeEncryptionNotSupported,
2723 exception.InvalidInput,
2724 # TODO(mriedem): We should be validating RequestedVRamTooHigh
2725 # in the API during server create and rebuild.
2726 exception.RequestedVRamTooHigh) as e:
2727 self._notify_about_instance_usage(context, instance,
2728 'create.error', fault=e)
2729 compute_utils.notify_about_instance_create(
2730 context, instance, self.host,
2731 phase=fields.NotificationPhase.ERROR, exception=e,
2732 bdms=block_device_mapping)
2733 raise exception.BuildAbortException(instance_uuid=instance.uuid,
2734 reason=e.format_message())
2735 except exception.GroupAffinityViolation as e:
2736 LOG.exception('Failed to build and run instance',
2737 instance=instance)
2738 self._notify_about_instance_usage(context, instance,
2739 'create.error', fault=e)
2740 compute_utils.notify_about_instance_create(
2741 context, instance, self.host,
2742 phase=fields.NotificationPhase.ERROR, exception=e,
2743 bdms=block_device_mapping)
2744 raise exception.RescheduledByPolicyException(
2745 instance_uuid=instance.uuid, reason=str(e))
2746 except Exception as e:
2747 LOG.exception('Failed to build and run instance',
2748 instance=instance)
2749 self._notify_about_instance_usage(context, instance,
2750 'create.error', fault=e)
2751 compute_utils.notify_about_instance_create(
2752 context, instance, self.host,
2753 phase=fields.NotificationPhase.ERROR, exception=e,
2754 bdms=block_device_mapping)
2755 raise exception.RescheduledException(
2756 instance_uuid=instance.uuid, reason=str(e))
2758 # NOTE(alaski): This is only useful during reschedules, remove it now.
2759 instance.system_metadata.pop('network_allocated', None)
2761 # If CONF.default_access_ip_network_name is set, grab the
2762 # corresponding network and set the access ip values accordingly.
2763 network_name = CONF.default_access_ip_network_name
2764 if (network_name and not instance.access_ip_v4 and
2765 not instance.access_ip_v6):
2766 # Note that when there are multiple ips to choose from, an
2767 # arbitrary one will be chosen.
2768 for vif in network_info:  # 2768 ↛ 2777: line 2768 didn't jump to line 2777 because the loop on line 2768 didn't complete
2769 if vif['network']['label'] == network_name:  # 2769 ↛ 2768: line 2769 didn't jump to line 2768 because the condition on line 2769 was always true
2770 for ip in vif.fixed_ips():
2771 if not instance.access_ip_v4 and ip['version'] == 4:
2772 instance.access_ip_v4 = ip['address']
2773 if not instance.access_ip_v6 and ip['version'] == 6:
2774 instance.access_ip_v6 = ip['address']
2775 break
2777 self._update_instance_after_spawn(instance)
2779 try:
2780 instance.save(expected_task_state=task_states.SPAWNING)
2781 except (exception.InstanceNotFound,
2782 exception.UnexpectedDeletingTaskStateError) as e:
2783 with excutils.save_and_reraise_exception():
2784 self._notify_about_instance_usage(context, instance,
2785 'create.error', fault=e)
2786 compute_utils.notify_about_instance_create(
2787 context, instance, self.host,
2788 phase=fields.NotificationPhase.ERROR, exception=e,
2789 bdms=block_device_mapping)
2791 self._update_scheduler_instance_info(context, instance)
2792 self._notify_about_instance_usage(context, instance, 'create.end',
2793 extra_usage_info={'message': _('Success')},
2794 network_info=network_info)
2795 compute_utils.notify_about_instance_create(context, instance,
2796 self.host, phase=fields.NotificationPhase.END,
2797 bdms=block_device_mapping)
2799 def _build_resources_cleanup(self, instance, network_info):
2800 # Make sure the async call finishes
2801 if network_info is not None:  # 2801 ↛ 2805: line 2801 didn't jump to line 2805 because the condition on line 2801 was always true
2802 network_info.wait(do_raise=False)
2803 self.driver.clean_networks_preparation(instance,
2804 network_info)
2805 self.driver.failed_spawn_cleanup(instance)
2807 @contextlib.contextmanager
2808 def _build_resources(self, context, instance, requested_networks,
2809 security_groups, image_meta, block_device_mapping,
2810 resource_provider_mapping, accel_uuids):
2811 resources = {}
2812 network_info = None
2813 spec_arqs = {}
2814 network_arqs = {}
2816 try:
2817 if accel_uuids:
2818 arqs = self._get_bound_arq_resources(
2819 context, instance, accel_uuids)
2820 spec_arqs, network_arqs = self._split_network_arqs(
2821 arqs, requested_networks)
2822 LOG.debug("ARQs for spec:%s, ARQs for network:%s",
2823 spec_arqs, network_arqs)
2824 except (Exception, eventlet.timeout.Timeout) as exc:
2825 LOG.exception(exc)
2826 # ARQs may have been created for the instance or its ports.
2827 # The port binding is not done yet, so unbinding the port
2828 # won't clean up the port ARQs.
2829 compute_utils.delete_arqs_if_needed(
2830 context, instance, accel_uuids)
2831 msg = _('Failure getting accelerator requests.')
2832 raise exception.BuildAbortException(
2833 reason=msg, instance_uuid=instance.uuid)
2835 try:
2836 LOG.debug('Start building networks asynchronously for instance.',
2837 instance=instance)
2838 network_info = self._build_networks_for_instance(context, instance,
2839 requested_networks, security_groups,
2840 resource_provider_mapping, network_arqs)
2841 resources['network_info'] = network_info
2842 except (exception.InstanceNotFound,
2843 exception.UnexpectedDeletingTaskStateError):
2844 raise
2845 except exception.UnexpectedTaskStateError as e:
2846 raise exception.BuildAbortException(instance_uuid=instance.uuid,
2847 reason=e.format_message())
2848 except Exception:
2849 # Because this allocation is async, any failures are likely to occur
2850 # when the driver accesses network_info during spawn().
2851 LOG.exception('Failed to allocate network(s)',
2852 instance=instance)
2853 msg = _('Failed to allocate the network(s), not rescheduling.')
2854 raise exception.BuildAbortException(instance_uuid=instance.uuid,
2855 reason=msg)
2857 try:
2858 # Perform any preparation work needed by the virt driver.
2859 self.driver.prepare_for_spawn(instance)
2861 # Depending on the virt driver, some network configuration is
2862 # necessary before preparing block devices.
2863 self.driver.prepare_networks_before_block_device_mapping(
2864 instance, network_info)
2866 # Verify that all the BDMs have a device_name set and assign a
2867 # default to the ones missing it with the help of the driver.
2868 self._default_block_device_names(instance, image_meta,
2869 block_device_mapping)
2871 LOG.debug('Start building block device mappings for instance.',
2872 instance=instance)
2873 instance.vm_state = vm_states.BUILDING
2874 instance.task_state = task_states.BLOCK_DEVICE_MAPPING
2875 instance.save()
2877 block_device_info = self._prep_block_device(context, instance,
2878 block_device_mapping)
2879 resources['block_device_info'] = block_device_info
2880 except (exception.InstanceNotFound,
2881 exception.UnexpectedDeletingTaskStateError,
2882 exception.ComputeResourcesUnavailable):
2883 with excutils.save_and_reraise_exception():
2884 self._build_resources_cleanup(instance, network_info)
2885 except (exception.UnexpectedTaskStateError,
2886 exception.InstanceUnacceptable,
2887 exception.OverQuota, exception.InvalidBDM) as e:
2888 self._build_resources_cleanup(instance, network_info)
2889 raise exception.BuildAbortException(instance_uuid=instance.uuid,
2890 reason=e.format_message())
2891 except Exception:
2892 LOG.exception('Failure prepping block device',
2893 instance=instance)
2894 self._build_resources_cleanup(instance, network_info)
2895 msg = _('Failure prepping block device.')
2896 raise exception.BuildAbortException(instance_uuid=instance.uuid,
2897 reason=msg)
2899 resources['accel_info'] = list(spec_arqs.values())
2900 try:
2901 yield resources
2902 except Exception as exc:
2903 with excutils.save_and_reraise_exception() as ctxt:
2904 if not isinstance(exc, (
2905 exception.InstanceNotFound,
2906 exception.UnexpectedDeletingTaskStateError)):
2907 LOG.exception('Instance failed to spawn',
2908 instance=instance)
2909 # Make sure the async call finishes
2910 if network_info is not None:  [2910 ↛ 2916: line 2910 didn't jump to line 2916 because the condition on line 2910 was always true]
2911 network_info.wait(do_raise=False)
2912 # If network_info is empty we're likely here because of a
2913 # network allocation failure. Since nothing can be reused on
2914 # rescheduling, it's better to deallocate the network to eliminate
2915 # the chance of orphaned ports in neutron.
2916 deallocate_networks = False if network_info else True
2917 try:
2918 self._shutdown_instance(context, instance,
2919 block_device_mapping, requested_networks,
2920 try_deallocate_networks=deallocate_networks)
2921 except Exception as exc2:
2922 ctxt.reraise = False
2923 LOG.warning('Could not clean up failed build,'
2924 ' not rescheduling. Error: %s',
2925 str(exc2))
2926 raise exception.BuildAbortException(
2927 instance_uuid=instance.uuid,
2928 reason=str(exc))
2929 finally:
2930 # Call Cyborg to delete accelerator requests
2931 if accel_uuids:  [2931 ↛ 2935: line 2931 didn't jump to line 2935 because the condition on line 2931 was never true]
2932 # ARQs may have been created for the instance or its ports.
2933 # Port binding may not have succeeded, and
2934 # unbinding the port won't clean up the port ARQs.
2935 compute_utils.delete_arqs_if_needed(
2936 context, instance, accel_uuids)
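# NOTE: an editorial summary (not part of the original source) of the dict
# yielded by the _build_resources context manager above; the spawn path
# consumes these three keys:
#
#     resources = {
#         'network_info': network_info,             # async network allocation
#         'block_device_info': block_device_info,   # prepped block devices
#         'accel_info': list(spec_arqs.values()),   # bound Cyborg ARQs
#     }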
2938 def _get_bound_arq_resources(self, context, instance, arq_uuids):
2939 """Get bound accelerator requests.
2941 The ARQ binding was kicked off in the conductor as an async
2942 operation. Here we wait for the notification from Cyborg.
2944 If the notification arrived before this point, which can happen
2945 in many/most cases (see [1]), it will be lost. To handle that,
2946 we use exit_wait_early.
2947 [1] https://review.opendev.org/#/c/631244/46/nova/compute/
2948 manager.py@2627
2950 :param instance: instance object
2951 :param arq_uuids: List of accelerator request (ARQ) UUIDs.
2952 :returns: List of ARQs for which bindings have completed,
2953 successfully or otherwise
2954 """
2956 cyclient = cyborg.get_client(context)
2957 if arq_uuids is None:
2958 arqs = cyclient.get_arqs_for_instance(instance.uuid)
2959 arq_uuids = [arq['uuid'] for arq in arqs]
2960 events = [('accelerator-request-bound', arq_uuid)
2961 for arq_uuid in arq_uuids]
2963 timeout = CONF.arq_binding_timeout
2964 with self.virtapi.wait_for_instance_event(
2965 instance, events, deadline=timeout):
2966 resolved_arqs = cyclient.get_arqs_for_instance(
2967 instance.uuid, only_resolved=True)
2968 # Events for these resolved ARQs may have already arrived.
2969 # Such 'early' events need to be ignored.
2970 early_events = [('accelerator-request-bound', arq['uuid'])
2971 for arq in resolved_arqs]
2972 if early_events:
2973 self.virtapi.exit_wait_early(early_events)
2975 # Since a timeout in wait_for_instance_event will raise, we get
2976 # here only if all binding events have been received.
2977 resolved_uuids = [arq['uuid'] for arq in resolved_arqs]
2978 if sorted(resolved_uuids) != sorted(arq_uuids):
2979 # Query Cyborg to get all.
2980 arqs = cyclient.get_arqs_for_instance(instance.uuid)
2981 else:
2982 arqs = resolved_arqs
2983 return arqs
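# NOTE: an illustrative sketch (editorial, not part of the original source)
# of the event tuples waited on above, assuming two ARQ UUIDs; bindings that
# Cyborg has already resolved are acknowledged via exit_wait_early so their
# possibly already-delivered notifications are not waited for again:
#
#     events = [('accelerator-request-bound', 'arq-uuid-1'),
#               ('accelerator-request-bound', 'arq-uuid-2')]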
2985 def _split_network_arqs(self, arqs, requested_networks):
2986 """split arq request by extra spec from ARQ requested by port.
2988 Return ARQ groups tuple:(spec_arqs, port_arqs)
2989 Each item in the tuple is a dict like:
2990 {
2991 arq1_uuid: arq1
2992 }
2993 """
2994 port_arqs = {}
2995 spec_arqs = {}
2996 port_arqs_uuids = [req_net.arq_uuid for req_net in requested_networks]
2997 for arq in arqs:
2998 if arq['uuid'] in port_arqs_uuids:
2999 port_arqs.update({arq['uuid']: arq})
3000 else:
3001 spec_arqs.update({arq['uuid']: arq})
3003 return spec_arqs, port_arqs
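# NOTE: a minimal illustration (editorial, not part of the original source)
# of the split above, assuming one ARQ requested through a port and one
# through flavor extra specs; the UUIDs are hypothetical placeholders:
#
#     requested_networks -> [NetworkRequest(arq_uuid='arq-port-1'), ...]
#     arqs -> [{'uuid': 'arq-port-1', ...}, {'uuid': 'arq-spec-1', ...}]
#     _split_network_arqs(arqs, requested_networks)
#     -> ({'arq-spec-1': {...}}, {'arq-port-1': {...}})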
3005 def _cleanup_allocated_networks(self, context, instance,
3006 requested_networks):
3007 """Cleanup networks allocated for instance.
3009 :param context: nova request context
3010 :param instance: nova.objects.instance.Instance object
3011 :param requested_networks: nova.objects.NetworkRequestList
3012 """
3013 LOG.debug('Unplugging VIFs for instance', instance=instance)
3015 network_info = instance.get_network_info()
3017 # NOTE(stephenfin) to avoid nova destroying the instance without
3018 # unplugging the interface, refresh network_info if it is empty.
3019 if not network_info:  [3019 ↛ 3032: line 3019 didn't jump to line 3032 because the condition on line 3019 was always true]
3020 try:
3021 network_info = self.network_api.get_instance_nw_info(
3022 context, instance,
3023 )
3024 except Exception as exc:
3025 LOG.warning(
3026 'Failed to update network info cache when cleaning up '
3027 'allocated networks. Stale VIFs may be left on this host. '
3028 'Error: %s', str(exc)
3029 )
3030 return
3032 try:
3033 self.driver.unplug_vifs(instance, network_info)
3034 except NotImplementedError:
3035 # This is an optional method so ignore things if it doesn't exist
3036 LOG.debug(
3037 'Virt driver does not provide unplug_vifs method, so it '
3038 'is not possible to determine if VIFs should be unplugged.'
3039 )
3040 except Exception as exc:
3041 # It's possible that the instance never got as far as plugging
3042 # VIFs, in which case we would see an exception which can be
3043 # mostly ignored
3044 LOG.warning(
3045 'Cleaning up VIFs failed for instance. Error: %s',
3046 str(exc), instance=instance,
3047 )
3048 else:
3049 LOG.debug('Unplugged VIFs for instance', instance=instance)
3051 try:
3052 self._deallocate_network(context, instance, requested_networks)
3053 except Exception:
3054 LOG.exception('Failed to deallocate networks', instance=instance)
3055 return
3057 instance.system_metadata['network_allocated'] = 'False'
3058 try:
3059 instance.save()
3060 except exception.InstanceNotFound:
3061 # NOTE(alaski): It's possible that we're cleaning up the networks
3062 # because the instance was deleted. If that's the case then this
3063 # exception will be raised by instance.save()
3064 pass
3066 def _try_deallocate_network(self, context, instance,
3067 requested_networks=None):
3069 # During auto-scale cleanup, we could be deleting a large number
3070 # of servers at the same time and overloading parts of the system,
3071 # so we retry a few times in case of connection failures to the
3072 # networking service.
3073 @loopingcall.RetryDecorator(
3074 max_retry_count=3, inc_sleep_time=2, max_sleep_time=12,
3075 exceptions=(keystone_exception.connection.ConnectFailure,))
3076 def _deallocate_network_with_retries():
3077 try:
3078 self._deallocate_network(
3079 context, instance, requested_networks)
3080 except keystone_exception.connection.ConnectFailure as e:
3081 # Provide a warning that something is amiss.
3082 with excutils.save_and_reraise_exception():
3083 LOG.warning('Failed to deallocate network for instance; '
3084 'retrying. Error: %s', str(e),
3085 instance=instance)
3087 try:
3088 # tear down allocated network structure
3089 _deallocate_network_with_retries()
3090 except Exception as ex:
3091 with excutils.save_and_reraise_exception():
3092 LOG.error('Failed to deallocate network for instance. '
3093 'Error: %s', ex, instance=instance)
3094 self._set_instance_obj_error_state(instance)
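# NOTE: an editorial illustration (not part of the original source) of the
# retry policy above: with max_retry_count=3 the decorated helper may be
# attempted up to four times in total (the initial call plus three retries),
# sleeping progressively longer between attempts, capped by
# max_sleep_time=12, whenever a keystone ConnectFailure is raised.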
3096 def _get_power_off_values(self, instance, clean_shutdown):
3097 """Get the timing configuration for powering down this instance."""
3098 if clean_shutdown:
3099 timeout = compute_utils.get_value_from_system_metadata(instance,
3100 key='image_os_shutdown_timeout', type=int,
3101 default=CONF.shutdown_timeout)
3102 retry_interval = CONF.compute.shutdown_retry_interval
3103 else:
3104 timeout = 0
3105 retry_interval = 0
3107 return timeout, retry_interval
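# NOTE: illustrative return values (editorial, not part of the original
# source) for the helper above, assuming no per-image
# 'image_os_shutdown_timeout' override is set in system metadata:
#
#     self._get_power_off_values(instance, clean_shutdown=True)
#     -> (CONF.shutdown_timeout, CONF.compute.shutdown_retry_interval)
#     self._get_power_off_values(instance, clean_shutdown=False)
#     -> (0, 0)   # immediate hard power off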
3109 def _power_off_instance(self, context, instance, clean_shutdown=True):
3110 """Power off an instance on this host."""
3111 share_info = self._get_share_info(context, instance)
3112 timeout, retry_interval = self._get_power_off_values(
3113 instance, clean_shutdown)
3114 self.driver.power_off(instance, timeout, retry_interval)
3115 share_info.deactivate_all()
3116 self._umount_all_shares(context, instance, share_info)
3118 def _shutdown_instance(self, context, instance,
3119 bdms, requested_networks=None, notify=True,
3120 try_deallocate_networks=True):
3121 """Shutdown an instance on this host.
3123 :param context: security context
3124 :param instance: a nova.objects.Instance object
3125 :param bdms: the block devices for the instance to be torn
3126 down
3127 :param requested_networks: the networks on which the instance
3128 has ports
3129 :param notify: true if a final usage notification should be
3130 emitted
3131 :param try_deallocate_networks: false if we should avoid
3132 trying to tear down networking
3133 """
3134 context = context.elevated()
3135 LOG.info('Terminating instance', instance=instance)
3137 if notify:
3138 self._notify_about_instance_usage(context, instance,
3139 "shutdown.start")
3140 compute_utils.notify_about_instance_action(context, instance,
3141 self.host, action=fields.NotificationAction.SHUTDOWN,
3142 phase=fields.NotificationPhase.START, bdms=bdms)
3144 network_info = instance.get_network_info()
3146 share_info = self._get_share_info(
3147 context, instance, check_status=False
3148 )
3150 # NOTE(arnaudmorin) to avoid nova destroying the instance without
3151 # unplugging the interface, refresh network_info if it is empty.
3152 if not network_info:
3153 network_info = self.network_api.get_instance_nw_info(
3154 context, instance)
3156 # NOTE(vish) get bdms before destroying the instance
3157 vol_bdms = [bdm for bdm in bdms if bdm.is_volume]
3158 block_device_info = self._get_instance_block_device_info(
3159 context, instance, bdms=bdms)
3161 # NOTE(melwitt): attempt driver destroy before releasing ip, may
3162 # want to keep ip allocated for certain failures
3163 try:
3164 LOG.debug('Start destroying the instance on the hypervisor.',
3165 instance=instance)
3166 with timeutils.StopWatch() as timer:
3167 self.driver.destroy(context, instance, network_info,
3168 block_device_info)
3169 LOG.info('Took %0.2f seconds to destroy the instance on the '
3170 'hypervisor.', timer.elapsed(), instance=instance)
3171 except exception.InstancePowerOffFailure:
3172 # if the instance can't power off, don't release the ip
3173 with excutils.save_and_reraise_exception():
3174 pass
3175 except Exception:
3176 with excutils.save_and_reraise_exception():
3177 # deallocate ip and fail without proceeding to
3178 # volume api calls, preserving current behavior
3179 if try_deallocate_networks:  [3179 ↛ 3183: line 3179 didn't jump to line 3183]
3180 self._try_deallocate_network(context, instance,
3181 requested_networks)
3183 if try_deallocate_networks:
3184 self._try_deallocate_network(context, instance, requested_networks)
3186 timer.restart()
3188 for share in share_info:
3189 # If we fail to unmount or deny the share we may have a
3190 # dangling share_mapping in the DB (share_mapping entry with an
3191 # instance that does not exist anymore).
3192 try:
3193 self._umount_share(context, instance, share)
3194 share.deactivate()
3195 self.deny_share(context, instance, share)
3196 except (
3197 exception.ShareUmountError,
3198 exception.ShareNotFound,
3199 exception.ShareAccessNotFound,
3200 exception.ShareAccessRemovalError,
3201 ):
3202 LOG.warning("An error occurred while unmounting or "
3203 "denying the share '%s'. This error is ignored to "
3204 "proceed with instance removal. "
3205 "Consequently, there may be a dangling "
3206 "share_mapping.", share.share_id)
3208 connector = None
3209 for bdm in vol_bdms:
3210 try:
3211 if bdm.attachment_id:
3212 self.volume_api.attachment_delete(context,
3213 bdm.attachment_id)
3214 else:
3215 # NOTE(vish): actual driver detach done in driver.destroy,
3216 # so just tell cinder that we are done with it.
3217 if connector is None:
3218 connector = self.driver.get_volume_connector(instance)
3219 self.volume_api.terminate_connection(context,
3220 bdm.volume_id,
3221 connector)
3222 self.volume_api.detach(context, bdm.volume_id,
3223 instance.uuid)
3225 except exception.VolumeAttachmentNotFound as exc:
3226 LOG.debug('Ignoring VolumeAttachmentNotFound: %s', exc,
3227 instance=instance)
3228 except exception.DiskNotFound as exc:
3229 LOG.debug('Ignoring DiskNotFound: %s', exc,
3230 instance=instance)
3231 except exception.VolumeNotFound as exc:
3232 LOG.debug('Ignoring VolumeNotFound: %s', exc,
3233 instance=instance)
3234 except (cinder_exception.EndpointNotFound,
3235 keystone_exception.EndpointNotFound) as exc:
3236 LOG.warning('Ignoring EndpointNotFound for '
3237 'volume %(volume_id)s: %(exc)s',
3238 {'exc': exc, 'volume_id': bdm.volume_id},
3239 instance=instance)
3240 except cinder_exception.ClientException as exc:
3241 LOG.warning('Ignoring unknown cinder exception for '
3242 'volume %(volume_id)s: %(exc)s',
3243 {'exc': exc, 'volume_id': bdm.volume_id},
3244 instance=instance)
3245 except Exception as exc:
3246 LOG.warning('Ignoring unknown exception for '
3247 'volume %(volume_id)s: %(exc)s',
3248 {'exc': exc, 'volume_id': bdm.volume_id},
3249 instance=instance)
3250 if vol_bdms:
3251 LOG.info('Took %(time).2f seconds to detach %(num)s volumes '
3252 'for instance.',
3253 {'time': timer.elapsed(), 'num': len(vol_bdms)},
3254 instance=instance)
3256 if notify:
3257 self._notify_about_instance_usage(context, instance,
3258 "shutdown.end")
3259 compute_utils.notify_about_instance_action(context, instance,
3260 self.host, action=fields.NotificationAction.SHUTDOWN,
3261 phase=fields.NotificationPhase.END, bdms=bdms)
3263 def _cleanup_volumes(self, context, instance, bdms, raise_exc=True,
3264 detach=True):
3265 original_exception = None
3266 for bdm in bdms:
3267 if detach and bdm.volume_id:
3268 try:
3269 LOG.debug("Detaching volume: %s", bdm.volume_id,
3270 instance_uuid=instance.uuid)
3271 destroy = bdm.delete_on_termination
3272 self._detach_volume(context, bdm, instance,
3273 destroy_bdm=destroy)
3274 except Exception as exc:
3275 original_exception = exc
3276 LOG.warning('Failed to detach volume: %(volume_id)s '
3277 'due to %(exc)s',
3278 {'volume_id': bdm.volume_id, 'exc': exc})
3280 if bdm.volume_id and bdm.delete_on_termination:
3281 try:
3282 LOG.debug("Deleting volume: %s", bdm.volume_id,
3283 instance_uuid=instance.uuid)
3284 self.volume_api.delete(context, bdm.volume_id)
3285 except Exception as exc:
3286 original_exception = exc
3287 LOG.warning('Failed to delete volume: %(volume_id)s '
3288 'due to %(exc)s',
3289 {'volume_id': bdm.volume_id, 'exc': exc})
3290 if original_exception is not None and raise_exc:
3291 raise original_exception
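# NOTE: an editorial note (not part of the original source): with
# raise_exc=True the last exception seen while detaching or deleting is
# re-raised only after every BDM has been processed, so a single failing
# volume does not stop cleanup of the remaining ones.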
3293 def _delete_instance(self, context, instance, bdms):
3294 """Delete an instance on this host.
3296 :param context: nova request context
3297 :param instance: nova.objects.instance.Instance object
3298 :param bdms: nova.objects.block_device.BlockDeviceMappingList object
3299 """
3300 events = self.instance_events.clear_events_for_instance(instance)
3301 if events:  [3301 ↛ 3302: line 3301 didn't jump to line 3302 because the condition on line 3301 was never true]
3302 LOG.debug('Events pending at deletion: %(events)s',
3303 {'events': ','.join(events.keys())},
3304 instance=instance)
3305 self._notify_about_instance_usage(context, instance,
3306 "delete.start")
3307 compute_utils.notify_about_instance_action(context, instance,
3308 self.host, action=fields.NotificationAction.DELETE,
3309 phase=fields.NotificationPhase.START, bdms=bdms)
3311 self._shutdown_instance(context, instance, bdms)
3313 # NOTE(vish): We have already deleted the instance, so we have
3314 # to ignore problems cleaning up the volumes. It
3315 # would be nice to let the user know somehow that
3316 # the volume deletion failed, but it is not
3317 # acceptable to have an instance that can not be
3318 # deleted. Perhaps this could be reworked in the
3319 # future to set an instance fault the first time
3320 # and to only ignore the failure if the instance
3321 # is already in ERROR.
3323 # NOTE(ameeda): The volumes have already been detached during
3324 # the above _shutdown_instance() call and this is
3325 # why detach is not requested from
3326 # _cleanup_volumes() in this case
3328 self._cleanup_volumes(context, instance, bdms,
3329 raise_exc=False, detach=False)
3330 # Delete Cyborg ARQs if the instance has a device profile.
3331 compute_utils.delete_arqs_if_needed(context, instance)
3332 # if a delete task succeeded, always update vm state and task
3333 # state without expecting task state to be DELETING
3334 instance.vm_state = vm_states.DELETED
3335 instance.task_state = None
3336 instance.power_state = power_state.NOSTATE
3337 instance.terminated_at = timeutils.utcnow()
3338 instance.save()
3340 self._complete_deletion(context, instance)
3341 # only destroy the instance in the db if the _complete_deletion
3342 # doesn't raise and therefore allocation is successfully
3343 # deleted in placement
3344 instance.destroy()
3346 self._notify_about_instance_usage(context, instance, "delete.end")
3347 compute_utils.notify_about_instance_action(context, instance,
3348 self.host, action=fields.NotificationAction.DELETE,
3349 phase=fields.NotificationPhase.END, bdms=bdms)
3351 @wrap_exception()
3352 @reverts_task_state
3353 @wrap_instance_event(prefix='compute')
3354 @wrap_instance_fault
3355 def terminate_instance(self, context, instance, bdms):
3356 """Terminate an instance on this host."""
3357 @utils.synchronized(instance.uuid)
3358 def do_terminate_instance(instance, bdms):
3359 # NOTE(mriedem): If we are deleting the instance while it was
3360 # booting from volume, we could be racing with a database update of
3361 # the BDM volume_id. Since the compute API passes the BDMs over RPC
3362 # to compute here, the BDMs may be stale at this point. So check
3363 # for any volume BDMs that don't have volume_id set and if we
3364 # detect that, we need to refresh the BDM list before proceeding.
3365 # TODO(mriedem): Move this into _delete_instance and make the bdms
3366 # parameter optional.
3367 for bdm in list(bdms):
3368 if bdm.is_volume and not bdm.volume_id:
3369 LOG.debug('There are potentially stale BDMs during '
3370 'delete, refreshing the BlockDeviceMappingList.',
3371 instance=instance)
3372 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
3373 context, instance.uuid)
3374 break
3375 try:
3376 self._delete_instance(context, instance, bdms)
3377 except exception.InstanceNotFound:
3378 LOG.info("Instance disappeared during terminate",
3379 instance=instance)
3380 except Exception:
3381 # As we're trying to delete always go to Error if something
3382 # goes wrong that _delete_instance can't handle.
3383 with excutils.save_and_reraise_exception():
3384 LOG.exception('Setting instance vm_state to ERROR',
3385 instance=instance)
3386 self._set_instance_obj_error_state(instance)
3388 do_terminate_instance(instance, bdms)
3390 # NOTE(johannes): This is probably better named power_off_instance
3391 # so it matches the driver method, but because of other issues, we
3392 # can't use that name in grizzly.
3393 @wrap_exception()
3394 @reverts_task_state
3395 @wrap_instance_event(prefix='compute')
3396 @wrap_instance_fault
3397 def stop_instance(self, context, instance, clean_shutdown):
3398 """Stopping an instance on this host."""
3400 @utils.synchronized(instance.uuid)
3401 def do_stop_instance():
3402 current_power_state = self._get_power_state(instance)
3404 LOG.debug('Stopping instance; current vm_state: %(vm_state)s, '
3405 'current task_state: %(task_state)s, current DB '
3406 'power_state: %(db_power_state)s, current VM '
3407 'power_state: %(current_power_state)s',
3408 {'vm_state': instance.vm_state,
3409 'task_state': instance.task_state,
3410 'db_power_state': instance.power_state,
3411 'current_power_state': current_power_state},
3412 instance_uuid=instance.uuid)
3414 # NOTE(mriedem): If the instance is already powered off, we are
3415 # possibly tearing down and racing with other operations, so we can
3416 # expect the task_state to be None if something else updates the
3417 # instance and we're not locking it.
3418 expected_task_state = [task_states.POWERING_OFF]
3419 # The list of power states is from _sync_instance_power_state.
3420 if current_power_state in (power_state.NOSTATE,
3421 power_state.SHUTDOWN,
3422 power_state.CRASHED):
3423 LOG.info('Instance is already powered off in the '
3424 'hypervisor when stop is called.',
3425 instance=instance)
3426 expected_task_state.append(None)
3428 self._notify_about_instance_usage(context, instance,
3429 "power_off.start")
3431 compute_utils.notify_about_instance_action(context, instance,
3432 self.host, action=fields.NotificationAction.POWER_OFF,
3433 phase=fields.NotificationPhase.START)
3435 self._power_off_instance(context, instance, clean_shutdown)
3436 instance.power_state = self._get_power_state(instance)
3437 instance.vm_state = vm_states.STOPPED
3438 instance.task_state = None
3439 instance.save(expected_task_state=expected_task_state)
3440 self._notify_about_instance_usage(context, instance,
3441 "power_off.end")
3443 compute_utils.notify_about_instance_action(context, instance,
3444 self.host, action=fields.NotificationAction.POWER_OFF,
3445 phase=fields.NotificationPhase.END)
3447 do_stop_instance()
3449 def _power_on(self, context, instance):
3450 network_info = self.network_api.get_instance_nw_info(context, instance)
3451 block_device_info = self._get_instance_block_device_info(context,
3452 instance)
3453 accel_info = self._get_accel_info(context, instance)
3455 share_info = self._get_share_info(context, instance)
3457 self._mount_all_shares(context, instance, share_info)
3458 self.driver.power_on(context, instance,
3459 network_info,
3460 block_device_info, accel_info, share_info)
3461 share_info.activate_all()
3463 def _delete_snapshot_of_shelved_instance(self, context, instance,
3464 snapshot_id):
3465 """Delete snapshot of shelved instance."""
3466 try:
3467 self.image_api.delete(context, snapshot_id)
3468 except (exception.ImageNotFound,
3469 exception.ImageNotAuthorized) as exc:
3470 LOG.warning("Failed to delete snapshot "
3471 "from shelved instance (%s).",
3472 exc.format_message(), instance=instance)
3473 except Exception:
3474 LOG.exception("Something wrong happened when trying to "
3475 "delete snapshot from shelved instance.",
3476 instance=instance)
3478 # NOTE(johannes): This is probably better named power_on_instance
3479 # so it matches the driver method, but because of other issues, we
3480 # can't use that name in grizzly.
3481 @wrap_exception()
3482 @reverts_task_state
3483 @wrap_instance_event(prefix='compute')
3484 @wrap_instance_fault
3485 def start_instance(self, context, instance):
3486 """Starting an instance on this host."""
3487 self._notify_about_instance_usage(context, instance, "power_on.start")
3488 compute_utils.notify_about_instance_action(context, instance,
3489 self.host, action=fields.NotificationAction.POWER_ON,
3490 phase=fields.NotificationPhase.START)
3491 self._power_on(context, instance)
3492 instance.power_state = self._get_power_state(instance)
3493 instance.vm_state = vm_states.ACTIVE
3494 instance.task_state = None
3496 # Delete an image(VM snapshot) for a shelved instance
3497 snapshot_id = instance.system_metadata.get('shelved_image_id')
3498 if snapshot_id:
3499 self._delete_snapshot_of_shelved_instance(context, instance,
3500 snapshot_id)
3502 # Delete system_metadata for a shelved instance
3503 compute_utils.remove_shelved_keys_from_system_metadata(instance)
3505 instance.save(expected_task_state=task_states.POWERING_ON)
3506 self._notify_about_instance_usage(context, instance, "power_on.end")
3507 compute_utils.notify_about_instance_action(context, instance,
3508 self.host, action=fields.NotificationAction.POWER_ON,
3509 phase=fields.NotificationPhase.END)
3511 @messaging.expected_exceptions(NotImplementedError,
3512 exception.TriggerCrashDumpNotSupported,
3513 exception.InstanceNotRunning)
3514 @wrap_exception()
3515 @wrap_instance_event(prefix='compute')
3516 @wrap_instance_fault
3517 def trigger_crash_dump(self, context, instance):
3518 """Trigger crash dump in an instance."""
3520 self._notify_about_instance_usage(context, instance,
3521 "trigger_crash_dump.start")
3522 compute_utils.notify_about_instance_action(context, instance,
3523 self.host, action=fields.NotificationAction.TRIGGER_CRASH_DUMP,
3524 phase=fields.NotificationPhase.START)
3526 # This method does not change task_state and power_state because the
3527 # effect of a trigger depends on the user's configuration.
3528 self.driver.trigger_crash_dump(instance)
3530 self._notify_about_instance_usage(context, instance,
3531 "trigger_crash_dump.end")
3532 compute_utils.notify_about_instance_action(context, instance,
3533 self.host, action=fields.NotificationAction.TRIGGER_CRASH_DUMP,
3534 phase=fields.NotificationPhase.END)
3536 @wrap_exception()
3537 @reverts_task_state
3538 @wrap_instance_event(prefix='compute')
3539 @wrap_instance_fault
3540 def soft_delete_instance(self, context, instance):
3541 """Soft delete an instance on this host."""
3542 with compute_utils.notify_about_instance_delete(
3543 self.notifier, context, instance, 'soft_delete',
3544 source=fields.NotificationSource.COMPUTE):
3545 try:
3546 self.driver.soft_delete(instance)
3547 except NotImplementedError:
3548 # Fall back to just powering off the instance if the
3549 # hypervisor doesn't implement the soft_delete method
3550 self._power_off_instance(
3551 context, instance, clean_shutdown=False
3552 )
3553 instance.power_state = self._get_power_state(instance)
3554 instance.vm_state = vm_states.SOFT_DELETED
3555 instance.task_state = None
3556 instance.save(expected_task_state=[task_states.SOFT_DELETING])
3558 @wrap_exception()
3559 @reverts_task_state
3560 @wrap_instance_event(prefix='compute')
3561 @wrap_instance_fault
3562 def restore_instance(self, context, instance):
3563 """Restore a soft-deleted instance on this host."""
3564 self._notify_about_instance_usage(context, instance, "restore.start")
3565 compute_utils.notify_about_instance_action(context, instance,
3566 self.host, action=fields.NotificationAction.RESTORE,
3567 phase=fields.NotificationPhase.START)
3568 try:
3569 self.driver.restore(instance)
3570 except NotImplementedError:
3571 # Fall back to just powering on the instance if the hypervisor
3572 # doesn't implement the restore method
3573 self._power_on(context, instance)
3574 instance.power_state = self._get_power_state(instance)
3575 instance.vm_state = vm_states.ACTIVE
3576 instance.task_state = None
3577 instance.save(expected_task_state=task_states.RESTORING)
3578 self._notify_about_instance_usage(context, instance, "restore.end")
3579 compute_utils.notify_about_instance_action(context, instance,
3580 self.host, action=fields.NotificationAction.RESTORE,
3581 phase=fields.NotificationPhase.END)
3583 @staticmethod
3584 def _set_migration_status(migration, status):
3585 """Set the status, and guard against a None being passed in.
3587 This is useful as some of the compute RPC calls will not pass
3588 a migration object in older versions. The check can be removed when
3589 we move past 4.x major version of the RPC API.
3590 """
3591 if migration:
3592 migration.status = status
3593 migration.save()
3595 @staticmethod
3596 def _reimage_failed_callback(event_name, instance):
3597 msg = ('Cinder reported failure during reimaging '
3598 'with %(event)s for instance %(uuid)s')
3599 msg_args = {'event': event_name, 'uuid': instance.uuid}
3600 LOG.error(msg, msg_args)
3601 raise exception.ReimageException(msg % msg_args)
3603 def _detach_root_volume(self, context, instance, root_bdm):
3604 volume_id = root_bdm.volume_id
3605 mp = root_bdm.device_name
3606 old_connection_info = jsonutils.loads(root_bdm.connection_info)
3607 try:
3608 self.driver.detach_volume(context, old_connection_info,
3609 instance, root_bdm.device_name)
3610 except exception.DiskNotFound as err:
3611 LOG.warning('Ignoring DiskNotFound exception while '
3612 'detaching volume %(volume_id)s from '
3613 '%(mp)s : %(err)s',
3614 {'volume_id': volume_id, 'mp': mp,
3615 'err': err}, instance=instance)
3616 except exception.DeviceDetachFailed:
3617 with excutils.save_and_reraise_exception():
3618 LOG.warning('Guest refused to detach volume %(vol)s',
3619 {'vol': volume_id}, instance=instance)
3620 self.volume_api.roll_detaching(context, volume_id)
3621 except Exception:
3622 with excutils.save_and_reraise_exception():
3623 LOG.exception('Failed to detach volume '
3624 '%(volume_id)s from %(mp)s',
3625 {'volume_id': volume_id, 'mp': mp},
3626 instance=instance)
3627 self.volume_api.roll_detaching(context, volume_id)
3629 def _rebuild_volume_backed_instance(self, context, instance, bdms,
3630 image_id):
3631 # Get root bdm and attachment ID associated to it
3632 root_bdm = compute_utils.get_root_bdm(context, instance, bdms)
3633 old_attachment_id = root_bdm.attachment_id
3635 # Create a new attachment and delete the previous attachment
3636 # We create a new attachment first to keep the volume in
3637 # reserved state after old attachment is deleted and avoid any
3638 # races in between the attachment create and delete.
3639 attachment_id = None
3640 try:
3641 attachment_id = self.volume_api.attachment_create(
3642 context, root_bdm.volume_id, instance.uuid)['id']
3643 self._detach_root_volume(context, instance, root_bdm)
3644 root_bdm.attachment_id = attachment_id
3645 root_bdm.save()
3646 self.volume_api.attachment_delete(context,
3647 old_attachment_id)
3648 except exception.InstanceNotFound:
3649 # This means we failed to save the new attachment because
3650 # the instance is deleted, so (try to) delete it and abort.
3651 try:
3652 self.volume_api.attachment_delete(context,
3653 attachment_id)
3654 except cinder_exception.ClientException:
3655 LOG.error('Failed to delete new attachment %s',
3656 attachment_id)
3657 msg = _('Failed to rebuild volume backed instance.')
3658 raise exception.BuildAbortException(
3659 instance_uuid=instance.uuid, reason=msg)
3660 except cinder_exception.ClientException:
3661 if attachment_id:
3662 LOG.error('Failed to delete old attachment %s',
3663 old_attachment_id)
3664 else:
3665 LOG.error('Failed to create new attachment')
3666 msg = _('Failed to rebuild volume backed instance.')
3667 raise exception.BuildAbortException(
3668 instance_uuid=instance.uuid, reason=msg)
3669 events = [('volume-reimaged', root_bdm.volume_id)]
3671 # Get the image requested for rebuild
3672 try:
3673 image = self.image_api.get(context, image_id)
3674 except exception.ImageNotFound:
3675 msg = _('Image %s not found.') % image_id
3676 LOG.error(msg)
3677 raise exception.BuildAbortException(
3678 instance_uuid=instance.uuid, reason=msg)
3679 image_size = int(math.ceil(float(image.get('size')) / units.Gi))
3680 deadline = CONF.reimage_timeout_per_gb * image_size
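# NOTE: a worked example (editorial, not part of the original source) of
# the deadline above: a 2.5 GiB image rounds up via math.ceil to
# image_size = 3, so if CONF.reimage_timeout_per_gb were, say, 20 seconds
# the wait deadline would be 3 * 20 = 60 seconds.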
3681 error_cb = self._reimage_failed_callback
3683 # Call cinder to perform reimage operation and wait until an
3684 # external event is triggered.
3685 try:
3686 with self.virtapi.wait_for_instance_event(instance, events,
3687 deadline=deadline,
3688 error_callback=error_cb):
3689 self.volume_api.reimage_volume(
3690 context, root_bdm.volume_id, image_id,
3691 reimage_reserved=True)
3693 except Exception as ex:
3694 LOG.error('Failed to rebuild volume backed instance: %s',
3695 str(ex), instance=instance)
3696 msg = _('Failed to rebuild volume backed instance.')
3697 raise exception.BuildAbortException(
3698 instance_uuid=instance.uuid, reason=msg)
3700 def _rebuild_default_impl(
3701 self, context, instance, image_meta, injected_files,
3702 admin_password, allocations, bdms, detach_block_devices,
3703 attach_block_devices, network_info=None, evacuate=False,
3704 block_device_info=None, preserve_ephemeral=False,
3705 accel_uuids=None, reimage_boot_volume=False):
3706 if preserve_ephemeral:  [3706 ↛ 3709: line 3706 didn't jump to line 3709 because the condition on line 3706 was never true]
3707 # The default code path does not support preserving ephemeral
3708 # partitions.
3709 raise exception.PreserveEphemeralNotSupported()
3711 accel_info = []
3712 detach_root_bdm = not reimage_boot_volume
3713 if evacuate:
3714 if instance.flavor.extra_specs.get('accel:device_profile'):  [3714 ↛ 3715: line 3714 didn't jump to line 3715 because the condition on line 3714 was never true]
3715 try:
3716 accel_info = self._get_bound_arq_resources(
3717 context, instance, accel_uuids or [])
3718 except (Exception, eventlet.timeout.Timeout) as exc:
3719 LOG.exception(exc)
3720 self._build_resources_cleanup(instance, network_info)
3721 msg = _('Failure getting accelerator resources.')
3722 raise exception.BuildAbortException(
3723 instance_uuid=instance.uuid, reason=msg)
3724 detach_block_devices(context, bdms,
3725 detach_root_bdm=detach_root_bdm)
3726 else:
3727 self._power_off_instance(context, instance, clean_shutdown=True)
3728 detach_block_devices(context, bdms,
3729 detach_root_bdm=detach_root_bdm)
3730 if reimage_boot_volume:
3731 # Previously, the calls reaching here were for image-
3732 # backed instance rebuilds and didn't have a root BDM,
3733 # so now we need to handle the root BDM case as well.
3734 # For the root BDM, we do the attach/detach operations
3735 # manually, as we want to maintain a 'reserved' state
3736 # throughout the reimage process on the cinder side, so
3737 # we exclude the root BDM from certain operations
3738 # here, i.e. deleting its mapping before the destroy call.
3739 block_device_info_copy = copy.deepcopy(block_device_info)
3740 root_bdm = compute_utils.get_root_bdm(context, instance, bdms)
3741 mapping = block_device_info_copy["block_device_mapping"]
3742 # drop root bdm from the mapping
3743 mapping = [
3744 bdm for bdm in mapping
3745 if bdm["volume_id"] != root_bdm.volume_id
3746 ]
3747 self.driver.destroy(context, instance,
3748 network_info=network_info,
3749 block_device_info=block_device_info_copy)
3750 else:
3751 self.driver.destroy(context, instance,
3752 network_info=network_info,
3753 block_device_info=block_device_info)
3754 try:
3755 accel_info = self._get_accel_info(context, instance)
3756 except Exception as exc:
3757 LOG.exception(exc)
3758 self._build_resources_cleanup(instance, network_info)
3759 msg = _('Failure getting accelerator resources.')
3760 raise exception.BuildAbortException(
3761 instance_uuid=instance.uuid, reason=msg)
3762 if reimage_boot_volume:
3763 is_volume_backed = compute_utils.is_volume_backed_instance(
3764 context, instance, bdms)
3765 if is_volume_backed:
3766 self._rebuild_volume_backed_instance(
3767 context, instance, bdms, image_meta.id)
3769 instance.task_state = task_states.REBUILD_BLOCK_DEVICE_MAPPING
3770 instance.save(expected_task_state=[task_states.REBUILDING])
3772 new_block_device_info = attach_block_devices(context, instance, bdms)
3774 instance.task_state = task_states.REBUILD_SPAWNING
3775 instance.save(
3776 expected_task_state=[task_states.REBUILD_BLOCK_DEVICE_MAPPING])
3778 with instance.mutated_migration_context():
3779 self.driver.spawn(context, instance, image_meta, injected_files,
3780 admin_password, allocations,
3781 network_info=network_info,
3782 block_device_info=new_block_device_info,
3783 accel_info=accel_info)
3785 def _notify_instance_rebuild_error(self, context, instance, error, bdms):
3786 self._notify_about_instance_usage(context, instance,
3787 'rebuild.error', fault=error)
3788 compute_utils.notify_about_instance_rebuild(
3789 context, instance, self.host,
3790 phase=fields.NotificationPhase.ERROR, exception=error, bdms=bdms)
3792 @messaging.expected_exceptions(exception.PreserveEphemeralNotSupported,
3793 exception.BuildAbortException)
3794 @wrap_exception()
3795 @reverts_task_state
3796 @wrap_instance_event(prefix='compute')
3797 @wrap_instance_fault
3798 def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
3799 injected_files, new_pass, orig_sys_metadata,
3800 bdms, recreate, on_shared_storage,
3801 preserve_ephemeral, migration,
3802 scheduled_node, limits, request_spec, accel_uuids,
3803 reimage_boot_volume=None, target_state=None):
3804 """Destroy and re-make this instance.
3806 A 'rebuild' effectively purges all existing data from the system and
3807 remakes the VM with given 'metadata' and 'personalities'.
3809 :param context: `nova.RequestContext` object
3810 :param instance: Instance object
3811 :param orig_image_ref: Original image_ref before rebuild
3812 :param image_ref: New image_ref for rebuild
3813 :param injected_files: Files to inject
3814 :param new_pass: password to set on rebuilt instance
3815 :param orig_sys_metadata: instance system metadata from pre-rebuild
3816 :param bdms: block-device-mappings to use for rebuild
3817 :param recreate: True if the instance is being evacuated (e.g. the
3818 hypervisor it was on failed) - cleanup of old state will be
3819 skipped.
3820 :param on_shared_storage: True if instance files on shared storage.
3821 If not provided then information from the
3822 driver will be used to decide if the instance
3823 files are available or not on the target host
3824 :param preserve_ephemeral: True if the default ephemeral storage
3825 partition must be preserved on rebuild
3826 :param migration: a Migration object if one was created for this
3827 rebuild operation (if it's a part of evacuate)
3828 :param scheduled_node: A node of the host chosen by the scheduler. If a
3829 host was specified by the user, this will be
3830 None
3831 :param limits: Overcommit limits set by the scheduler. If a host was
3832 specified by the user, this will be None
3833 :param request_spec: a RequestSpec object used to schedule the instance
3834 :param accel_uuids: a list of cyborg ARQ uuids
3835 :param reimage_boot_volume: Boolean to specify whether the user has
3836 explicitly requested to rebuild a boot
3837 volume or None if RPC version is <=6.0
3838 :param target_state: Set a target state for the evacuated instance or
3839 None if RPC version is <=6.1.
3841 """
3842 # recreate=True means the instance is being evacuated from a failed
3843 # host to a new destination host (this host). The 'recreate' variable
3844 # name is confusing, so rename it to evacuate here at the top, which
3845 # is simpler than renaming a parameter in an RPC versioned method.
3846 evacuate = recreate
3847 context = context.elevated()
3849 if evacuate:
3850 LOG.info("Evacuating instance", instance=instance)
3851 else:
3852 LOG.info("Rebuilding instance", instance=instance)
3854 if evacuate:
3855 # This is an evacuation to a new host, so we need to perform a
3856 # resource claim.
3857 rebuild_claim = self.rt.rebuild_claim
3858 else:
3859 # This is a rebuild to the same host, so we don't need to make
3860 # a claim since the instance is already on this host.
3861 rebuild_claim = claims.NopClaim
3863 if image_ref:
3864 image_meta = objects.ImageMeta.from_image_ref(
3865 context, self.image_api, image_ref)
3866 elif evacuate:
3867 # For evacuate the API does not send down the image_ref since the
3868 # image does not change so just get it from what was stashed in
3869 # the instance system_metadata when the instance was created (or
3870 # last rebuilt). This also works for volume-backed instances.
3871 image_meta = instance.image_meta
3872 else:
3873 image_meta = objects.ImageMeta()
3875 # NOTE(mriedem): On an evacuate, we need to update
3877 # the instance's host and node properties to reflect its
3877 # destination node for the evacuate.
3878 if not scheduled_node:
3879 if evacuate:
3880 try:
3881 compute_node = self._get_compute_info(context, self.host)
3882 scheduled_node = compute_node.hypervisor_hostname
3883 except exception.ComputeHostNotFound as e:
3884 # This means we were asked to rebuild one of our own
3885 # instances, or another instance as a target of an
3886 # evacuation, but we are unable to find a matching compute
3887 # node.
3888 LOG.exception('Failed to get compute_info for %s',
3889 self.host)
3890 self._set_migration_status(migration, 'failed')
3891 self._notify_instance_rebuild_error(context, instance, e,
3892 bdms)
3893 raise exception.InstanceFaultRollback(
3894 inner_exception=exception.BuildAbortException(
3895 instance_uuid=instance.uuid,
3896 reason=e.format_message()))
3898 else:
3899 scheduled_node = instance.node
3901 allocs = self.reportclient.get_allocations_for_consumer(
3902 context, instance.uuid)
3904 # If the resource claim or group policy validation fails before we
3905 # do anything to the guest or its networking/volumes we want to keep
3906 # the current status rather than put the instance into ERROR status.
3907 instance_state = instance.vm_state
3908 with self._error_out_instance_on_exception(
3909 context, instance, instance_state=instance_state):
3910 try:
3911 self._do_rebuild_instance_with_claim(
3912 context, instance, orig_image_ref,
3913 image_meta, injected_files, new_pass, orig_sys_metadata,
3914 bdms, evacuate, on_shared_storage, preserve_ephemeral,
3915 migration, request_spec, allocs, rebuild_claim,
3916 scheduled_node, limits, accel_uuids, reimage_boot_volume,
3917 target_state)
3918 except (exception.ComputeResourcesUnavailable,
3919 exception.RescheduledException) as e:
3920 if isinstance(e, exception.ComputeResourcesUnavailable):
3921 LOG.debug("Could not rebuild instance on this host, not "
3922 "enough resources available.", instance=instance)
3923 else:
3924 # RescheduledException is raised by the late server group
3925 # policy check during evacuation if a parallel scheduling
3926 # violated the policy.
3927 # We catch the RescheduledException here but we don't have
3928 # the plumbing to do an actual reschedule so we abort the
3929 # operation.
3930 LOG.debug("Could not rebuild instance on this host, "
3931 "late server group check failed.",
3932 instance=instance)
3933 # NOTE(ndipanov): We just abort the build for now and leave a
3934 # migration record for potential cleanup later
3935 self._set_migration_status(migration, 'failed')
3936 # Since the claim failed, we need to remove the allocation
3937 # created against the destination node. Note that we can only
3938 # get here when evacuating to a destination node. Rebuilding
3939 # on the same host (not evacuate) uses the NopClaim which will
3940 # not raise ComputeResourcesUnavailable.
3941 self.rt.delete_allocation_for_evacuated_instance(
3942 context, instance, scheduled_node, node_type='destination')
3943 self._notify_instance_rebuild_error(context, instance, e, bdms)
3944 # Wrap this in InstanceFaultRollback so that the
3945 # _error_out_instance_on_exception context manager keeps the
3946 # vm_state unchanged.
3947 raise exception.InstanceFaultRollback(
3948 inner_exception=exception.BuildAbortException(
3949 instance_uuid=instance.uuid,
3950 reason=e.format_message()))
3951 except (exception.InstanceNotFound,
3952 exception.UnexpectedDeletingTaskStateError) as e:
3953 LOG.debug('Instance was deleted while rebuilding',
3954 instance=instance)
3955 self._set_migration_status(migration, 'failed')
3956 self._notify_instance_rebuild_error(context, instance, e, bdms)
3957 except Exception as e:
3958 self._set_migration_status(migration, 'failed')
3959 if evacuate or scheduled_node is not None:
3960 self.rt.delete_allocation_for_evacuated_instance(
3961 context, instance, scheduled_node,
3962 node_type='destination')
3963 self._notify_instance_rebuild_error(context, instance, e, bdms)
3964 raise
3965 else:
3966 # NOTE(gibi): Let the resource tracker set the instance
3967 # host and drop the migration context as we need to hold the
3968 # COMPUTE_RESOURCE_SEMAPHORE to avoid the race with
3969 # _update_available_resources. See bug 1896463.
3970 self.rt.finish_evacuation(instance, scheduled_node, migration)
3972 def _do_rebuild_instance_with_claim(
3973 self, context, instance, orig_image_ref, image_meta,
3974 injected_files, new_pass, orig_sys_metadata, bdms, evacuate,
3975 on_shared_storage, preserve_ephemeral, migration, request_spec,
3976 allocations, rebuild_claim, scheduled_node, limits, accel_uuids,
3977 reimage_boot_volume, target_state):
3978 """Helper to avoid deep nesting in the top-level method."""
3980 provider_mapping = None
3981 if evacuate:
3982 provider_mapping = self._get_request_group_mapping(request_spec)
3984 if provider_mapping:
3985 compute_utils.update_pci_request_with_placement_allocations(
3986 context,
3987 self.reportclient,
3988 instance.pci_requests.requests,
3989 provider_mapping,
3990 )
3992 claim_context = rebuild_claim(
3993 context, instance, scheduled_node, allocations,
3994 limits=limits, image_meta=image_meta, migration=migration)
3996 with claim_context:
3997 self._do_rebuild_instance(
3998 context, instance, orig_image_ref, image_meta, injected_files,
3999 new_pass, orig_sys_metadata, bdms, evacuate, on_shared_storage,
4000 preserve_ephemeral, migration, request_spec, allocations,
4001 provider_mapping, accel_uuids, reimage_boot_volume,
4002 target_state)
4004 @staticmethod
4005 def _get_image_name(image_meta):
4006 if image_meta.obj_attr_is_set("name"):
4007 return image_meta.name
4008 else:
4009 return ''
4011 def _do_rebuild_instance(
4012 self, context, instance, orig_image_ref, image_meta,
4013 injected_files, new_pass, orig_sys_metadata, bdms, evacuate,
4014 on_shared_storage, preserve_ephemeral, migration, request_spec,
4015 allocations, request_group_resource_providers_mapping,
4016 accel_uuids, reimage_boot_volume, target_state):
4017 orig_vm_state = instance.vm_state
4019 if evacuate:
4020 if target_state and orig_vm_state != vm_states.ERROR:  [4020 ↛ 4023: line 4020 didn't jump to line 4023 because the condition on line 4020 was never true]
4021 # This will ensure that at destination the instance will have
4022 # the desired state.
4023 if target_state not in vm_states.ALLOW_TARGET_STATES:
4024 raise exception.InstanceEvacuateNotSupportedTargetState(
4025 target_state=target_state)
4026 orig_vm_state = target_state
4028 if request_spec:
4029 # NOTE(gibi): Do a late check of server group policy as
4030 # parallel scheduling could violate such policy. This will
4031 # cause the evacuate to fail as rebuild does not implement
4032 # reschedule.
4033 hints = self._get_scheduler_hints({}, request_spec)
4034 self._validate_instance_group_policy(context, instance, hints)
4036 if not self.driver.capabilities.get("supports_evacuate", False):
4037 raise exception.InstanceEvacuateNotSupported
4039 self._check_instance_exists(instance)
4041 if on_shared_storage is None:
4042 LOG.debug('on_shared_storage is not provided, using driver '
4043 'information to decide if the instance needs to '
4044 'be evacuated')
4045 on_shared_storage = self.driver.instance_on_disk(instance)
4047 elif (on_shared_storage !=
4048 self.driver.instance_on_disk(instance)):
4049 # To cover the case when the admin expects that the instance
4050 # files are on shared storage but are not accessible, and vice versa
4051 raise exception.InvalidSharedStorage(
4052 _("Invalid state of instance files on shared"
4053 " storage"))
4055 if on_shared_storage:
4056 LOG.info('disk on shared storage, evacuating using'
4057 ' existing disk')
4058 elif instance.image_ref:
4059 orig_image_ref = instance.image_ref
4060 LOG.info("disk not on shared storage, evacuating from "
4061 "image: '%s'", str(orig_image_ref))
4062 else:
4063 LOG.info('disk on volume, evacuating using existing '
4064 'volume')
4066 # We check trusted certs capabilities for both evacuate (rebuild on
4067 # another host) and rebuild (rebuild on the same host) because for
4068 # evacuate we need to make sure an instance with trusted certs can
4069 # have the image verified with those certs during rebuild, and for
4070 # rebuild we could be rebuilding a server that started out with no
4071 # trusted certs on this host, and then was rebuilt with trusted certs
4072 # for a new image, in which case we need to validate that new image
4073 # with the trusted certs during the rebuild.
4074 self._check_trusted_certs(instance)
4076 # This instance.exists message should contain the original
4077 # image_ref, not the new one. Since the DB has been updated
4078 # to point to the new one... we have to override it.
4079 orig_image_ref_url = self.image_api.generate_image_url(orig_image_ref,
4080 context)
4081 extra_usage_info = {'image_ref_url': orig_image_ref_url}
4082 compute_utils.notify_usage_exists(
4083 self.notifier, context, instance, self.host,
4084 current_period=True, system_metadata=orig_sys_metadata,
4085 extra_usage_info=extra_usage_info)
4087 # This message should contain the new image_ref
4088 extra_usage_info = {'image_name': self._get_image_name(image_meta)}
4089 self._notify_about_instance_usage(context, instance,
4090 "rebuild.start", extra_usage_info=extra_usage_info)
4091 # NOTE: image_name is not included in the versioned notification
4092 # because we already provide the image_uuid in the notification
4093 # payload and the image details can be looked up via the uuid.
4094 compute_utils.notify_about_instance_rebuild(
4095 context, instance, self.host,
4096 phase=fields.NotificationPhase.START,
4097 bdms=bdms)
4099 instance.power_state = self._get_power_state(instance)
4100 instance.task_state = task_states.REBUILDING
4101 instance.save(expected_task_state=[task_states.REBUILDING])
4103 if evacuate:
4104 self.network_api.setup_networks_on_host(
4105 context, instance, self.host)
4106 # For nova-network this is needed to move floating IPs
4107 # For neutron this updates the host in the port binding
4108 # TODO(cfriesen): this network_api call and the one above
4109 # are so similar, we should really try to unify them.
4110 self.network_api.setup_instance_network_on_host(
4111 context, instance, self.host, migration,
4112 provider_mappings=request_group_resource_providers_mapping)
4113 # TODO(mriedem): Consider decorating setup_instance_network_on_host
4114 # with @api.refresh_cache and then we wouldn't need this explicit
4115 # call to get_instance_nw_info.
4116 network_info = self.network_api.get_instance_nw_info(context,
4117 instance)
4118 else:
4119 network_info = instance.get_network_info()
4121 if bdms is None:
4122 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
4123 context, instance.uuid)
4125 block_device_info = \
4126 self._get_instance_block_device_info(
4127 context, instance, bdms=bdms)
4129 def detach_block_devices(context, bdms, detach_root_bdm=True):
4130 for bdm in bdms:
4131 # Previously, the calls made to this method by the rebuild
4132 # instance operation were for image-backed instances, which
4133 # assumed we only had attached volumes and no root BDM.
4134 # Now we need to handle the case of the root BDM, which we
4135 # manage manually, so we skip the attachment create/delete
4136 # calls from here.
4137 # The detach_root_bdm parameter is only passed while
4138 # rebuilding a volume-backed instance, so we don't have
4139 # to worry about other callers as they won't satisfy this
4140 # condition.
4141 # For the evacuate case, detach_root_bdm is always True
4142 # since we don't have the reimage_boot_volume parameter in
4143 # that case, so this branch will not be executed.
4144 if not detach_root_bdm and bdm.is_root:  [4144 ↛ 4145: line 4144 didn't jump to line 4145 because the condition on line 4144 was never true]
4145 continue
4146 if bdm.is_volume:  [4146 ↛ 4130: line 4146 didn't jump to line 4130 because the condition on line 4146 was always true]
4147 # NOTE (ildikov): Having the attachment_id set in the BDM
4148 # means that it's the new Cinder attach/detach flow
4149 # (available from v3.44). In that case we explicitly
4150 # attach and detach the volumes through attachment level
4151 # operations. In this scenario _detach_volume will delete
4152 # the existing attachment which would make the volume
4153 # status change to 'available' if we don't pre-create
4154 # another empty attachment before deleting the old one.
4155 attachment_id = None
4156 if bdm.attachment_id:  [4156 ↛ 4157: line 4156 didn't jump to line 4157 because the condition on line 4156 was never true]
4157 attachment_id = self.volume_api.attachment_create(
4158 context, bdm['volume_id'], instance.uuid)['id']
4159 self._detach_volume(context, bdm, instance,
4160 destroy_bdm=False)
4161 if attachment_id:  [4161 ↛ 4162: line 4161 didn't jump to line 4162 because the condition on line 4161 was never true]
4162 bdm.attachment_id = attachment_id
4163 bdm.save()
4165 files = self._decode_files(injected_files)
4167 kwargs = dict(
4168 context=context,
4169 instance=instance,
4170 image_meta=image_meta,
4171 injected_files=files,
4172 admin_password=new_pass,
4173 allocations=allocations,
4174 bdms=bdms,
4175 detach_block_devices=detach_block_devices,
4176 attach_block_devices=self._prep_block_device,
4177 block_device_info=block_device_info,
4178 network_info=network_info,
4179 preserve_ephemeral=preserve_ephemeral,
4180 evacuate=evacuate,
4181 accel_uuids=accel_uuids,
4182 reimage_boot_volume=reimage_boot_volume)
4183 try:
4184 with instance.mutated_migration_context():
4185 self.driver.rebuild(**kwargs)
4186 except NotImplementedError:
4187 # NOTE(rpodolyaka): driver doesn't provide specialized version
4188 # of rebuild, fall back to the default implementation
4189 self._rebuild_default_impl(**kwargs)
4190 self._update_instance_after_spawn(instance)
4191 instance.save(expected_task_state=[task_states.REBUILD_SPAWNING])
4193 if orig_vm_state == vm_states.STOPPED:
4194 LOG.info("bringing vm to original state: '%s'",
4195 orig_vm_state, instance=instance)
4196 instance.vm_state = vm_states.ACTIVE
4197 instance.task_state = task_states.POWERING_OFF
4198 instance.progress = 0
4199 instance.save()
4200 self.stop_instance(context, instance, False)
4201 # TODO(melwitt): We should clean up instance console tokens here in the
4202 # case of evacuate. The instance is on a new host and will need to
4203 # establish a new console connection.
4204 self._update_scheduler_instance_info(context, instance)
4205 self._notify_about_instance_usage(
4206 context, instance, "rebuild.end",
4207 network_info=network_info,
4208 extra_usage_info=extra_usage_info)
4209 compute_utils.notify_about_instance_rebuild(
4210 context, instance, self.host,
4211 phase=fields.NotificationPhase.END,
4212 bdms=bdms)
4214 def _handle_bad_volumes_detached(self, context, instance, bad_devices,
4215 block_device_info):
4216 """Handle cases where the virt-layer had to detach non-working volumes
4217 in order to complete an operation.
4218 """
4219 for bdm in block_device_info['block_device_mapping']:
4220 if bdm.get('mount_device') in bad_devices:
4221 try:
4222 volume_id = bdm['connection_info']['data']['volume_id']
4223 except KeyError:
4224 continue
4226 # NOTE(sirp): ideally we'd just call
4227 # `compute_api.detach_volume` here but since that hits the
4228 # DB directly, that's off limits from within the
4229 # compute-manager.
4230 #
4231 # API-detach
4232 LOG.info("Detaching from volume api: %s", volume_id)
4233 self.volume_api.begin_detaching(context, volume_id)
4235 # Manager-detach
4236 self.detach_volume(context, volume_id, instance)
4238 def _get_accel_info(self, context, instance):
4239 dp_name = instance.flavor.extra_specs.get('accel:device_profile')
4240 if dp_name:
4241 cyclient = cyborg.get_client(context)
4242 accel_info = cyclient.get_arqs_for_instance(instance.uuid)
4243 else:
4244 accel_info = []
4245 return accel_info
4247 def _delete_dangling_bdms(self, context, instance, bdms):
4248 """Deletes dangling or stale attachments for volume from
4249 Nova and Cinder DB so both service DBs can be in sync.
4251 Retrieves volume attachments from the Nova block_device_mapping
4252 table and verifies them with the Cinder volume_attachment table.
4253 If attachment is not present in any one of the DBs, delete
4254 attachments from the other DB.
4256 :param context: The nova request context.
4257 :param instance: instance object.
4258 :param bdms: BlockDeviceMappingList list object.
4259 """
4261 try:
4262 cinder_attachments = self.volume_api.attachment_get_all(
4263 context, instance.uuid)
4264 except (keystone_exception.EndpointNotFound,
4265 cinder_exception.ClientException):
4266 # if cinder is not deployed we never need to check for
4267 # attachments as there cannot be dangling bdms.
4268 # if we cannot connect to cinder we cannot check for dangling
4269 # bdms so we skip the check. Intermittent connection issues
4270 # to cinder should not cause instance reboot to fail.
4271 return
4273 # attachments present in nova DB, ones nova knows about
4274 nova_attachments = []
4275 bdms_to_delete = []
4276 for bdm in bdms.objects:
4277 if bdm.volume_id and bdm.attachment_id:
4278 try:
4279 self.volume_api.attachment_get(context, bdm.attachment_id)
4280 except exception.VolumeAttachmentNotFound:
4281 LOG.info(
4282 f"Removing stale volume attachment "
4283 f"'{bdm.attachment_id}' from instance for "
4284 f"volume '{bdm.volume_id}'.", instance=instance)
4285 bdm.destroy()
4286 bdms_to_delete.append(bdm)
4287 else:
4288 nova_attachments.append(bdm.attachment_id)
4290 cinder_attachments = [each['id'] for each in cinder_attachments]
4292 if len(set(cinder_attachments) - set(nova_attachments)):
4293 LOG.info(
4294 "Removing stale volume attachments of instance from "
4295 "Cinder", instance=instance)
4296 for each_attach in set(cinder_attachments) - set(nova_attachments):
4297 # delete only cinder known attachments, from cinder DB.
4298 LOG.debug(
4299 f"Removing attachment '{each_attach}'", instance=instance)
4300 self.volume_api.attachment_delete(context, each_attach)
4302 # refresh bdms object
4303 for bdm in bdms_to_delete:
4304 bdms.objects.remove(bdm)
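# --- Illustrative aside (not part of the module) ---
# The reconciliation above boils down to a two-way comparison of the
# attachment ids each service knows about: Nova records whose attachment
# vanished from Cinder are destroyed, and Cinder attachments Nova no longer
# tracks are deleted. A minimal, self-contained sketch of that decision
# (plain sets, hypothetical helper name, not the module's API):
def _stale_attachments(nova_attachment_ids, cinder_attachment_ids):
    """Return (stale_in_nova, stale_in_cinder) attachment id sets."""
    nova_ids = set(nova_attachment_ids)
    cinder_ids = set(cinder_attachment_ids)
    return nova_ids - cinder_ids, cinder_ids - nova_ids

# 'a2' was removed behind Nova's back; 'a9' is unknown to Nova.
assert _stale_attachments({'a1', 'a2'}, {'a1', 'a9'}) == ({'a2'}, {'a9'})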
4306 def _get_share_info(self, context, instance, check_status=True):
4307 share_info = objects.ShareMappingList(context)
4309 for share_mapping in objects.ShareMappingList.get_by_instance_uuid(
4310 context, instance.uuid
4311 ):
4312 share_info.objects.append(share_mapping)
4314 if check_status:
4315 fsm = fields.ShareMappingStatus
4316 if (
4317 share_mapping.status == fsm.ATTACHING or
4318 share_mapping.status == fsm.DETACHING
4319 ):
4320 # If the share status is attaching it means we are racing
4321 # with the compute node. The mount is not completed yet or
4322 # something really bad happened. So we set the instance in
4323 # error state.
4324 LOG.error(
4325 "Share id '%s' attached to server id '%s' is "
4326 "still in '%s' state. Setting the instance "
4327 "in error.",
4328 share_mapping.share_id,
4329 instance.id,
4330 share_mapping.status,
4331 )
4332 self._set_instance_obj_error_state(
4333 instance, clean_task_state=True
4334 )
4335 raise exception.ShareErrorUnexpectedStatus(
4336 share_id=share_mapping.share_id,
4337 instance_uuid=instance.uuid,
4338 )
4340 if share_mapping.status == fsm.ERROR:
4341 LOG.warning(
4342 "Share id '%s' attached to server id '%s' is in "
4343 "error state.",
4344 share_mapping.share_id,
4345 instance.id
4346 )
4348 return share_info
4350 @wrap_exception()
4351 @reverts_task_state
4352 @wrap_instance_event(prefix='compute')
4353 @wrap_instance_fault
4354 def reboot_instance(self, context, instance, block_device_info,
4355 reboot_type):
4356 @utils.synchronized(instance.uuid)
4357 def do_reboot_instance(context, instance, block_device_info,
4358 reboot_type):
4359 self._reboot_instance(context, instance, block_device_info,
4360 reboot_type)
4361 do_reboot_instance(context, instance, block_device_info, reboot_type)
4363 def _reboot_instance(self, context, instance, block_device_info,
4364 reboot_type):
4365 """Reboot an instance on this host."""
4366 # acknowledge the request made it to the manager
4367 if reboot_type == "SOFT":
4368 instance.task_state = task_states.REBOOT_PENDING
4369 expected_states = task_states.soft_reboot_states
4370 else:
4371 instance.task_state = task_states.REBOOT_PENDING_HARD
4372 expected_states = task_states.hard_reboot_states
4374 context = context.elevated()
4375 LOG.info("Rebooting instance", instance=instance)
4377 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
4378 context, instance.uuid)
4380 self._delete_dangling_bdms(context, instance, bdms)
4382 block_device_info = self._get_instance_block_device_info(
4383 context, instance, bdms=bdms)
4385 network_info = self.network_api.get_instance_nw_info(context, instance)
4387 accel_info = self._get_accel_info(context, instance)
4389 share_info = self._get_share_info(context, instance)
4391 self._notify_about_instance_usage(context, instance, "reboot.start")
4392 compute_utils.notify_about_instance_action(
4393 context, instance, self.host,
4394 action=fields.NotificationAction.REBOOT,
4395 phase=fields.NotificationPhase.START,
4396 bdms=bdms
4397 )
4399 instance.power_state = self._get_power_state(instance)
4400 instance.save(expected_task_state=expected_states)
4402 if instance.power_state != power_state.RUNNING:
4403 state = instance.power_state
4404 running = power_state.RUNNING
4405 LOG.warning('trying to reboot a non-running instance:'
4406 ' (state: %(state)s expected: %(running)s)',
4407 {'state': state, 'running': running},
4408 instance=instance)
4410 def bad_volumes_callback(bad_devices):
4411 self._handle_bad_volumes_detached(
4412 context, instance, bad_devices, block_device_info)
4414 try:
4415 # Don't change it out of rescue mode
4416 if instance.vm_state == vm_states.RESCUED:
4417 new_vm_state = vm_states.RESCUED
4418 else:
4419 new_vm_state = vm_states.ACTIVE
4420 new_power_state = None
4421 if reboot_type == "SOFT":
4422 instance.task_state = task_states.REBOOT_STARTED
4423 expected_state = task_states.REBOOT_PENDING
4424 else:
4425 instance.task_state = task_states.REBOOT_STARTED_HARD
4426 expected_state = task_states.REBOOT_PENDING_HARD
4427 instance.save(expected_task_state=expected_state)
4429 # Attempt to mount the shares again.
4430 # Note: The API ref states that soft reboot can only be
4431 # done if the instance is in ACTIVE state. If the instance
4432 # is in ACTIVE state it cannot have a share_mapping in ERROR
4433 # so it is safe to ignore the re-mounting of the share for
4434 # soft reboot.
4435 if reboot_type == "HARD":
4436 self._mount_all_shares(context, instance, share_info)
4438 self.driver.reboot(context, instance,
4439 network_info,
4440 reboot_type,
4441 block_device_info=block_device_info,
4442 accel_info=accel_info,
4443 share_info=share_info,
4444 bad_volumes_callback=bad_volumes_callback)
4445 share_info.activate_all()
4447 except Exception as error:
4448 with excutils.save_and_reraise_exception() as ctxt:
4449 exc_info = sys.exc_info()
4450 # if the reboot failed but the VM is running don't
4451 # put it into an error state
4452 new_power_state = self._get_power_state(instance)
4453 if new_power_state == power_state.RUNNING:
4454 LOG.warning('Reboot failed but instance is running',
4455 instance=instance)
4456 compute_utils.add_instance_fault_from_exc(context,
4457 instance, error, exc_info)
4458 self._notify_about_instance_usage(context, instance,
4459 'reboot.error', fault=error)
4460 compute_utils.notify_about_instance_action(
4461 context, instance, self.host,
4462 action=fields.NotificationAction.REBOOT,
4463 phase=fields.NotificationPhase.ERROR,
4464 exception=error, bdms=bdms
4465 )
4466 ctxt.reraise = False
4467 else:
4468 LOG.error('Cannot reboot instance: %s', error,
4469 instance=instance)
4470 self._set_instance_obj_error_state(instance)
4472 if not new_power_state:
4473 new_power_state = self._get_power_state(instance)
4474 try:
4475 instance.power_state = new_power_state
4476 instance.vm_state = new_vm_state
4477 instance.task_state = None
4478 instance.save()
4479 except exception.InstanceNotFound:
4480 LOG.warning("Instance disappeared during reboot",
4481 instance=instance)
4483 self._notify_about_instance_usage(context, instance, "reboot.end")
4484 compute_utils.notify_about_instance_action(
4485 context, instance, self.host,
4486 action=fields.NotificationAction.REBOOT,
4487 phase=fields.NotificationPhase.END,
4488 bdms=bdms
4489 )
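# --- Illustrative aside (not part of the module) ---
# The reboot flow above walks the instance through a pair of task states
# chosen by the reboot type: *_PENDING[_HARD] when the request is
# acknowledged, then *_STARTED[_HARD] just before the driver call. A
# minimal sketch of that mapping; the string values are assumptions, the
# real constants live in nova.compute.task_states.
_REBOOT_TASK_STATES = {
    'SOFT': ('reboot_pending', 'reboot_started'),
    'HARD': ('reboot_pending_hard', 'reboot_started_hard'),
}

def _reboot_states(reboot_type):
    return _REBOOT_TASK_STATES['SOFT' if reboot_type == 'SOFT' else 'HARD']

assert _reboot_states('SOFT') == ('reboot_pending', 'reboot_started')
assert _reboot_states('HARD') == ('reboot_pending_hard',
                                  'reboot_started_hard')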
4491 @delete_image_on_error
4492 def _do_snapshot_instance(self, context, image_id, instance):
4493 self._snapshot_instance(context, image_id, instance,
4494 task_states.IMAGE_BACKUP)
4496 @wrap_exception()
4497 @reverts_task_state
4498 @wrap_instance_event(prefix='compute')
4499 @wrap_instance_fault
4500 def backup_instance(self, context, image_id, instance, backup_type,
4501 rotation):
4502 """Backup an instance on this host.
4504 :param backup_type: daily | weekly
4505 :param rotation: int representing how many backups to keep around
4506 """
4507 self._do_snapshot_instance(context, image_id, instance)
4508 self._rotate_backups(context, instance, backup_type, rotation)
4510 @wrap_exception()
4511 @reverts_task_state
4512 @wrap_instance_event(prefix='compute')
4513 @wrap_instance_fault
4514 @delete_image_on_error
4515 def snapshot_instance(self, context, image_id, instance):
4516 """Snapshot an instance on this host.
4518 :param context: security context
4519 :param image_id: glance.db.sqlalchemy.models.Image.Id
4520 :param instance: a nova.objects.instance.Instance object
4521 """
4522 # NOTE(dave-mcnally): the task state will already be set by the API,
4523 # but if the compute manager crashed or was restarted before the
4524 # request got here, the task state may have been cleared, so we set
4525 # it again and things continue normally.
4526 try:
4527 instance.task_state = task_states.IMAGE_SNAPSHOT
4528 instance.save(
4529 expected_task_state=task_states.IMAGE_SNAPSHOT_PENDING)
4530 except exception.InstanceNotFound:
4531 # possibly the instance no longer exists; no point in continuing
4532 LOG.debug("Instance not found, could not set state %s "
4533 "for instance.",
4534 task_states.IMAGE_SNAPSHOT, instance=instance)
4535 return
4537 except exception.UnexpectedDeletingTaskStateError:
4538 LOG.debug("Instance being deleted, snapshot cannot continue",
4539 instance=instance)
4540 return
4542 with self._snapshot_semaphore:
4543 self._snapshot_instance(context, image_id, instance,
4544 task_states.IMAGE_SNAPSHOT)
4546 def _snapshot_instance(self, context, image_id, instance,
4547 expected_task_state):
4548 context = context.elevated()
4550 instance.power_state = self._get_power_state(instance)
4551 try:
4552 instance.save()
4554 LOG.info('instance snapshotting', instance=instance)
4556 if instance.power_state != power_state.RUNNING:
4557 state = instance.power_state
4558 running = power_state.RUNNING
4559 LOG.warning('trying to snapshot a non-running instance: '
4560 '(state: %(state)s expected: %(running)s)',
4561 {'state': state, 'running': running},
4562 instance=instance)
4564 self._notify_about_instance_usage(
4565 context, instance, "snapshot.start")
4566 compute_utils.notify_about_instance_snapshot(context, instance,
4567 self.host, phase=fields.NotificationPhase.START,
4568 snapshot_image_id=image_id)
4570 def update_task_state(task_state,
4571 expected_state=expected_task_state):
4572 instance.task_state = task_state
4573 instance.save(expected_task_state=expected_state)
4575 with timeutils.StopWatch() as timer:
4576 self.driver.snapshot(context, instance, image_id,
4577 update_task_state)
4578 LOG.info('Took %0.2f seconds to snapshot the instance on '
4579 'the hypervisor.', timer.elapsed(), instance=instance)
4581 instance.task_state = None
4582 instance.save(expected_task_state=task_states.IMAGE_UPLOADING)
4584 self._notify_about_instance_usage(context, instance,
4585 "snapshot.end")
4586 compute_utils.notify_about_instance_snapshot(context, instance,
4587 self.host, phase=fields.NotificationPhase.END,
4588 snapshot_image_id=image_id)
4589 except (exception.InstanceNotFound,
4590 exception.InstanceNotRunning,
4591 exception.UnexpectedDeletingTaskStateError):
4592 # the instance got deleted during the snapshot
4593 # Quickly bail out of here
4594 msg = 'Instance disappeared during snapshot'
4595 LOG.debug(msg, instance=instance)
4596 try:
4597 image = self.image_api.get(context, image_id)
4598 if image['status'] != 'active':
4599 self.image_api.delete(context, image_id)
4600 except exception.ImageNotFound:
4601 LOG.debug('Image not found during clean up %s', image_id)
4602 except Exception:
4603 LOG.warning("Error while trying to clean up image %s",
4604 image_id, instance=instance)
4605 except exception.ImageNotFound:
4606 instance.task_state = None
4607 instance.save()
4608 LOG.warning("Image not found during snapshot", instance=instance)
4610 @messaging.expected_exceptions(NotImplementedError)
4611 @wrap_exception()
4612 def volume_snapshot_create(self, context, instance, volume_id,
4613 create_info):
4614 try:
4615 self.driver.volume_snapshot_create(context, instance, volume_id,
4616 create_info)
4617 except exception.InstanceNotRunning:
4618 # Libvirt driver can raise this exception
4619 LOG.debug('Instance disappeared during volume snapshot create',
4620 instance=instance)
4622 @messaging.expected_exceptions(NotImplementedError)
4623 @wrap_exception()
4624 def volume_snapshot_delete(self, context, instance, volume_id,
4625 snapshot_id, delete_info):
4626 try:
4627 self.driver.volume_snapshot_delete(context, instance, volume_id,
4628 snapshot_id, delete_info)
4629 except exception.InstanceNotRunning:
4630 # Libvirt driver can raise this exception
4631 LOG.debug('Instance disappeared during volume snapshot delete',
4632 instance=instance)
4634 @messaging.expected_exceptions(NotImplementedError)
4635 @wrap_exception()
4636 @wrap_instance_event(prefix='compute')
4637 @wrap_instance_fault
4638 def allow_share(self, context, instance, share_mapping):
4640 @utils.synchronized(share_mapping.share_id)
4641 def _allow_share(context, instance, share_mapping):
4642 def _apply_policy():
4643 # self.manila_api.lock(share_mapping.share_id)
4644 # Explicitly locking the share is not needed as
4645 # create_access_rule() from the sdk will do it if the
4646 # lock_visibility and lock_deletion flags are passed
4647 self.manila_api.allow(
4648 context,
4649 share_mapping.share_id,
4650 share_mapping.access_type,
4651 share_mapping.access_to,
4652 "rw",
4653 )
4655 def _wait_policy_to_be_applied():
4656 # Ensure the share policy has actually been applied; this avoids
4657 # a race condition when mounting the share too early.
4658 max_retries = CONF.manila.share_apply_policy_timeout
4659 attempt_count = 0
4660 while attempt_count < max_retries:
4661 if self.manila_api.has_access(
4662 context,
4663 share_mapping.share_id,
4664 share_mapping.access_type,
4665 share_mapping.access_to,
4666 ):
4667 LOG.debug(
4668 "Allow policy set on share %s ",
4669 share_mapping.share_id,
4670 )
4671 break
4672 else:
4673 LOG.debug(
4674 "Waiting policy to be set on share %s ",
4675 share_mapping.share_id,
4676 )
4677 time.sleep(1)
4678 attempt_count += 1
4680 if attempt_count >= max_retries:
4681 raise exception.ShareAccessGrantError(
4682 share_id=share_mapping.share_id,
4683 reason="Failed to set allow policy on share, "
4684 "too many retries",
4685 )
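# --- Illustrative aside (not part of the module) ---
# _wait_policy_to_be_applied above is a bounded poll: re-check a predicate
# once per interval until it holds or the retry budget is exhausted, then
# fail. A generic, self-contained sketch of that pattern (hypothetical
# names, no Manila/SDK calls):
import time

def _wait_for(predicate, max_retries, interval=1.0):
    """Return True once predicate() holds, False when retries run out."""
    for _ in range(max_retries):
        if predicate():
            return True
        time.sleep(interval)
    return False

# Example: poll a flag that a backend would normally flip asynchronously.
_state = {'applied': True}
assert _wait_for(lambda: _state['applied'], max_retries=3, interval=0)
assert not _wait_for(lambda: False, max_retries=2, interval=0)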
4687 try:
4688 compute_utils.notify_about_share_attach_detach(
4689 context,
4690 instance,
4691 instance.host,
4692 action=fields.NotificationAction.SHARE_ATTACH,
4693 phase=fields.NotificationPhase.START,
4694 share_id=share_mapping.share_id
4695 )
4697 share_mapping.set_access_according_to_protocol()
4699 if not self.manila_api.has_access(
4700 context,
4701 share_mapping.share_id,
4702 share_mapping.access_type,
4703 share_mapping.access_to,
4704 ):
4705 _apply_policy()
4706 _wait_policy_to_be_applied()
4708 # Set the share from attaching to inactive
4709 self._set_share_mapping_status(
4710 share_mapping, fields.ShareMappingStatus.INACTIVE
4711 )
4713 compute_utils.notify_about_share_attach_detach(
4714 context,
4715 instance,
4716 instance.host,
4717 action=fields.NotificationAction.SHARE_ATTACH,
4718 phase=fields.NotificationPhase.END,
4719 share_id=share_mapping.share_id
4720 )
4722 except (
4723 exception.ShareNotFound,
4724 exception.ShareProtocolNotSupported,
4725 exception.ShareAccessGrantError,
4726 ) as e:
4727 self._set_share_mapping_status(
4728 share_mapping, fields.ShareMappingStatus.ERROR
4729 )
4730 compute_utils.notify_about_share_attach_detach(
4731 context,
4732 instance,
4733 instance.host,
4734 action=fields.NotificationAction.SHARE_ATTACH,
4735 phase=fields.NotificationPhase.ERROR,
4736 share_id=share_mapping.share_id,
4737 exception=e
4738 )
4739 LOG.error(e.format_message())
4740 raise
4741 except (
4742 sdk_exc.BadRequestException,
4743 ) as e:
4744 self._set_share_mapping_status(
4745 share_mapping, fields.ShareMappingStatus.ERROR
4746 )
4747 compute_utils.notify_about_share_attach_detach(
4748 context,
4749 instance,
4750 instance.host,
4751 action=fields.NotificationAction.SHARE_ATTACH,
4752 phase=fields.NotificationPhase.ERROR,
4753 share_id=share_mapping.share_id,
4754 exception=e
4755 )
4756 LOG.error(
4757 "%s: %s error from url: %s, %s",
4758 e.message,
4759 e.source,
4760 e.url,
4761 e.details,
4762 )
4763 raise
4764 except keystone_exception.http.Unauthorized as e:
4765 self._set_share_mapping_status(
4766 share_mapping, fields.ShareMappingStatus.ERROR
4767 )
4768 compute_utils.notify_about_share_attach_detach(
4769 context,
4770 instance,
4771 instance.host,
4772 action=fields.NotificationAction.SHARE_ATTACH,
4773 phase=fields.NotificationPhase.ERROR,
4774 share_id=share_mapping.share_id,
4775 exception=e
4776 )
4777 LOG.error(e)
4778 raise
4780 _allow_share(context, instance, share_mapping)
4782 @messaging.expected_exceptions(NotImplementedError)
4783 @wrap_exception()
4784 @wrap_instance_event(prefix='compute')
4785 @wrap_instance_fault
4786 def deny_share(self, context, instance, share_mapping):
4788 @utils.synchronized(share_mapping.share_id)
4789 def _deny_share(context, instance, share_mapping):
4791 def check_share_usage(context, instance_uuid):
4792 share_mappings_used_by_share = (
4793 objects.share_mapping.ShareMappingList.get_by_share_id(
4794 context, share_mapping.share_id
4795 )
4796 )
4798 # Logic explanation:
4799 #
4800 # Warning: here we have the list of share_mappings using our
4801 # share (usually share_mappings is the list of share_mappings
4802 # used by an instance).
4803 # A share IS NOT used (i.e. is detachable) if:
4804 # - Its status is INACTIVE or ERROR on our instance.
4805 # - Its status is DETACHING on all other instances.
4806 # +-- reverse the logic as the function checks whether a share
4807 # | IS used.
4808 # v
4809 return not all(
4810 (
4811 (
4812 sm.instance_uuid == instance_uuid and
4813 (
4814 sm.status
4815 in (
4816 fields.ShareMappingStatus.INACTIVE,
4817 fields.ShareMappingStatus.ERROR,
4818 )
4819 )
4820 ) or
4821 sm.status == fields.ShareMappingStatus.DETACHING
4822 )
4823 for sm in share_mappings_used_by_share
4824 )
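# --- Illustrative aside (not part of the module) ---
# The expression above reads more easily as a standalone predicate: the
# share is still in use unless every mapping is either ours and in
# INACTIVE/ERROR, or in DETACHING. A self-contained sketch using plain
# (instance_uuid, status) tuples instead of ShareMapping objects;
# illustrative only.
def _share_still_used(mappings, our_uuid):
    detachable = ('inactive', 'error')
    return not all(
        (uuid == our_uuid and status in detachable) or status == 'detaching'
        for uuid, status in mappings
    )

# Ours is inactive and the other instance is detaching: no longer used.
assert not _share_still_used(
    [('i-1', 'inactive'), ('i-2', 'detaching')], our_uuid='i-1')
# Another instance still has the share active: still used.
assert _share_still_used(
    [('i-1', 'inactive'), ('i-2', 'active')], our_uuid='i-1')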
4826 try:
4827 compute_utils.notify_about_share_attach_detach(
4828 context,
4829 instance,
4830 instance.host,
4831 action=fields.NotificationAction.SHARE_DETACH,
4832 phase=fields.NotificationPhase.START,
4833 share_id=share_mapping.share_id,
4834 )
4836 still_used = check_share_usage(context, instance.uuid)
4838 share_mapping.set_access_according_to_protocol()
4840 if not still_used:
4841 # self.manila_api.unlock(share_mapping.share_id)
4842 # Explicit unlocking the share is not needed as
4843 # delete_access_rule() from the sdk will do it if the
4844 # "unrestrict" parameter is passed
4845 self.manila_api.deny(
4846 context,
4847 share_mapping.share_id,
4848 share_mapping.access_type,
4849 share_mapping.access_to,
4850 )
4852 share_mapping.delete()
4854 compute_utils.notify_about_share_attach_detach(
4855 context,
4856 instance,
4857 instance.host,
4858 action=fields.NotificationAction.SHARE_DETACH,
4859 phase=fields.NotificationPhase.END,
4860 share_id=share_mapping.share_id,
4861 )
4863 except (
4864 exception.ShareAccessRemovalError,
4865 exception.ShareProtocolNotSupported,
4866 ) as e:
4867 self._set_share_mapping_status(
4868 share_mapping, fields.ShareMappingStatus.ERROR
4869 )
4870 compute_utils.notify_about_share_attach_detach(
4871 context,
4872 instance,
4873 instance.host,
4874 action=fields.NotificationAction.SHARE_DETACH,
4875 phase=fields.NotificationPhase.ERROR,
4876 share_id=share_mapping.share_id,
4877 exception=e
4878 )
4879 LOG.error(e.format_message())
4880 raise
4881 except keystone_exception.http.Unauthorized as e:
4882 self._set_share_mapping_status(
4883 share_mapping, fields.ShareMappingStatus.ERROR
4884 )
4885 compute_utils.notify_about_share_attach_detach(
4886 context,
4887 instance,
4888 instance.host,
4889 action=fields.NotificationAction.SHARE_DETACH,
4890 phase=fields.NotificationPhase.ERROR,
4891 share_id=share_mapping.share_id,
4892 exception=e
4893 )
4894 LOG.error(e)
4895 raise
4896 except (exception.ShareNotFound, exception.ShareAccessNotFound):
4897 # Ignore the error if for any reason there is nothing to
4898 # remove from manila, so we can still detach the share.
4899 share_mapping.delete()
4900 compute_utils.notify_about_share_attach_detach(
4901 context,
4902 instance,
4903 instance.host,
4904 action=fields.NotificationAction.SHARE_DETACH,
4905 phase=fields.NotificationPhase.END,
4906 share_id=share_mapping.share_id,
4907 )
4909 _deny_share(context, instance, share_mapping)
4911 @wrap_exception()
4912 def _mount_all_shares(self, context, instance, share_info):
4913 for share_mapping in share_info:
4914 self._mount_share(context, instance, share_mapping)
4916 @wrap_exception()
4917 def _umount_all_shares(self, context, instance, share_info):
4918 for share_mapping in share_info:
4919 self._umount_share(context, instance, share_mapping)
4921 @wrap_exception()
4922 def _mount_share(self, context, instance, share_mapping):
4924 @utils.synchronized(share_mapping.share_id)
4925 def _mount_share(context, instance, share_mapping):
4926 try:
4927 share_mapping.set_access_according_to_protocol()
4929 if share_mapping.share_proto == (
4930 fields.ShareMappingProto.CEPHFS):
4931 share_mapping.enhance_with_ceph_credentials(context)
4933 LOG.debug("Mounting share %s", share_mapping.share_id)
4934 self.driver.mount_share(context, instance, share_mapping)
4936 except (
4937 exception.ShareNotFound,
4938 exception.ShareProtocolNotSupported,
4939 exception.ShareMountError,
4940 ) as e:
4941 self._set_share_mapping_and_instance_in_error(
4942 instance, share_mapping
4943 )
4944 LOG.error(e.format_message())
4945 raise
4946 except (sdk_exc.BadRequestException) as e:
4947 self._set_share_mapping_and_instance_in_error(
4948 instance, share_mapping
4949 )
4950 LOG.error("%s: %s error from url: %s, %s", e.message, e.source,
4951 e.url, e.details)
4952 raise
4954 _mount_share(context, instance, share_mapping)
4956 @wrap_exception()
4957 def _umount_share(self, context, instance, share_mapping):
4959 @utils.synchronized(share_mapping.share_id)
4960 def _umount_share(context, instance, share_mapping):
4961 try:
4962 share_mapping.set_access_according_to_protocol()
4964 if share_mapping.share_proto == (
4965 fields.ShareMappingProto.CEPHFS):
4966 share_mapping.enhance_with_ceph_credentials(context)
4968 self.driver.umount_share(context, instance, share_mapping)
4970 except (
4971 exception.ShareNotFound,
4972 exception.ShareUmountError,
4973 exception.ShareProtocolNotSupported,
4974 ) as e:
4975 self._set_share_mapping_and_instance_in_error(
4976 instance, share_mapping
4977 )
4978 LOG.error(e.format_message())
4979 raise
4981 _umount_share(context, instance, share_mapping)
4983 def _set_share_mapping_status(self, share_mapping, status):
4984 share_mapping.status = status
4985 share_mapping.save()
4987 def _set_share_mapping_and_instance_in_error(
4988 self, instance, share_mapping
4989 ):
4990 share_mapping.status = fields.ShareMappingStatus.ERROR
4991 share_mapping.save()
4992 self._set_instance_obj_error_state(
4993 instance, clean_task_state=True
4994 )
4996 @wrap_instance_fault
4997 def _rotate_backups(self, context, instance, backup_type, rotation):
4998 """Delete excess backups associated to an instance.
5000 Instances are allowed a fixed number of backups (the rotation number);
5001 this method deletes the oldest backups that exceed the rotation
5002 threshold.
5004 :param context: security context
5005 :param instance: Instance dict
5006 :param backup_type: a user-defined type, like "daily" or "weekly" etc.
5007 :param rotation: int representing how many backups to keep around;
5008 None if rotation shouldn't be used (as in the case of snapshots)
5009 """
5010 filters = {'property-image_type': 'backup',
5011 'property-backup_type': backup_type,
5012 'property-instance_uuid': instance.uuid}
5014 images = self.image_api.get_all(context, filters=filters,
5015 sort_key='created_at', sort_dir='desc')
5016 num_images = len(images)
5017 LOG.debug("Found %(num_images)d images (rotation: %(rotation)d)",
5018 {'num_images': num_images, 'rotation': rotation},
5019 instance=instance)
5021 if num_images > rotation:
5022 # NOTE(sirp): this deletes all backups that exceed the rotation
5023 # limit
5024 excess = len(images) - rotation
5025 LOG.debug("Rotating out %d backups", excess,
5026 instance=instance)
5027 for i in range(excess):
5028 image = images.pop()
5029 image_id = image['id']
5030 LOG.debug("Deleting image %s", image_id,
5031 instance=instance)
5032 try:
5033 self.image_api.delete(context, image_id)
5034 except exception.ImageNotFound:
5035 LOG.info("Failed to find image %(image_id)s to "
5036 "delete", {'image_id': image_id},
5037 instance=instance)
5038 except (exception.ImageDeleteConflict, Exception) as exc:
5039 LOG.info("Failed to delete image %(image_id)s during "
5040 "deleting excess backups. "
5041 "Continuing for next image.. %(exc)s",
5042 {'image_id': image_id, 'exc': exc},
5043 instance=instance)
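# --- Illustrative aside (not part of the module) ---
# The rotation above keeps the newest `rotation` backups and removes the
# rest, oldest first; the image query returns results sorted newest-first.
# A self-contained sketch of the selection step using plain dicts; the
# helper name is hypothetical, not the image API.
def _backups_to_delete(images_newest_first, rotation):
    """Return ids of backups that fall outside the rotation window."""
    excess = len(images_newest_first) - rotation
    if excess <= 0:
        return []
    # The oldest entries sit at the tail of a newest-first list.
    return [image['id'] for image in images_newest_first[-excess:]]

_images = [{'id': 'img-3'}, {'id': 'img-2'}, {'id': 'img-1'}]  # newest first
assert _backups_to_delete(_images, rotation=2) == ['img-1']
assert _backups_to_delete(_images, rotation=5) == []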
5045 @wrap_exception()
5046 @reverts_task_state
5047 @wrap_instance_event(prefix='compute')
5048 @wrap_instance_fault
5049 def set_admin_password(self, context, instance, new_pass):
5050 """Set the root/admin password for an instance on this host.
5052 This is generally only called by API password resets after an
5053 image has been built.
5055 :param context: Nova auth context.
5056 :param instance: Nova instance object.
5057 :param new_pass: The admin password for the instance.
5058 """
5060 context = context.elevated()
5061 current_power_state = self._get_power_state(instance)
5062 expected_state = power_state.RUNNING
5064 if current_power_state != expected_state:
5065 instance.task_state = None
5066 instance.save(expected_task_state=task_states.UPDATING_PASSWORD)
5067 _msg = _('instance %s is not running') % instance.uuid
5068 raise exception.InstancePasswordSetFailed(
5069 instance=instance.uuid, reason=_msg)
5071 try:
5072 self.driver.set_admin_password(instance, new_pass)
5073 LOG.info("Admin password set", instance=instance)
5074 instance.task_state = None
5075 instance.save(
5076 expected_task_state=task_states.UPDATING_PASSWORD)
5077 except exception.InstanceAgentNotEnabled:
5078 with excutils.save_and_reraise_exception():
5079 LOG.debug('Guest agent is not enabled for the instance.',
5080 instance=instance)
5081 instance.task_state = None
5082 instance.save(
5083 expected_task_state=task_states.UPDATING_PASSWORD)
5084 except exception.SetAdminPasswdNotSupported:
5085 with excutils.save_and_reraise_exception():
5086 LOG.info('set_admin_password is not supported '
5087 'by this driver or guest instance.',
5088 instance=instance)
5089 instance.task_state = None
5090 instance.save(
5091 expected_task_state=task_states.UPDATING_PASSWORD)
5092 except NotImplementedError:
5093 LOG.warning('set_admin_password is not implemented '
5094 'by this driver or guest instance.',
5095 instance=instance)
5096 instance.task_state = None
5097 instance.save(
5098 expected_task_state=task_states.UPDATING_PASSWORD)
5099 raise NotImplementedError(_('set_admin_password is not '
5100 'implemented by this driver or guest '
5101 'instance.'))
5102 except exception.UnexpectedTaskStateError:
5103 # interrupted by another (most likely delete) task
5104 # do not retry
5105 raise
5106 except Exception:
5107 # Catch all here because this could be anything.
5108 LOG.exception('set_admin_password failed', instance=instance)
5109 # We create a new exception here so that we won't
5110 # potentially reveal password information to the
5111 # API caller. The real exception is logged above
5112 _msg = _('error setting admin password')
5113 raise exception.InstancePasswordSetFailed(
5114 instance=instance.uuid, reason=_msg)
5116 def _get_rescue_image(self, context, instance, rescue_image_ref=None):
5117 """Determine what image should be used to boot the rescue VM."""
5118 # 1. If rescue_image_ref is passed in, use that for rescue.
5119 # 2. Else, use the base image associated with instance's current image.
5120 # The idea here is to provide the customer with a rescue
5121 # environment which they are familiar with.
5122 # So, if they built their instance off of a Debian image,
5123 # their rescue VM will also be Debian.
5124 # 3. As a last resort, use instance's current image.
5125 if not rescue_image_ref:
5126 system_meta = utils.instance_sys_meta(instance)
5127 rescue_image_ref = system_meta.get('image_base_image_ref')
5129 if not rescue_image_ref:
5130 LOG.warning('Unable to find a different image to use for '
5131 'rescue VM, using instance\'s current image',
5132 instance=instance)
5133 rescue_image_ref = instance.image_ref
5135 return objects.ImageMeta.from_image_ref(
5136 context, self.image_api, rescue_image_ref)
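# --- Illustrative aside (not part of the module) ---
# The selection above is a three-step fallback: use the explicit rescue
# image if given, else the instance's stored base image, else the image
# the instance currently runs. A self-contained sketch of that choice
# (hypothetical names, plain strings instead of image refs):
def _pick_rescue_image(rescue_ref, base_ref, current_ref):
    return rescue_ref or base_ref or current_ref

assert _pick_rescue_image('resc-img', 'base-img', 'cur-img') == 'resc-img'
assert _pick_rescue_image(None, 'base-img', 'cur-img') == 'base-img'
assert _pick_rescue_image(None, None, 'cur-img') == 'cur-img'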
5138 @wrap_exception()
5139 @reverts_task_state
5140 @wrap_instance_event(prefix='compute')
5141 @wrap_instance_fault
5142 def rescue_instance(self, context, instance, rescue_password,
5143 rescue_image_ref, clean_shutdown):
5144 context = context.elevated()
5145 LOG.info('Rescuing', instance=instance)
5147 admin_password = (rescue_password if rescue_password else
5148 utils.generate_password())
5150 network_info = self.network_api.get_instance_nw_info(context, instance)
5152 rescue_image_meta = self._get_rescue_image(context, instance,
5153 rescue_image_ref)
5155 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
5156 context, instance.uuid)
5157 block_device_info = self._get_instance_block_device_info(
5158 context, instance, bdms=bdms)
5160 share_info = self._get_share_info(context, instance)
5162 extra_usage_info = {'rescue_image_name':
5163 self._get_image_name(rescue_image_meta)}
5164 self._notify_about_instance_usage(context, instance,
5165 "rescue.start", extra_usage_info=extra_usage_info,
5166 network_info=network_info)
5167 compute_utils.notify_about_instance_rescue_action(
5168 context, instance, self.host, rescue_image_ref,
5169 phase=fields.NotificationPhase.START)
5171 try:
5172 self._power_off_instance(context, instance, clean_shutdown)
5174 self._mount_all_shares(context, instance, share_info)
5176 self.driver.rescue(context, instance, network_info,
5177 rescue_image_meta, admin_password,
5178 block_device_info, share_info)
5179 except Exception as e:
5180 LOG.exception("Error trying to Rescue Instance",
5181 instance=instance)
5182 self._set_instance_obj_error_state(instance)
5183 raise exception.InstanceNotRescuable(
5184 instance_id=instance.uuid,
5185 reason=_("Driver Error: %s") % e)
5187 compute_utils.notify_usage_exists(self.notifier, context, instance,
5188 self.host, current_period=True)
5190 instance.vm_state = vm_states.RESCUED
5191 instance.task_state = None
5192 instance.power_state = self._get_power_state(instance)
5193 instance.launched_at = timeutils.utcnow()
5194 instance.save(expected_task_state=task_states.RESCUING)
5196 self._notify_about_instance_usage(context, instance,
5197 "rescue.end", extra_usage_info=extra_usage_info,
5198 network_info=network_info)
5199 compute_utils.notify_about_instance_rescue_action(
5200 context, instance, self.host, rescue_image_ref,
5201 phase=fields.NotificationPhase.END)
5203 @wrap_exception()
5204 @reverts_task_state
5205 @wrap_instance_event(prefix='compute')
5206 @wrap_instance_fault
5207 def unrescue_instance(self, context, instance):
5208 orig_context = context
5209 context = context.elevated()
5210 LOG.info('Unrescuing', instance=instance)
5212 network_info = self.network_api.get_instance_nw_info(context, instance)
5213 self._notify_about_instance_usage(context, instance,
5214 "unrescue.start", network_info=network_info)
5215 compute_utils.notify_about_instance_action(context, instance,
5216 self.host, action=fields.NotificationAction.UNRESCUE,
5217 phase=fields.NotificationPhase.START)
5219 with self._error_out_instance_on_exception(context, instance):
5220 self.driver.unrescue(orig_context, instance)
5222 instance.vm_state = vm_states.ACTIVE
5223 instance.task_state = None
5224 instance.power_state = self._get_power_state(instance)
5225 instance.save(expected_task_state=task_states.UNRESCUING)
5227 self._notify_about_instance_usage(context,
5228 instance,
5229 "unrescue.end",
5230 network_info=network_info)
5231 compute_utils.notify_about_instance_action(context, instance,
5232 self.host, action=fields.NotificationAction.UNRESCUE,
5233 phase=fields.NotificationPhase.END)
5235 @wrap_exception()
5236 @wrap_instance_event(prefix='compute')
5237 @errors_out_migration
5238 @wrap_instance_fault
5239 def confirm_resize(self, context, instance, migration):
5240 """Confirms a migration/resize and deletes the 'old' instance.
5242 This is called from the API and runs on the source host.
5244 Nothing needs to happen on the destination host at this point since
5245 the instance is already running there. This routine just cleans up the
5246 source host.
5247 """
5248 @utils.synchronized(instance.uuid)
5249 def do_confirm_resize(context, instance, migration):
5250 LOG.debug("Going to confirm migration %s", migration.id,
5251 instance=instance)
5253 if migration.status == 'confirmed':
5254 LOG.info("Migration %s is already confirmed",
5255 migration.id, instance=instance)
5256 return
5258 if migration.status not in ('finished', 'confirming'):
5259 LOG.warning("Unexpected confirmation status '%(status)s' "
5260 "of migration %(id)s, exit confirmation process",
5261 {"status": migration.status, "id": migration.id},
5262 instance=instance)
5263 return
5265 # NOTE(wangpan): Get the instance from db, if it has been
5266 # deleted, we do nothing and return here
5267 expected_attrs = ['metadata', 'system_metadata', 'flavor']
5268 try:
5269 instance = objects.Instance.get_by_uuid(
5270 context, instance.uuid,
5271 expected_attrs=expected_attrs)
5272 except exception.InstanceNotFound:
5273 LOG.info("Instance is not found during confirmation",
5274 instance=instance)
5275 return
5277 with self._error_out_instance_on_exception(context, instance):
5278 try:
5279 self._confirm_resize(
5280 context, instance, migration=migration)
5281 except Exception:
5282 # Something failed when cleaning up the source host so
5283 # log a traceback and leave a hint about hard rebooting
5284 # the server to correct its state in the DB.
5285 with excutils.save_and_reraise_exception(logger=LOG):
5286 LOG.exception(
5287 'Confirm resize failed on source host %s. '
5288 'Resource allocations in the placement service '
5289 'will be removed regardless because the instance '
5290 'is now on the destination host %s. You can try '
5291 'hard rebooting the instance to correct its '
5292 'state.', self.host, migration.dest_compute,
5293 instance=instance)
5294 finally:
5295 # Whether an error occurred or not, at this point the
5296 # instance is on the dest host. Avoid leaking allocations
5297 # in placement by deleting them here...
5298 self._delete_allocation_after_move(
5299 context, instance, migration)
5300 # ...inform the scheduler about the move...
5301 self._delete_scheduler_instance_info(
5302 context, instance.uuid)
5303 # ...and unset the cached flavor information (this is done
5304 # last since the resource tracker relies on it for its
5305 # periodic tasks)
5306 self._delete_stashed_flavor_info(instance)
5308 do_confirm_resize(context, instance, migration)
5310 def _get_updated_nw_info_with_pci_mapping(self, nw_info, pci_mapping):
5311 # NOTE(adrianc): This method returns a copy of nw_info if modifications
5312 # are made; otherwise it returns the original nw_info.
5313 updated_nw_info = nw_info
5314 if nw_info and pci_mapping:
5315 updated_nw_info = copy.deepcopy(nw_info)
5316 for vif in updated_nw_info:
5317 if vif['vnic_type'] in network_model.VNIC_TYPES_SRIOV:
5318 try:
5319 vif_pci_addr = vif['profile']['pci_slot']
5320 new_addr = pci_mapping[vif_pci_addr].address
5321 vif['profile']['pci_slot'] = new_addr
5322 LOG.debug("Updating VIF's PCI address for VIF %(id)s. "
5323 "Original value %(orig_val)s, "
5324 "new value %(new_val)s",
5325 {'id': vif['id'],
5326 'orig_val': vif_pci_addr,
5327 'new_val': new_addr})
5328 except (KeyError, AttributeError):
5329 with excutils.save_and_reraise_exception():
5330 # NOTE(adrianc): This should never happen. If we
5331 # get here it means there is some inconsistency
5332 # with either 'nw_info' or 'pci_mapping'.
5333 LOG.error("Unexpected error when updating network "
5334 "information with PCI mapping.")
5335 return updated_nw_info
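# --- Illustrative aside (not part of the module) ---
# The method above rewrites each SR-IOV VIF's pci_slot from the old device
# address to the address claimed after the move, leaving the input intact.
# A self-contained sketch with plain dicts; real VIFs are network_model.VIF
# objects and the mapping values are PciDevice objects, so this is
# illustrative only.
import copy

def _remap_pci_slots(vifs, pci_mapping, sriov_types=('direct',)):
    """Return a copy of vifs with profile['pci_slot'] rewritten."""
    updated = copy.deepcopy(vifs)
    for vif in updated:
        if vif.get('vnic_type') in sriov_types:
            old_addr = vif['profile']['pci_slot']
            vif['profile']['pci_slot'] = pci_mapping[old_addr]
    return updated

_vifs = [{'id': 'vif-1', 'vnic_type': 'direct',
          'profile': {'pci_slot': '0000:04:10.1'}}]
_remapped = _remap_pci_slots(_vifs, {'0000:04:10.1': '0000:81:00.2'})
assert _remapped[0]['profile']['pci_slot'] == '0000:81:00.2'
assert _vifs[0]['profile']['pci_slot'] == '0000:04:10.1'  # input untouched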
5337 def _confirm_resize(self, context, instance, migration=None):
5338 """Destroys the source instance."""
5339 self._notify_about_instance_usage(context, instance,
5340 "resize.confirm.start")
5341 compute_utils.notify_about_instance_action(context, instance,
5342 self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
5343 phase=fields.NotificationPhase.START)
5345 # NOTE(tr3buchet): tear down networks on source host
5346 self.network_api.setup_networks_on_host(context, instance,
5347 migration.source_compute, teardown=True)
5349 # TODO(stephenfin): These next three calls should be bundled
5350 network_info = self.network_api.get_instance_nw_info(context,
5351 instance)
5353 # NOTE(adrianc): Populate old PCI device in VIF profile
5354 # to allow virt driver to properly unplug it from Hypervisor.
5355 pci_mapping = (instance.migration_context.
5356 get_pci_mapping_for_migration(True))
5357 network_info = self._get_updated_nw_info_with_pci_mapping(
5358 network_info, pci_mapping)
5360 self.driver.confirm_migration(context, migration, instance,
5361 network_info)
5363 # Free up the old_flavor usage from the resource tracker for this host.
5364 self.rt.drop_move_claim_at_source(context, instance, migration)
5366 # NOTE(mriedem): The old_vm_state could be STOPPED but the user
5367 # might have manually powered up the instance to confirm the
5368 # resize/migrate, so we need to check the current power state
5369 # on the instance and set the vm_state appropriately. We default
5370 # to ACTIVE because if the power state is not SHUTDOWN, we
5371 # assume _sync_instance_power_state will clean it up.
5372 p_state = instance.power_state
5373 vm_state = None
5374 if p_state == power_state.SHUTDOWN:
5375 vm_state = vm_states.STOPPED
5376 LOG.debug("Resized/migrated instance is powered off. "
5377 "Setting vm_state to '%s'.", vm_state,
5378 instance=instance)
5379 else:
5380 vm_state = vm_states.ACTIVE
5382 instance.vm_state = vm_state
5383 instance.task_state = None
5384 instance.save(expected_task_state=[None, task_states.DELETING,
5385 task_states.SOFT_DELETING])
5387 self._notify_about_instance_usage(
5388 context, instance, "resize.confirm.end",
5389 network_info=network_info)
5390 compute_utils.notify_about_instance_action(context, instance,
5391 self.host, action=fields.NotificationAction.RESIZE_CONFIRM,
5392 phase=fields.NotificationPhase.END)
5394 def _delete_allocation_after_move(self, context, instance, migration):
5395 """Deletes resource allocations held by the migration record against
5396 the source compute node resource provider after a confirmed cold /
5397 successful live migration.
5398 """
5399 try:
5400 # NOTE(danms): We're finishing on the source node, so try
5401 # to delete the allocation based on the migration uuid
5402 self.reportclient.delete_allocation_for_instance(
5403 context, migration.uuid, consumer_type='migration',
5404 force=False)
5405 except exception.AllocationDeleteFailed:
5406 LOG.error('Deleting allocation in placement for migration '
5407 '%(migration_uuid)s failed. The instance '
5408 '%(instance_uuid)s will be put to ERROR state '
5409 'but the allocation held by the migration is '
5410 'leaked.',
5411 {'instance_uuid': instance.uuid,
5412 'migration_uuid': migration.uuid})
5413 raise
5415 def _delete_stashed_flavor_info(self, instance):
5416 """Remove information about the flavor change after a resize."""
5417 instance.old_flavor = None
5418 instance.new_flavor = None
5419 instance.system_metadata.pop('old_vm_state', None)
5420 instance.save()
5422 @wrap_exception()
5423 @wrap_instance_event(prefix='compute')
5424 @errors_out_migration
5425 @wrap_instance_fault
5426 def confirm_snapshot_based_resize_at_source(
5427 self, ctxt, instance, migration):
5428 """Confirms a snapshot-based resize on the source host.
5430 Cleans the guest from the source hypervisor including disks and drops
5431 the MoveClaim which will free up "old_flavor" usage from the
5432 ResourceTracker.
5434 Deletes the allocations held by the migration consumer against the
5435 source compute node resource provider.
5437 :param ctxt: nova auth request context targeted at the source cell
5438 :param instance: Instance object being resized which should have the
5439 "old_flavor" attribute set
5440 :param migration: Migration object for the resize operation
5441 """
5443 @utils.synchronized(instance.uuid)
5444 def do_confirm():
5445 LOG.info('Confirming resize on source host.', instance=instance)
5446 with self._error_out_instance_on_exception(ctxt, instance):
5447 # TODO(mriedem): Could probably make this try/except/finally
5448 # a context manager to share with confirm_resize().
5449 try:
5450 self._confirm_snapshot_based_resize_at_source(
5451 ctxt, instance, migration)
5452 except Exception:
5453 # Something failed when cleaning up the source host so
5454 # log a traceback and leave a hint about hard rebooting
5455 # the server to correct its state in the DB.
5456 with excutils.save_and_reraise_exception(logger=LOG):
5457 LOG.exception(
5458 'Confirm resize failed on source host %s. '
5459 'Resource allocations in the placement service '
5460 'will be removed regardless because the instance '
5461 'is now on the destination host %s. You can try '
5462 'hard rebooting the instance to correct its '
5463 'state.', self.host, migration.dest_compute,
5464 instance=instance)
5465 finally:
5466 # Whether an error occurred or not, at this point the
5467 # instance is on the dest host so to avoid leaking
5468 # allocations in placement, delete them here.
5469 # TODO(mriedem): Should we catch and just log
5470 # AllocationDeleteFailed? What is the user's recourse if
5471 # we got this far but this fails? At this point the
5472 # instance is on the target host and the allocations
5473 # could just be manually cleaned up by the operator.
5474 self._delete_allocation_after_move(ctxt, instance,
5475 migration)
5476 do_confirm()
5478 def _confirm_snapshot_based_resize_at_source(
5479 self, ctxt, instance, migration):
5480 """Private version of confirm_snapshot_based_resize_at_source
5482 This allows the main method to be decorated with error handlers.
5484 :param ctxt: nova auth request context targeted at the source cell
5485 :param instance: Instance object being resized which should have the
5486 "old_flavor" attribute set
5487 :param migration: Migration object for the resize operation
5488 """
5489 # Cleanup the guest from the hypervisor including local disks.
5490 network_info = self.network_api.get_instance_nw_info(ctxt, instance)
5491 LOG.debug('Cleaning up guest from source hypervisor including disks.',
5492 instance=instance)
5494 # FIXME(mriedem): Per bug 1809095, _confirm_resize calls
5495 # _get_updated_nw_info_with_pci_mapping here prior to unplugging
5496 # VIFs on the source, but in our case we have already unplugged
5497 # VIFs during prep_snapshot_based_resize_at_source, so what do we
5498 # need to do about those kinds of ports? Do we need to wait to unplug
5499 # VIFs until confirm like normal resize?
5501 # Note that prep_snapshot_based_resize_at_source already destroyed the
5502 # guest which disconnected volumes and unplugged VIFs but did not
5503 # destroy disks in case something failed during the resize and the
5504 # instance needed to be rebooted or rebuilt on the source host. Now
5505 # that we are confirming the resize we want to cleanup the disks left
5506 # on the source host. We call cleanup() instead of destroy() to avoid
5507 # any InstanceNotFound confusion from the driver since the guest was
5508 # already destroyed on this host. block_device_info=None and
5509 # destroy_vifs=False means cleanup() will not try to disconnect volumes
5510 # or unplug VIFs.
5511 self.driver.cleanup(
5512 ctxt, instance, network_info, block_device_info=None,
5513 destroy_disks=True, destroy_vifs=False)
5515 # Delete port bindings for the source host.
5516 self._confirm_snapshot_based_resize_delete_port_bindings(
5517 ctxt, instance)
5519 # Delete volume attachments for the source host.
5520 self._delete_volume_attachments(ctxt, instance.get_bdms())
5522 # Free up the old_flavor usage from the resource tracker for this host.
5523 self.rt.drop_move_claim_at_source(ctxt, instance, migration)
5525 def _confirm_snapshot_based_resize_delete_port_bindings(
5526 self, ctxt, instance):
5527 """Delete port bindings for the source host when confirming
5528 snapshot-based resize on the source host."
5530 :param ctxt: nova auth RequestContext
5531 :param instance: Instance object that was resized/cold migrated
5532 """
5533 LOG.debug('Deleting port bindings for source host.',
5534 instance=instance)
5535 try:
5536 self.network_api.cleanup_instance_network_on_host(
5537 ctxt, instance, self.host)
5538 except exception.PortBindingDeletionFailed as e:
5539 # Do not let this stop us from cleaning up since the guest
5540 # is already gone.
5541 LOG.error('Failed to delete port bindings from source host. '
5542 'Error: %s', str(e), instance=instance)
5544 def _delete_volume_attachments(self, ctxt, bdms):
5545 """Deletes volume attachment records for the given bdms.
5547 This method will log but not re-raise any exceptions if the volume
5548 attachment delete fails.
5550 :param ctxt: nova auth request context used to make
5551 DELETE /attachments/{attachment_id} requests to cinder.
5552 :param bdms: objects.BlockDeviceMappingList representing volume
5553 attachments to delete based on BlockDeviceMapping.attachment_id.
5554 """
5555 for bdm in bdms:
5556 if bdm.attachment_id:
5557 try:
5558 self.volume_api.attachment_delete(ctxt, bdm.attachment_id)
5559 except Exception as e:
5560 LOG.error('Failed to delete volume attachment with ID %s. '
5561 'Error: %s', bdm.attachment_id, str(e),
5562 instance_uuid=bdm.instance_uuid)
5564 def _update_bdm_for_swap_to_finish_resize(
5565 self, context, instance, confirm=True):
5566 """This updates bdm.swap with new swap info"""
5568 bdms = instance.get_bdms()
5569 if not (instance.old_flavor and instance.new_flavor):
5570 return bdms
5572 if instance.old_flavor.swap == instance.new_flavor.swap:
5573 return bdms
5575 old_swap = instance.old_flavor.swap
5576 new_swap = instance.new_flavor.swap
5577 if not confirm:
5578 # revert flavor on _finish_revert_resize
5579 old_swap = instance.new_flavor.swap
5580 new_swap = instance.old_flavor.swap
5582 # add swap
5583 if old_swap == 0 and new_swap:
5584 # (auniyal): old_swap = 0 means we did not have a swap bdm
5585 # for this instance, and as there is a new_swap, this is a
5586 # swap addition.
5587 new_swap_bdm = block_device.create_blank_bdm(new_swap, 'swap')
5588 bdm_obj = objects.BlockDeviceMapping(
5589 context, instance_uuid=instance.uuid, **new_swap_bdm)
5590 bdm_obj.update_or_create()
5591 return instance.get_bdms()
5593 # update swap
5594 for bdm in bdms:
5595 if bdm.guest_format == 'swap' and bdm.device_type == 'disk':
5596 if new_swap > 0:
5597 LOG.info('Updating swap BDM.', instance=instance)
5598 bdm.volume_size = new_swap
5599 bdm.save()
5600 break
5601 elif new_swap == 0:
5602 LOG.info('Deleting swap BDM.', instance=instance)
5603 bdm.destroy()
5604 bdms.objects.remove(bdm)
5605 break
5606 return bdms
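# --- Illustrative aside (not part of the module) ---
# The method above reduces to a three-way decision on the old and new
# flavor swap sizes: add a swap BDM, resize the existing one, or drop it
# (and do nothing when the sizes match). A self-contained sketch of just
# that decision; the helper name is hypothetical.
def _swap_bdm_action(old_swap_mb, new_swap_mb):
    if old_swap_mb == new_swap_mb:
        return 'noop'
    if old_swap_mb == 0 and new_swap_mb > 0:
        return 'add'
    if new_swap_mb == 0:
        return 'delete'
    return 'resize'

assert _swap_bdm_action(0, 1024) == 'add'
assert _swap_bdm_action(1024, 2048) == 'resize'
assert _swap_bdm_action(1024, 0) == 'delete'
assert _swap_bdm_action(512, 512) == 'noop'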
5608 @wrap_exception()
5609 @reverts_task_state
5610 @wrap_instance_event(prefix='compute')
5611 @errors_out_migration
5612 @wrap_instance_fault
5613 def revert_snapshot_based_resize_at_dest(self, ctxt, instance, migration):
5614 """Reverts a snapshot-based resize at the destination host.
5616 Cleans the guest from the destination compute service host hypervisor
5617 and related resources (ports, volumes) and frees resource usage from
5618 the compute service on that host.
5620 :param ctxt: nova auth request context targeted at the target cell
5621 :param instance: Instance object whose vm_state is "resized" and
5622 task_state is "resize_reverting".
5623 :param migration: Migration object whose status is "reverting".
5624 """
5625 # A resize revert is essentially a resize back to the old size, so we
5626 # need to send a usage event here.
5627 compute_utils.notify_usage_exists(
5628 self.notifier, ctxt, instance, self.host, current_period=True)
5630 @utils.synchronized(instance.uuid)
5631 def do_revert():
5632 LOG.info('Reverting resize on destination host.',
5633 instance=instance)
5634 with self._error_out_instance_on_exception(ctxt, instance):
5635 self._revert_snapshot_based_resize_at_dest(
5636 ctxt, instance, migration)
5637 do_revert()
5639 # Broadcast to all schedulers that the instance is no longer on
5640 # this host and clear any waiting callback events. This is best effort
5641 # so if anything fails just log it.
5642 try:
5643 self._delete_scheduler_instance_info(ctxt, instance.uuid)
5644 self.instance_events.clear_events_for_instance(instance)
5645 except Exception as e:
5646 LOG.warning('revert_snapshot_based_resize_at_dest failed during '
5647 'post-processing. Error: %s', e, instance=instance)
5649 def _revert_snapshot_based_resize_at_dest(
5650 self, ctxt, instance, migration):
5651 """Private version of revert_snapshot_based_resize_at_dest.
5653 This allows the main method to be decorated with error handlers.
5655 :param ctxt: nova auth request context targeted at the target cell
5656 :param instance: Instance object whose vm_state is "resized" and
5657 task_state is "resize_reverting".
5658 :param migration: Migration object whose status is "reverting".
5659 """
5660 # Cleanup the guest from the hypervisor including local disks.
5661 network_info = self.network_api.get_instance_nw_info(ctxt, instance)
5662 bdms = instance.get_bdms()
5663 block_device_info = self._get_instance_block_device_info(
5664 ctxt, instance, bdms=bdms)
5665 LOG.debug('Destroying guest from destination hypervisor including '
5666 'disks.', instance=instance)
5667 self.driver.destroy(
5668 ctxt, instance, network_info, block_device_info=block_device_info)
5670 # Activate source host port bindings. We need to do this before
5671 # deleting the (active) dest host port bindings in
5672 # setup_networks_on_host otherwise the ports will be unbound and
5673 # finish on the source will fail.
5674 # migrate_instance_start uses migration.dest_compute for the port
5675 # binding host and since we want to activate the source host port
5676 # bindings, we need to temporarily mutate the migration object.
5677 with utils.temporary_mutation(
5678 migration, dest_compute=migration.source_compute):
5679 LOG.debug('Activating port bindings for source host %s.',
5680 migration.source_compute, instance=instance)
5681 # TODO(mriedem): https://review.opendev.org/#/c/594139/ would allow
5682 # us to remove this and make setup_networks_on_host do it.
5683 # TODO(mriedem): Should we try/except/log any errors but continue?
5684 self.network_api.migrate_instance_start(
5685 ctxt, instance, migration)
5687 # Delete port bindings for the target host.
5688 LOG.debug('Deleting port bindings for target host %s.',
5689 self.host, instance=instance)
5690 try:
5691 # Note that deleting the destination host port bindings does
5692 # not automatically activate the source host port bindings.
5693 self.network_api.cleanup_instance_network_on_host(
5694 ctxt, instance, self.host)
5695 except exception.PortBindingDeletionFailed as e:
5696 # Do not let this stop us from cleaning up since the guest
5697 # is already gone.
5698 LOG.error('Failed to delete port bindings from target host. '
5699 'Error: %s', str(e), instance=instance)
5701 # Delete any volume attachments remaining for this target host.
5702 LOG.debug('Deleting volume attachments for target host.',
5703 instance=instance)
5704 self._delete_volume_attachments(ctxt, bdms)
5706 # Free up the new_flavor usage from the resource tracker for this host.
5707 self.rt.drop_move_claim_at_dest(ctxt, instance, migration)
5709 def _revert_instance_flavor_host_node(self, instance, migration):
5710 """Revert host, node and flavor fields after a resize-revert."""
5711 self._set_instance_info(instance, instance.old_flavor)
5712 instance.host = migration.source_compute
5713 # NOTE(danms): This could fail if we somehow ended up on the wrong
5714 # host without the desired node. That's a big problem, so let the
5715 # ComputeHostNotFound raise from here.
5716 cn = self.rt.get_node_by_name(migration.source_node)
5717 instance.node = cn.hypervisor_hostname
5718 instance.compute_id = cn.id
5719 instance.save(expected_task_state=[task_states.RESIZE_REVERTING])
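# Illustrative sketch of the expected_task_state guard used by
# instance.save() above: the update only applies if the task_state still
# matches what the caller expects, otherwise an error is raised. This is a
# simplified stand-in, not objects.Instance.save(); the RuntimeError here
# stands in for the specific exception the real object layer raises.
class _GuardedInstance:
    def __init__(self, task_state):
        self.task_state = task_state

    def save(self, expected_task_state=None):
        if (expected_task_state is not None and
                self.task_state not in expected_task_state):
            raise RuntimeError('unexpected task state: %s' % self.task_state)
        # a real save would persist the changed fields here

_inst = _GuardedInstance(task_state='resize_reverting')
_inst.save(expected_task_state=['resize_reverting'])  # passes the guard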
5721 @wrap_exception()
5722 @reverts_task_state
5723 @wrap_instance_event(prefix='compute')
5724 @errors_out_migration
5725 @wrap_instance_fault
5726 def finish_revert_snapshot_based_resize_at_source(
5727 self, ctxt, instance, migration):
5728 """Reverts a snapshot-based resize at the source host.
5730 Spawn the guest and re-connect volumes/VIFs on the source host and
5731 revert the instance to use the old_flavor for resource usage reporting.
5733 Updates allocations in the placement service to move the source node
5734 allocations, held by the migration record, to the instance and drop
5735 the allocations held by the instance on the destination node.
5737 :param ctxt: nova auth request context targeted at the target cell
5738 :param instance: Instance object whose vm_state is "resized" and
5739 task_state is "resize_reverting".
5740 :param migration: Migration object whose status is "reverting".
5741 """
5743 @utils.synchronized(instance.uuid)
5744 def do_revert():
5745 LOG.info('Reverting resize on source host.', instance=instance)
5746 with self._error_out_instance_on_exception(ctxt, instance):
5747 self._finish_revert_snapshot_based_resize_at_source(
5748 ctxt, instance, migration)
5750 try:
5751 do_revert()
5752 finally:
5753 self._delete_stashed_flavor_info(instance)
5755 # Broadcast to all schedulers that the instance is on this host.
5756 # This is best effort so if anything fails just log it.
5757 try:
5758 self._update_scheduler_instance_info(ctxt, instance)
5759 except Exception as e:
5760 LOG.warning('finish_revert_snapshot_based_resize_at_source failed '
5761 'during post-processing. Error: %s', e,
5762 instance=instance)
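# Illustrative sketch of the per-instance lock pattern used above
# (@utils.synchronized(instance.uuid) around an inner do_revert()): all
# operations synchronizing on the same instance UUID are serialized on this
# host. Simplified stand-in using a plain threading lock registry, not
# nova.utils.synchronized / oslo.concurrency.
import collections
import functools
import threading

_LOCKS = collections.defaultdict(threading.Lock)

def _synchronized(name):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            with _LOCKS[name]:
                return func(*args, **kwargs)
        return wrapper
    return decorator

def _revert_for(instance_uuid):
    @_synchronized(instance_uuid)
    def do_revert():
        return 'reverted %s' % instance_uuid
    return do_revert()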
5764 def _finish_revert_snapshot_based_resize_at_source(
5765 self, ctxt, instance, migration):
5766 """Private version of finish_revert_snapshot_based_resize_at_source.
5768 This allows the main method to be decorated with error handlers.
5770 :param ctxt: nova auth request context targeted at the source cell
5771 :param instance: Instance object whose vm_state is "resized" and
5772 task_state is "resize_reverting".
5773 :param migration: Migration object whose status is "reverting".
5774 """
5775 # Get stashed old_vm_state information to determine if guest should
5776 # be powered on after spawn; we default to ACTIVE for backwards
5777 # compatibility if old_vm_state is not set
5778 old_vm_state = instance.system_metadata.get(
5779 'old_vm_state', vm_states.ACTIVE)
5781 # Revert the flavor and host/node fields to their previous values
5782 self._revert_instance_flavor_host_node(instance, migration)
5784 # Move the allocations against the source compute node resource
5785 # provider, held by the migration, to the instance which will drop
5786 # the destination compute node resource provider allocations held by
5787 # the instance. This puts the allocations against the source node
5788 # back to the old_flavor and owned by the instance.
5789 try:
5790 self._revert_allocation(ctxt, instance, migration)
5791 except exception.AllocationMoveFailed:
5792 # Log the error but do not re-raise because we want to continue to
5793 # process ports and volumes below.
5794 LOG.error('Reverting allocation in placement for migration '
5795 '%(migration_uuid)s failed. You may need to manually '
5796 'remove the allocations for the migration consumer '
5797 'against the source node resource provider '
5798 '%(source_provider)s and the allocations for the '
5799 'instance consumer against the destination node '
5800 'resource provider %(dest_provider)s and then run the '
5801 '"nova-manage placement heal_allocations" command.',
5802 {'instance_uuid': instance.uuid,
5803 'migration_uuid': migration.uuid,
5804 'source_provider': migration.source_node,
5805 'dest_provider': migration.dest_node},
5806 instance=instance)
5808 bdms = instance.get_bdms()
5809 # prep_snapshot_based_resize_at_source created empty volume attachments
5810 # that we need to update here to get the connection_info before calling
5811 # driver.finish_revert_migration which will connect the volumes to this
5812 # host.
5813 LOG.debug('Updating volume attachments for target host %s.',
5814 self.host, instance=instance)
5815 # TODO(mriedem): We should probably make _update_volume_attachments
5816 # (optionally) graceful to errors so we (1) try to process all
5817 # attachments and (2) continue to process networking below.
5818 self._update_volume_attachments(ctxt, instance, bdms)
5820 LOG.debug('Updating port bindings for source host %s.',
5821 self.host, instance=instance)
5822 # TODO(mriedem): Calculate provider mappings when we support
5823 # cross-cell resize/migrate with ports having resource requests.
5824 # NOTE(hanrong): we need to temporarily change migration.dest_compute
5825 # to the source host.
5826 # "network_api.migrate_instance_finish" sets up the network
5827 # for the instance on the destination host. For a revert resize
5828 # the instance goes back to the source host, so the network
5829 # setup for the instance should happen on the source host. So
5830 # set migration.dest_compute to the source host here.
5831 with utils.temporary_mutation(
5832 migration, dest_compute=migration.source_compute
5833 ):
5834 self.network_api.migrate_instance_finish(
5835 ctxt, instance, migration, provider_mappings=None)
5836 network_info = self.network_api.get_instance_nw_info(ctxt, instance)
5838 # Remember that prep_snapshot_based_resize_at_source destroyed the
5839 # guest but left the disks intact, so we cannot call spawn() here;
5840 # finish_revert_migration should do the job.
5841 block_device_info = self._get_instance_block_device_info(
5842 ctxt, instance, bdms=bdms)
5843 power_on = old_vm_state == vm_states.ACTIVE
5844 driver_error = None
5845 try:
5846 self.driver.finish_revert_migration(
5847 ctxt, instance, network_info, migration,
5848 block_device_info=block_device_info, power_on=power_on)
5849 except Exception as e:
5850 driver_error = e
5851 # Leave a hint about hard rebooting the guest and reraise so the
5852 # instance is put into ERROR state.
5853 with excutils.save_and_reraise_exception(logger=LOG):
5854 LOG.error('An error occurred during finish_revert_migration. '
5855 'The instance may need to be hard rebooted. Error: '
5856 '%s', driver_error, instance=instance)
5857 else:
5858 # Perform final cleanup of the instance in the database.
5859 instance.drop_migration_context()
5860 # If the original vm_state was STOPPED, set it back to STOPPED.
5861 vm_state = vm_states.ACTIVE if power_on else vm_states.STOPPED
5862 self._update_instance_after_spawn(instance, vm_state=vm_state)
5863 instance.save(expected_task_state=[task_states.RESIZE_REVERTING])
5864 finally:
5865 # Complete any volume attachments so the volumes are in-use. We
5866 # do this regardless of finish_revert_migration failing because
5867 # the instance is back on this host now and we do not want to leave
5868 # the volumes in a pending state in case the instance is hard
5869 # rebooted.
5870 LOG.debug('Completing volume attachments for instance on source '
5871 'host.', instance=instance)
5872 with excutils.save_and_reraise_exception(
5873 reraise=driver_error is not None, logger=LOG):
5874 self._complete_volume_attachments(ctxt, bdms)
5876 migration.status = 'reverted'
5877 migration.save()
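# Illustrative sketch of the stashed-old_vm_state logic used above: the
# original vm_state is stashed in system_metadata when the resize starts
# and read back on revert to decide whether to power the guest on and
# which vm_state to restore. The string literals stand in for the
# vm_states constants; this is a sketch, not the module's code.
def _target_state_after_revert(system_metadata):
    old_vm_state = system_metadata.get('old_vm_state', 'active')
    power_on = old_vm_state == 'active'
    vm_state = 'active' if power_on else 'stopped'
    return power_on, vm_state

assert _target_state_after_revert({}) == (True, 'active')
assert _target_state_after_revert({'old_vm_state': 'stopped'}) == (False, 'stopped')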
5879 @wrap_exception()
5880 @reverts_task_state
5881 @wrap_instance_event(prefix='compute')
5882 @errors_out_migration
5883 @wrap_instance_fault
5884 def revert_resize(self, context, instance, migration, request_spec):
5885 """Destroys the new instance on the destination machine.
5887 Reverts the model changes, and powers on the old instance on the
5888 source machine.
5890 """
5891 # NOTE(comstud): A revert_resize is essentially a resize back to
5892 # the old size, so we need to send a usage event here.
5893 compute_utils.notify_usage_exists(self.notifier, context, instance,
5894 self.host, current_period=True)
5896 with self._error_out_instance_on_exception(context, instance):
5897 # NOTE(tr3buchet): tear down networks on destination host
5898 self.network_api.setup_networks_on_host(context, instance,
5899 teardown=True)
5901 self.network_api.migrate_instance_start(context,
5902 instance,
5903 migration)
5905 network_info = self.network_api.get_instance_nw_info(context,
5906 instance)
5907 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
5908 context, instance.uuid)
5909 block_device_info = self._get_instance_block_device_info(
5910 context, instance, bdms=bdms)
5912 destroy_disks = not self._is_instance_storage_shared(
5913 context, instance, host=migration.source_compute)
5914 self.driver.destroy(context, instance, network_info,
5915 block_device_info, destroy_disks)
5917 self._terminate_volume_connections(context, instance, bdms)
5919 # Free up the new_flavor usage from the resource tracker for this
5920 # host.
5921 self.rt.drop_move_claim_at_dest(context, instance, migration)
5923 # RPC cast back to the source host to finish the revert there.
5924 self.compute_rpcapi.finish_revert_resize(context, instance,
5925 migration, migration.source_compute, request_spec)
5927 @wrap_exception()
5928 @reverts_task_state
5929 @wrap_instance_event(prefix='compute')
5930 @errors_out_migration
5931 @wrap_instance_fault
5932 def finish_revert_resize(self, context, instance, migration, request_spec):
5933 """Finishes the second half of reverting a resize on the source host.
5935 Bring the original source instance state back (active/shutoff) and
5936 revert the resized attributes in the database.
5938 """
5939 try:
5940 self._finish_revert_resize(
5941 context, instance, migration, request_spec)
5942 finally:
5943 self._delete_stashed_flavor_info(instance)
5945 def _finish_revert_resize(
5946 self, context, instance, migration, request_spec=None,
5947 ):
5948 """Inner version of finish_revert_resize."""
5949 with self._error_out_instance_on_exception(context, instance):
5950 bdms = self._update_bdm_for_swap_to_finish_resize(
5951 context, instance, confirm=False)
5953 self._notify_about_instance_usage(
5954 context, instance, "resize.revert.start")
5955 compute_utils.notify_about_instance_action(context, instance,
5956 self.host, action=fields.NotificationAction.RESIZE_REVERT,
5957 phase=fields.NotificationPhase.START, bdms=bdms)
5959 # Get stashed old_vm_state information to determine if guest should
5960 # be powered on after spawn; we default to ACTIVE for backwards
5961 # compatibility if old_vm_state is not set
5962 old_vm_state = instance.system_metadata.get(
5963 'old_vm_state', vm_states.ACTIVE)
5965 # Revert the flavor and host/node fields to their previous values
5966 self._revert_instance_flavor_host_node(instance, migration)
5968 try:
5969 source_allocations = self._revert_allocation(
5970 context, instance, migration)
5971 except exception.AllocationMoveFailed:
5972 LOG.error('Reverting allocation in placement for migration '
5973 '%(migration_uuid)s failed. The instance '
5974 '%(instance_uuid)s will be put into ERROR state but '
5975 'the allocation held by the migration is leaked.',
5976 {'instance_uuid': instance.uuid,
5977 'migration_uuid': migration.uuid})
5978 raise
5980 provider_mappings = self._fill_provider_mapping_based_on_allocs(
5981 context, source_allocations, request_spec)
5983 self.network_api.setup_networks_on_host(context, instance,
5984 migration.source_compute)
5985 # NOTE(hanrong): we need to change migration.dest_compute to the
5986 # source host temporarily. "network_api.migrate_instance_finish"
5987 # sets up the network for the instance on the destination host.
5988 # For a revert resize the instance goes back to the source host,
5989 # so the network setup for the instance should happen on the
5990 # source host. So set migration.dest_compute to the source host here.
5991 with utils.temporary_mutation(
5992 migration, dest_compute=migration.source_compute):
5993 self.network_api.migrate_instance_finish(context,
5994 instance,
5995 migration,
5996 provider_mappings)
5997 network_info = self.network_api.get_instance_nw_info(context,
5998 instance)
6000 # revert_resize deleted any volume attachments for the instance
6001 # and created new ones to be used on this host, but we
6002 # have to update those attachments with the host connector so the
6003 # BDM.connection_info will get set in the call to
6004 # _get_instance_block_device_info below with refresh_conn_info=True
6005 # and then the volumes can be re-connected via the driver on this
6006 # host.
6007 self._update_volume_attachments(context, instance, bdms)
6009 block_device_info = self._get_instance_block_device_info(
6010 context, instance, refresh_conn_info=True, bdms=bdms)
6012 power_on = old_vm_state != vm_states.STOPPED
6013 self.driver.finish_revert_migration(
6014 context, instance, network_info, migration, block_device_info,
6015 power_on)
6017 instance.drop_migration_context()
6018 instance.launched_at = timeutils.utcnow()
6019 instance.save(expected_task_state=task_states.RESIZE_REVERTING)
6021 # Complete any volume attachments so the volumes are in-use.
6022 self._complete_volume_attachments(context, bdms)
6024 # if the original vm state was STOPPED, set it back to STOPPED
6025 LOG.info("Updating instance to original state: '%s'",
6026 old_vm_state, instance=instance)
6027 if power_on:  # coverage: line 6027 didn't jump to line 6032 because the condition on line 6027 was always true
6028 instance.vm_state = vm_states.ACTIVE
6029 instance.task_state = None
6030 instance.save()
6031 else:
6032 instance.task_state = task_states.POWERING_OFF
6033 instance.save()
6034 self.stop_instance(context, instance=instance,
6035 clean_shutdown=True)
6037 self._notify_about_instance_usage(
6038 context, instance, "resize.revert.end")
6039 compute_utils.notify_about_instance_action(context, instance,
6040 self.host, action=fields.NotificationAction.RESIZE_REVERT,
6041 phase=fields.NotificationPhase.END, bdms=bdms)
6043 def _fill_provider_mapping_based_on_allocs(
6044 self, context, allocations, request_spec):
6045 """Fills and returns the request group - resource provider mapping
6046 based on the allocations passed in.
6048 :param context: The security context
6049 :param allocations: allocations dict keyed by RP UUID.
6050 :param request_spec: The RequestSpec object associated with the
6051 operation
6052 :returns: None if the request_spec is None. Otherwise a mapping
6053 between RequestGroup requester_id, currently Neutron port_id,
6054 and a list of resource provider UUIDs providing resource for
6055 that RequestGroup.
6056 """
6057 if request_spec:
6058 # NOTE(gibi): We need to re-calculate the resource provider -
6059 # port mapping as we have to have the neutron ports allocate
6060 # from the source compute after revert.
6061 scheduler_utils.fill_provider_mapping_based_on_allocation(
6062 context, self.reportclient, request_spec, allocations)
6063 provider_mappings = self._get_request_group_mapping(
6064 request_spec)
6065 else:
6066 # TODO(sbauza): Remove this conditional once we only support RPC
6067 # API 6.0
6068 # NOTE(gibi): The compute RPC is pinned to be older than 5.2
6069 # and therefore request_spec is not sent. We cannot calculate
6070 # the provider mappings. If the instance has ports with
6071 # resource request then the port update will fail in
6072 # _update_port_binding_for_instance() called via
6073 # migrate_instance_finish() in finish_revert_resize.
6074 provider_mappings = None
6075 return provider_mappings
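# Illustrative sketch of the return value described in the docstring above:
# a mapping from RequestGroup requester_id (currently a Neutron port UUID)
# to the list of resource provider UUIDs satisfying that group's request.
# The UUIDs below are made up for illustration only.
provider_mappings_example = {
    # requester_id (port UUID)            -> providers serving that group
    'c9a6f8e1-0000-0000-0000-000000000001': [
        'b1f7d0aa-0000-0000-0000-0000000000aa',  # e.g. a NIC RP with bandwidth
    ],
}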
6077 def _revert_allocation(self, context, instance, migration):
6078 """Revert an allocation that is held by migration to our instance."""
6080 # Fetch the original allocations that the instance had on the
6081 # source node, which are now held by the migration
6082 orig_alloc = self.reportclient.get_allocations_for_consumer(
6083 context, migration.uuid)
6084 if not orig_alloc:
6085 LOG.error('Did not find resource allocations for migration '
6086 '%s on source node %s. Unable to revert source node '
6087 'allocations back to the instance.',
6088 migration.uuid, migration.source_node, instance=instance)
6089 return False
6091 LOG.info('Swapping old allocation on %(rp_uuids)s held by migration '
6092 '%(mig)s for instance',
6093 {'rp_uuids': orig_alloc.keys(), 'mig': migration.uuid},
6094 instance=instance)
6095 # FIXME(gibi): This method is flawed in that it does not handle
6096 # allocations against sharing providers in any special way. This leads
6097 # to duplicate allocations against the sharing provider during
6098 # migration.
6099 # TODO(cdent): Should we be doing anything with return values here?
6100 self.reportclient.move_allocations(context, migration.uuid,
6101 instance.uuid)
6102 return orig_alloc
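# Illustrative sketch of what the allocation "swap" above means: before
# the revert, the source-node usage is held by the migration UUID consumer
# and the dest-node usage by the instance UUID consumer; move_allocations
# re-assigns the source-node allocation to the instance, and the dest-node
# allocation held by the instance is dropped as part of the move. The
# UUIDs and resource amounts below are made up for illustration only.
before = {
    'migration-uuid': {'source-rp-uuid': {'resources': {'VCPU': 2, 'MEMORY_MB': 2048}}},
    'instance-uuid':  {'dest-rp-uuid':   {'resources': {'VCPU': 4, 'MEMORY_MB': 4096}}},
}
after = {
    'instance-uuid':  {'source-rp-uuid': {'resources': {'VCPU': 2, 'MEMORY_MB': 2048}}},
}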
6104 def _prep_resize(
6105 self, context, image, instance, flavor, filter_properties, node,
6106 migration, request_spec, clean_shutdown=True,
6107 ):
6109 if not filter_properties:  # coverage: line 6109 didn't jump to line 6112 because the condition on line 6109 was always true
6110 filter_properties = {}
6112 if not instance.host:
6113 self._set_instance_obj_error_state(instance)
6114 msg = _('Instance has no source host')
6115 raise exception.MigrationError(reason=msg)
6117 same_host = instance.host == self.host
6118 # if the flavor IDs match, it's a migration; otherwise it's a resize
6119 if same_host and flavor.id == instance['instance_type_id']:
6120 # check whether the driver supports migrating to the same host
6121 if not self.driver.capabilities.get(
6122 'supports_migrate_to_same_host', False):
6123 # Raise InstanceFaultRollback so that the
6124 # _error_out_instance_on_exception context manager in
6125 # prep_resize will set the instance.vm_state properly.
6126 raise exception.InstanceFaultRollback(
6127 inner_exception=exception.UnableToMigrateToSelf(
6128 instance_id=instance.uuid, host=self.host))
6130 # NOTE(danms): Stash the new flavor to avoid having to
6131 # look it up in the database later
6132 instance.new_flavor = flavor
6133 # NOTE(mriedem): Stash the old vm_state so we can set the
6134 # resized/reverted instance back to the same state later.
6135 vm_state = instance.vm_state
6136 LOG.debug('Stashing vm_state: %s', vm_state, instance=instance)
6137 instance.system_metadata['old_vm_state'] = vm_state
6138 instance.save()
6140 if not isinstance(request_spec, objects.RequestSpec):
6141 # Prior to compute RPC API 5.1 conductor would pass a legacy dict
6142 # version of the request spec to compute and since Stein compute
6143 # could be sending that back to conductor on reschedule, so if we
6144 # got a dict convert it to an object.
6145 # TODO(mriedem): We can drop this compat code when we only support
6146 # compute RPC API >=6.0.
6147 request_spec = objects.RequestSpec.from_primitives(
6148 context, request_spec, filter_properties)
6149 # We don't have to set the new flavor on the request spec because
6150 # if we got here it was due to a reschedule from the compute and
6151 # the request spec would already have the new flavor in it from the
6152 # else block below.
6154 provider_mapping = self._get_request_group_mapping(request_spec)
6156 if provider_mapping:
6157 try:
6158 compute_utils.update_pci_request_with_placement_allocations(
6159 context,
6160 self.reportclient,
6161 instance.pci_requests.requests,
6162 provider_mapping,
6163 )
6164 except (exception.AmbiguousResourceProviderForPCIRequest,
6165 exception.UnexpectedResourceProviderNameForPCIRequest
6166 ) as e:
6167 raise exception.BuildAbortException(
6168 reason=str(e), instance_uuid=instance.uuid)
6170 limits = filter_properties.get('limits', {})
6171 allocs = self.reportclient.get_allocations_for_consumer(
6172 context, instance.uuid)
6173 with self.rt.resize_claim(
6174 context, instance, flavor, node, migration, allocs,
6175 image_meta=image, limits=limits,
6176 ) as claim:
6177 LOG.info('Migrating', instance=instance)
6178 # RPC cast to the source host to start the actual resize/migration.
6179 self.compute_rpcapi.resize_instance(
6180 context, instance, claim.migration, image,
6181 flavor, request_spec, clean_shutdown)
6183 def _send_prep_resize_notifications(
6184 self, context, instance, phase, flavor):
6185 """Send "resize.prep.*" notifications.
6187 :param context: nova auth request context
6188 :param instance: The instance being resized
6189 :param phase: The phase of the action (NotificationPhase enum)
6190 :param flavor: The (new) flavor for the resize (same as existing
6191 instance.flavor for a cold migration)
6192 """
6193 # Only send notify_usage_exists if it's the "start" phase.
6194 if phase == fields.NotificationPhase.START:
6195 compute_utils.notify_usage_exists(
6196 self.notifier, context, instance, self.host,
6197 current_period=True)
6199 # Send extra usage info about the flavor if it's the "end" phase for
6200 # the legacy unversioned notification.
6201 extra_usage_info = None
6202 if phase == fields.NotificationPhase.END:
6203 extra_usage_info = dict(
6204 new_instance_type=flavor.name,
6205 new_instance_type_id=flavor.id)
6206 self._notify_about_instance_usage(
6207 context, instance, "resize.prep.%s" % phase,
6208 extra_usage_info=extra_usage_info)
6210 # Send the versioned notification.
6211 compute_utils.notify_about_resize_prep_instance(
6212 context, instance, self.host, phase, flavor)
6214 @wrap_exception()
6215 @reverts_task_state
6216 @wrap_instance_event(prefix='compute')
6217 @wrap_instance_fault
6218 def prep_resize(self, context, image, instance, flavor,
6219 request_spec, filter_properties, node,
6220 clean_shutdown, migration, host_list):
6221 """Initiates the process of moving a running instance to another host.
6223 Possibly changes the VCPU, RAM and disk size in the process.
6225 This is initiated from conductor and runs on the destination host.
6227 The main purpose of this method is performing some checks on the
6228 destination host and making a claim for resources. If the claim fails
6229 then a reschedule to another host may be attempted which involves
6230 calling back to conductor to start the process over again.
6231 """
6232 if node is None:
6233 node = self._get_nodename(instance, refresh=True)
6235 # Pass instance_state=instance.vm_state because we can resize
6236 # a STOPPED server and we don't want to set it back to ACTIVE
6237 # in case _prep_resize fails.
6238 instance_state = instance.vm_state
6239 with self._error_out_instance_on_exception(
6240 context, instance, instance_state=instance_state), \
6241 errors_out_migration_ctxt(migration):
6243 self._send_prep_resize_notifications(
6244 context, instance, fields.NotificationPhase.START,
6245 flavor)
6246 try:
6247 scheduler_hints = self._get_scheduler_hints(filter_properties,
6248 request_spec)
6249 # Error out if this host cannot accept the new instance due
6250 # to anti-affinity. At this point the migration is already
6251 # in-progress, so this is the definitive moment to abort due to
6252 # the policy violation. Also, exceptions raised here are covered by
6253 # the cleanup methods in the except block.
6254 try:
6255 self._validate_instance_group_policy(context, instance,
6256 scheduler_hints)
6257 except exception.RescheduledException as e:
6258 raise exception.InstanceFaultRollback(inner_exception=e)
6260 self._prep_resize(context, image, instance,
6261 flavor, filter_properties,
6262 node, migration, request_spec,
6263 clean_shutdown)
6264 except exception.BuildAbortException:
6265 # NOTE(gibi): We failed
6266 # update_pci_request_with_placement_allocations so
6267 # there is no reason to re-schedule. Just revert the allocation
6268 # and fail the migration.
6269 with excutils.save_and_reraise_exception():
6270 self._revert_allocation(context, instance, migration)
6271 except Exception:
6272 # Since we hit a failure, we're either rescheduling or dead
6273 # and either way we need to cleanup any allocations created
6274 # by the scheduler for the destination node.
6275 self._revert_allocation(context, instance, migration)
6276 # try to re-schedule the resize elsewhere:
6277 exc_info = sys.exc_info()
6278 self._reschedule_resize_or_reraise(context, instance,
6279 exc_info, flavor, request_spec,
6280 filter_properties, host_list)
6281 finally:
6282 self._send_prep_resize_notifications(
6283 context, instance, fields.NotificationPhase.END,
6284 flavor)
6286 def _reschedule_resize_or_reraise(self, context, instance, exc_info,
6287 flavor, request_spec, filter_properties, host_list):
6288 """Try to re-schedule the resize or re-raise the original error to
6289 error out the instance.
6290 """
6291 if not filter_properties:
6292 filter_properties = {}
6294 rescheduled = False
6295 instance_uuid = instance.uuid
6297 try:
6298 retry = filter_properties.get('retry')
6299 if retry:
6300 LOG.debug('Rescheduling, attempt %d', retry['num_attempts'],
6301 instance_uuid=instance_uuid)
6303 # reset the task state
6304 task_state = task_states.RESIZE_PREP
6305 self._instance_update(context, instance, task_state=task_state)
6307 if exc_info:  # coverage: line 6307 didn't jump to line 6313 because the condition on line 6307 was always true
6308 # stringify to avoid circular ref problem in json
6309 # serialization
6310 retry['exc'] = traceback.format_exception_only(
6311 exc_info[0], exc_info[1])
6313 scheduler_hint = {'filter_properties': filter_properties}
6315 self.compute_task_api.resize_instance(
6316 context, instance, scheduler_hint, flavor,
6317 request_spec=request_spec, host_list=host_list)
6319 rescheduled = True
6320 else:
6321 # no retry information, do not reschedule.
6322 LOG.debug('Retry info not present, will not reschedule',
6323 instance_uuid=instance_uuid)
6324 rescheduled = False
6325 except Exception as error:
6326 rescheduled = False
6327 LOG.exception("Error trying to reschedule",
6328 instance_uuid=instance_uuid)
6329 compute_utils.add_instance_fault_from_exc(context,
6330 instance, error,
6331 exc_info=sys.exc_info())
6332 self._notify_about_instance_usage(context, instance,
6333 'resize.error', fault=error)
6334 compute_utils.notify_about_instance_action(
6335 context, instance, self.host,
6336 action=fields.NotificationAction.RESIZE,
6337 phase=fields.NotificationPhase.ERROR,
6338 exception=error,
6339 )
6341 if rescheduled:
6342 self._log_original_error(exc_info, instance_uuid)
6343 compute_utils.add_instance_fault_from_exc(context,
6344 instance, exc_info[1], exc_info=exc_info)
6345 self._notify_about_instance_usage(context, instance,
6346 'resize.error', fault=exc_info[1])
6347 compute_utils.notify_about_instance_action(
6348 context, instance, self.host,
6349 action=fields.NotificationAction.RESIZE,
6350 phase=fields.NotificationPhase.ERROR,
6351 exception=exc_info[1],
6352 )
6353 else:
6354 # not re-scheduling
6355 exc = exc_info[1] or exc_info[0]()
6356 if exc.__traceback__ is not exc_info[2]:  # coverage: line 6356 didn't jump to line 6357 because the condition on line 6356 was never true
6357 raise exc.with_traceback(exc_info[2])
6358 raise exc
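# Illustrative sketch of the re-raise pattern just above: when not
# rescheduling, the exception captured earlier via sys.exc_info() is
# re-raised with its original traceback so the failure context is not
# lost. A minimal, self-contained version of the same idiom.
import sys

def _capture():
    try:
        raise ValueError('original failure')
    except ValueError:
        return sys.exc_info()

def _reraise(exc_info):
    exc = exc_info[1] or exc_info[0]()
    if exc.__traceback__ is not exc_info[2]:
        raise exc.with_traceback(exc_info[2])
    raise exc

try:
    _reraise(_capture())
except ValueError as err:
    assert str(err) == 'original failure'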
6360 @messaging.expected_exceptions(exception.MigrationPreCheckError)
6361 @wrap_exception()
6362 @wrap_instance_event(prefix='compute')
6363 @wrap_instance_fault
6364 def prep_snapshot_based_resize_at_dest(
6365 self, ctxt, instance, flavor, nodename, migration, limits):
6366 """Performs pre-cross-cell resize resource claim on the dest host.
6368 This runs on the destination host in a cross-cell resize operation
6369 before the resize is actually started.
6371 Performs a resize_claim for resources that are not claimed in placement
6372 like PCI devices and NUMA topology.
6374 Note that this is different from same-cell prep_resize in that this:
6376 * Does not RPC cast to the source compute; that is orchestrated from
6377 conductor.
6378 * Does not reschedule on failure; conductor handles that since it is
6379 synchronously RPC calling this method. As such, the
6380 reverts_task_state decorator is not used on this method.
6382 :param ctxt: user auth request context
6383 :param instance: the instance being resized
6384 :param flavor: the flavor being resized to (unchanged for cold migrate)
6385 :param nodename: Name of the target compute node
6386 :param migration: nova.objects.Migration object for the operation
6387 :param limits: nova.objects.SchedulerLimits object of resource limits
6388 :returns: nova.objects.MigrationContext; the migration context created
6389 on the destination host during the resize_claim.
6390 :raises: nova.exception.MigrationPreCheckError if the pre-check
6391 validation fails for the given host selection
6392 """
6393 LOG.debug('Checking if we can cross-cell migrate instance to this '
6394 'host (%s).', self.host, instance=instance)
6395 self._send_prep_resize_notifications(
6396 ctxt, instance, fields.NotificationPhase.START, flavor)
6397 # TODO(mriedem): update_pci_request_with_placement_allocations
6398 # should be called here if the request spec has request group mappings,
6399 # e.g. for things like QoS ports with resource requests. Do it outside
6400 # the try/except so if it raises BuildAbortException we do not attempt
6401 # to reschedule.
6402 try:
6403 # Get the allocations within the try/except block in case we get
6404 # an error so MigrationPreCheckError is raised up.
6405 allocations = self.reportclient.get_allocs_for_consumer(
6406 ctxt, instance.uuid)['allocations']
6407 # Claim resources on this target host using the new flavor which
6408 # will create the MigrationContext object. Note that in the future
6409 # if we want to do other validation here we should do it within
6410 # the MoveClaim context so we can drop the claim if anything fails.
6411 self.rt.resize_claim(
6412 ctxt, instance, flavor, nodename, migration, allocations,
6413 image_meta=instance.image_meta, limits=limits)
6414 except Exception as ex:
6415 err = str(ex)
6416 LOG.warning(
6417 'Cross-cell resize pre-checks failed for this host (%s). '
6418 'Cleaning up. Failure: %s', self.host, err,
6419 instance=instance, exc_info=True)
6420 raise exception.MigrationPreCheckError(
6421 reason=(_("Pre-checks failed on host '%(host)s'. "
6422 "Error: %(error)s") %
6423 {'host': self.host, 'error': err}))
6424 finally:
6425 self._send_prep_resize_notifications(
6426 ctxt, instance, fields.NotificationPhase.END, flavor)
6428 # ResourceTracker.resize_claim() sets instance.migration_context.
6429 return instance.migration_context
6431 @messaging.expected_exceptions(exception.InstancePowerOffFailure)
6432 @wrap_exception()
6433 @reverts_task_state
6434 @wrap_instance_event(prefix='compute')
6435 @errors_out_migration
6436 @wrap_instance_fault
6437 def prep_snapshot_based_resize_at_source(
6438 self, ctxt, instance, migration, snapshot_id=None):
6439 """Prepares the instance at the source host for cross-cell resize
6441 Performs actions like powering off the guest, uploading snapshot data if
6442 the instance is not volume-backed, disconnecting volumes, unplugging
6443 VIFs and activating the destination host port bindings.
6445 :param ctxt: user auth request context targeted at source cell
6446 :param instance: nova.objects.Instance; the instance being resized.
6447 The expected instance.task_state is "resize_migrating" when calling
6448 this method, and the expected task_state upon successful completion
6449 is "resize_migrated".
6450 :param migration: nova.objects.Migration object for the operation.
6451 The expected migration.status is "pre-migrating" when calling this
6452 method and the expected status upon successful completion is
6453 "post-migrating".
6454 :param snapshot_id: ID of the image snapshot to upload if not a
6455 volume-backed instance
6456 :raises: nova.exception.InstancePowerOffFailure if stopping the
6457 instance fails
6458 """
6459 LOG.info('Preparing for snapshot based resize on source host %s.',
6460 self.host, instance=instance)
6461 # Note that if anything fails here, the migration-based allocations
6462 # created in conductor should be reverted by conductor as well,
6463 # see MigrationTask.rollback.
6464 self._prep_snapshot_based_resize_at_source(
6465 ctxt, instance, migration, snapshot_id=snapshot_id)
6467 @delete_image_on_error
6468 def _snapshot_for_resize(self, ctxt, image_id, instance):
6469 """Uploads snapshot for the instance during a snapshot-based resize
6471 If the snapshot operation fails the image will be deleted.
6473 :param ctxt: the nova auth request context for the resize operation
6474 :param image_id: the snapshot image ID
6475 :param instance: the instance to snapshot/resize
6476 """
6477 LOG.debug('Uploading snapshot data for image %s', image_id,
6478 instance=instance)
6479 # Note that we do not track the snapshot phase task states
6480 # during resize since we do not want to reflect those into the
6481 # actual instance.task_state.
6482 update_task_state = lambda *args, **kwargs: None
6483 with timeutils.StopWatch() as timer:
6484 self.driver.snapshot(ctxt, instance, image_id, update_task_state)
6485 LOG.debug('Took %0.2f seconds to snapshot the instance on '
6486 'the hypervisor.', timer.elapsed(), instance=instance)
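# Illustrative sketch of the delete-image-on-error behaviour described in
# the docstring above: if uploading the snapshot raises, the now useless
# snapshot image is removed so it does not leak in the image service.
# Simplified stand-in for the module's delete_image_on_error decorator;
# the delete_image callable is an assumption for illustration.
import functools

def _delete_image_on_error(delete_image):
    def decorator(func):
        @functools.wraps(func)
        def wrapper(ctxt, image_id, instance, *args, **kwargs):
            try:
                return func(ctxt, image_id, instance, *args, **kwargs)
            except Exception:
                delete_image(ctxt, image_id)   # best-effort cleanup
                raise
        return wrapper
    return decorator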
6488 def _prep_snapshot_based_resize_at_source(
6489 self, ctxt, instance, migration, snapshot_id=None):
6490 """Private method for prep_snapshot_based_resize_at_source so calling
6491 code can handle errors and perform rollbacks as necessary.
6492 """
6493 # Fetch and update the instance.info_cache.
6494 network_info = self.network_api.get_instance_nw_info(ctxt, instance)
6495 # Get the BDMs attached to this instance on this source host.
6496 bdms = instance.get_bdms()
6497 # Send the resize.start notification.
6498 self._send_resize_instance_notifications(
6499 ctxt, instance, bdms, network_info, fields.NotificationPhase.START)
6500 # Update the migration status from "pre-migrating" to "migrating".
6501 migration.status = 'migrating'
6502 migration.save()
6504 # Since the instance is going to be left on the source host during the
6505 # resize, we need to power it off so we do not have the instance
6506 # potentially running in two places.
6507 LOG.debug('Stopping instance', instance=instance)
6508 try:
6509 self._power_off_instance(ctxt, instance)
6510 except Exception as e:
6511 LOG.exception('Failed to power off instance.', instance=instance)
6512 raise exception.InstancePowerOffFailure(reason=str(e))
6513 instance.power_state = self._get_power_state(instance)
6515 # If a snapshot image ID was provided, we need to snapshot the guest
6516 # disk image and upload it to the image service.
6517 if snapshot_id:
6518 self._snapshot_for_resize(ctxt, snapshot_id, instance)
6520 block_device_info = self._get_instance_block_device_info(
6521 ctxt, instance, bdms=bdms)
6523 # If something fails at this point the instance must go to ERROR
6524 # status for operator intervention or to reboot/rebuild the instance.
6525 with self._error_out_instance_on_exception(
6526 ctxt, instance, instance_state=vm_states.ERROR):
6528 # Destroy the guest on the source host which will disconnect
6529 # volumes and unplug VIFs. Note that we DO NOT destroy disks since
6530 # we want to leave those on the source host in case of a later
6531 # failure and disks are needed to recover the guest or in case the
6532 # resize is reverted.
6533 LOG.debug('Destroying guest on source host but retaining disks.',
6534 instance=instance)
6535 self.driver.destroy(
6536 ctxt, instance, network_info,
6537 block_device_info=block_device_info, destroy_disks=False)
6539 # At this point the volumes are disconnected from this source host.
6540 # Delete the old volume attachment records and create new empty
6541 # ones which will be used later if the resize is reverted.
6542 LOG.debug('Deleting volume attachments for the source host.',
6543 instance=instance)
6544 self._terminate_volume_connections(ctxt, instance, bdms)
6546 # At this point the VIFs are unplugged from this source host.
6547 # Activate the dest host port bindings created by conductor.
6548 self.network_api.migrate_instance_start(ctxt, instance, migration)
6550 # Update the migration status from "migrating" to "post-migrating".
6551 migration.status = 'post-migrating'
6552 migration.save()
6554 # At this point, the traditional resize_instance would update the
6555 # instance host/node values to point at the dest host/node because
6556 # that is where the disk is transferred during resize_instance, but
6557 # with cross-cell resize the instance is not yet at the dest host
6558 # so we do not make that update here.
6559 instance.task_state = task_states.RESIZE_MIGRATED
6560 instance.save(expected_task_state=task_states.RESIZE_MIGRATING)
6562 self._send_resize_instance_notifications(
6563 ctxt, instance, bdms, network_info,
6564 fields.NotificationPhase.END)
6565 self.instance_events.clear_events_for_instance(instance)
6567 @wrap_exception()
6568 @reverts_task_state
6569 @wrap_instance_event(prefix='compute')
6570 @wrap_instance_fault
6571 def resize_instance(self, context, instance, image,
6572 migration, flavor, clean_shutdown,
6573 request_spec):
6574 """Starts the migration of a running instance to another host.
6576 This is initiated from the destination host's ``prep_resize`` routine
6577 and runs on the source host.
6578 """
6579 try:
6580 self._resize_instance(context, instance, image, migration,
6581 flavor, clean_shutdown, request_spec)
6582 except Exception:
6583 with excutils.save_and_reraise_exception():
6584 self._revert_allocation(context, instance, migration)
6586 def _resize_instance(
6587 self, context, instance, image, migration, flavor,
6588 clean_shutdown, request_spec,
6589 ):
6590 # Pass instance_state=instance.vm_state because we can resize
6591 # a STOPPED server and we don't want to set it back to ACTIVE
6592 # in case migrate_disk_and_power_off raises InstanceFaultRollback.
6593 instance_state = instance.vm_state
6594 with self._error_out_instance_on_exception(
6595 context, instance, instance_state=instance_state), \
6596 errors_out_migration_ctxt(migration):
6597 network_info = self.network_api.get_instance_nw_info(context,
6598 instance)
6600 migration.status = 'migrating'
6601 migration.save()
6603 instance.task_state = task_states.RESIZE_MIGRATING
6604 instance.save(expected_task_state=task_states.RESIZE_PREP)
6606 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
6607 context, instance.uuid)
6608 self._send_resize_instance_notifications(
6609 context, instance, bdms, network_info,
6610 fields.NotificationPhase.START)
6612 block_device_info = self._get_instance_block_device_info(
6613 context, instance, bdms=bdms)
6615 timeout, retry_interval = self._get_power_off_values(
6616 instance, clean_shutdown)
6617 disk_info = self.driver.migrate_disk_and_power_off(
6618 context, instance, migration.dest_host,
6619 flavor, network_info,
6620 block_device_info,
6621 timeout, retry_interval)
6623 self._terminate_volume_connections(context, instance, bdms)
6625 self.network_api.migrate_instance_start(context,
6626 instance,
6627 migration)
6629 migration.status = 'post-migrating'
6630 migration.save()
6632 instance.host = migration.dest_compute
6633 instance.node = migration.dest_node
6634 # NOTE(danms): This could fail if the dest_compute_id was not set
6635 # by the sending host *and* migration.dest_node does not point to
6636 # a legit node on the dest_compute. Since that would be very bad,
6637 # let the ComputeHostNotFound raise from here.
6638 instance.compute_id = migration.get_dest_compute_id()
6639 # NOTE(gibi): as the instance is now tracked on the destination we
6640 # have to make sure that the source compute resource tracker can
6641 # track this instance as a migration. For that the resource tracker
6642 # needs to see the old_flavor set on the instance. Setting old_flavor
6643 # used to be done on the destination host in finish_resize but that
6644 # is racy with the source host's update_available_resource
6645 # periodic task.
6646 instance.old_flavor = instance.flavor
6647 instance.task_state = task_states.RESIZE_MIGRATED
6648 instance.save(expected_task_state=task_states.RESIZE_MIGRATING)
6650 # RPC cast to the destination host to finish the resize/migration.
6651 self.compute_rpcapi.finish_resize(context, instance,
6652 migration, image, disk_info, migration.dest_compute,
6653 request_spec)
6655 self._send_resize_instance_notifications(
6656 context, instance, bdms, network_info,
6657 fields.NotificationPhase.END)
6658 self.instance_events.clear_events_for_instance(instance)
6660 def _send_resize_instance_notifications(
6661 self, context, instance, bdms, network_info, phase):
6662 """Send "resize.(start|end)" notifications.
6664 :param context: nova auth request context
6665 :param instance: The instance being resized
6666 :param bdms: BlockDeviceMappingList for the BDMs associated with the
6667 instance
6668 :param network_info: NetworkInfo for the instance info cache of ports
6669 :param phase: The phase of the action (NotificationPhase enum, either
6670 ``start`` or ``end``)
6671 """
6672 action = fields.NotificationAction.RESIZE
6673 # Send the legacy unversioned notification.
6674 self._notify_about_instance_usage(
6675 context, instance, "%s.%s" % (action, phase),
6676 network_info=network_info)
6677 # Send the versioned notification.
6678 compute_utils.notify_about_instance_action(
6679 context, instance, self.host, action=action, phase=phase,
6680 bdms=bdms)
6682 def _terminate_volume_connections(self, context, instance, bdms):
6683 connector = None
6684 for bdm in bdms:
6685 if bdm.is_volume:
6686 if bdm.attachment_id:
6687 # NOTE(jdg): So here's the thing: the idea behind the new
6688 # attach APIs was to have a new code fork/path to follow.
6689 # We're not going to do that, so we have to do some extra
6690 # work in here to make this *behave* just like the old code.
6691 # Cinder doesn't allow disconnecting/reconnecting attachments
6692 # in the new attach code (you just delete the attachment and
6693 # get a new one), so we have to create a new attachment
6694 # without a connector (a reserve) and then delete the old
6695 # one. In other words, beware.
6696 attachment_id = self.volume_api.attachment_create(
6697 context, bdm.volume_id, instance.uuid)['id']
6698 self.volume_api.attachment_delete(context,
6699 bdm.attachment_id)
6700 bdm.attachment_id = attachment_id
6701 bdm.save()
6703 else:
6704 if connector is None:
6705 connector = self.driver.get_volume_connector(instance)
6706 self.volume_api.terminate_connection(context,
6707 bdm.volume_id,
6708 connector)
6710 @staticmethod
6711 def _set_instance_info(instance, flavor):
6712 instance.instance_type_id = flavor.id
6713 instance.memory_mb = flavor.memory_mb
6714 instance.vcpus = flavor.vcpus
6715 instance.root_gb = flavor.root_gb
6716 instance.ephemeral_gb = flavor.ephemeral_gb
6717 instance.flavor = flavor
6719 def _update_volume_attachments(self, context, instance, bdms):
6720 """Updates volume attachments using the virt driver host connector.
6722 :param context: nova.context.RequestContext - user request context
6723 :param instance: nova.objects.Instance
6724 :param bdms: nova.objects.BlockDeviceMappingList - the list of block
6725 device mappings for the given instance
6726 """
6727 if bdms:
6728 connector = None
6729 for bdm in bdms:
6730 if bdm.is_volume and bdm.attachment_id:
6731 if connector is None:  # coverage: line 6731 didn't jump to line 6733 because the condition on line 6731 was always true
6732 connector = self.driver.get_volume_connector(instance)
6733 self.volume_api.attachment_update(
6734 context, bdm.attachment_id, connector, bdm.device_name)
6736 def _complete_volume_attachments(self, context, bdms):
6737 """Completes volume attachments for the instance
6739 :param context: nova.context.RequestContext - user request context
6740 :param bdms: nova.objects.BlockDeviceMappingList - the list of block
6741 device mappings for the given instance
6742 """
6743 if bdms:
6744 for bdm in bdms:
6745 if bdm.is_volume and bdm.attachment_id:
6746 self.volume_api.attachment_complete(
6747 context, bdm.attachment_id)
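# Illustrative sketch of the volume attachment lifecycle the three helpers
# above walk through during a move operation:
#   1. _terminate_volume_connections: create a fresh attachment with no
#      connector (a "reserve") and delete the old, connected one;
#   2. _update_volume_attachments: update the reserved attachment with the
#      new host's connector, which populates its connection_info;
#   3. _complete_volume_attachments: mark the attachment complete so the
#      volume goes back to in-use.
# The calls mirror the volume_api usage above, but volume_api and bdm here
# are stand-ins passed in by the caller, not the real Cinder client.
def _move_volume_attachment(volume_api, ctxt, bdm, instance, connector):
    # 1. reserve a new attachment and drop the old one
    new_attachment = volume_api.attachment_create(
        ctxt, bdm.volume_id, instance.uuid)['id']
    volume_api.attachment_delete(ctxt, bdm.attachment_id)
    bdm.attachment_id = new_attachment
    # 2. bind it to the destination host
    volume_api.attachment_update(ctxt, bdm.attachment_id, connector,
                                 bdm.device_name)
    # 3. flip the volume to in-use
    volume_api.attachment_complete(ctxt, bdm.attachment_id)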
6749 def _finish_resize(self, context, instance, migration, disk_info,
6750 image_meta, bdms, request_spec):
6751 resize_instance = False # indicates disks have been resized
6752 old_instance_type_id = migration.old_instance_type_id
6753 new_instance_type_id = migration.new_instance_type_id
6754 old_flavor = instance.flavor # the current flavor is now old
6755 # NOTE(mriedem): Get the old_vm_state so we know if we should
6756 # power on the instance. If old_vm_state is not set we need to default
6757 # to ACTIVE for backwards compatibility
6758 old_vm_state = instance.system_metadata.get('old_vm_state',
6759 vm_states.ACTIVE)
6760 # NOTE(gibi): this is already set by resize_instance on the source
6761 # node before calling finish_resize on the destination, but during an
6762 # upgrade the source node may not yet have the fix for bug 1944759.
6763 # This assignment can be removed in the Z release.
6764 instance.old_flavor = old_flavor
6766 if old_instance_type_id != new_instance_type_id:  # coverage: line 6766 didn't jump to line 6775 because the condition on line 6766 was always true
6767 new_flavor = instance.new_flavor # this is set in _prep_resize
6768 # Set the flavor-related fields on the instance object including
6769 # making instance.flavor = new_flavor.
6770 self._set_instance_info(instance, new_flavor)
6771 for key in ('root_gb', 'swap', 'ephemeral_gb'):
6772 if old_flavor[key] != new_flavor[key]:
6773 resize_instance = True
6774 break
6775 instance.apply_migration_context()
6777 # NOTE(tr3buchet): setup networks on destination host
6778 self.network_api.setup_networks_on_host(context, instance,
6779 migration.dest_compute)
6780 provider_mappings = self._get_request_group_mapping(request_spec)
6782 # For neutron, migrate_instance_finish updates port bindings for this
6783 # host including any PCI devices claimed for SR-IOV ports.
6784 self.network_api.migrate_instance_finish(
6785 context, instance, migration, provider_mappings)
6787 network_info = self.network_api.get_instance_nw_info(context, instance)
6789 instance.task_state = task_states.RESIZE_FINISH
6790 instance.save(expected_task_state=task_states.RESIZE_MIGRATED)
6792 self._send_finish_resize_notifications(
6793 context, instance, bdms, network_info,
6794 fields.NotificationPhase.START)
6796 # We need to update any volume attachments using the destination
6797 # host connector so that we can update the BDM.connection_info
6798 # before calling driver.finish_migration otherwise the driver
6799 # won't know how to connect the volumes to this host.
6800 # Note that _get_instance_block_device_info with
6801 # refresh_conn_info=True will update the BDM.connection_info value
6802 # in the database so we must do this before calling that method.
6803 self._update_volume_attachments(context, instance, bdms)
6805 block_device_info = self._get_instance_block_device_info(
6806 context, instance, refresh_conn_info=True, bdms=bdms)
6808 # NOTE(mriedem): If the original vm_state was STOPPED, we don't
6809 # automatically power on the instance after it's migrated
6810 power_on = old_vm_state != vm_states.STOPPED
6812 # NOTE(sbauza): During a migration, the original allocation is against
6813 # the migration UUID while the target allocation (for the destination
6814 # node) is related to the instance UUID, so here we need to pass the
6815 # new ones.
6816 allocations = self.reportclient.get_allocs_for_consumer(
6817 context, instance.uuid)['allocations']
6819 try:
6820 self.driver.finish_migration(context, migration, instance,
6821 disk_info,
6822 network_info,
6823 image_meta, resize_instance,
6824 allocations,
6825 block_device_info, power_on)
6826 except Exception:
6827 # Note that we do not rollback port bindings to the source host
6828 # because resize_instance (on the source host) updated the
6829 # instance.host to point to *this* host (the destination host)
6830 # so the port bindings pointing at this host are correct even
6831 # though we failed to create the guest.
6832 with excutils.save_and_reraise_exception():
6833 # If we failed to create the guest on this host, reset the
6834 # instance flavor-related fields to the old flavor. An
6835 # error handler like reverts_task_state will save the changes.
6836 if old_instance_type_id != new_instance_type_id:  # coverage: line 6836 didn't jump to line 6840
6837 self._set_instance_info(instance, old_flavor)
6839 # Now complete any volume attachments that were previously updated.
6840 self._complete_volume_attachments(context, bdms)
6842 migration.status = 'finished'
6843 migration.save()
6845 instance.vm_state = vm_states.RESIZED
6846 instance.task_state = None
6847 instance.launched_at = timeutils.utcnow()
6848 instance.save(expected_task_state=task_states.RESIZE_FINISH)
6850 return network_info
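# Illustrative sketch of the resize_instance flag computed near the top of
# _finish_resize above: disks only need to be resized when one of the
# disk-related flavor fields changed between the old and new flavor. The
# dicts below stand in for Flavor objects for illustration only.
def _needs_disk_resize(old_flavor, new_flavor):
    return any(old_flavor[key] != new_flavor[key]
               for key in ('root_gb', 'swap', 'ephemeral_gb'))

assert _needs_disk_resize(
    {'root_gb': 20, 'swap': 0, 'ephemeral_gb': 0},
    {'root_gb': 40, 'swap': 0, 'ephemeral_gb': 0})
assert not _needs_disk_resize(
    {'root_gb': 20, 'swap': 0, 'ephemeral_gb': 0},
    {'root_gb': 20, 'swap': 0, 'ephemeral_gb': 0})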
6852 @wrap_exception()
6853 @reverts_task_state
6854 @wrap_instance_event(prefix='compute')
6855 @errors_out_migration
6856 @wrap_instance_fault
6857 def finish_resize(self, context, disk_info, image, instance,
6858 migration, request_spec):
6859 """Completes the migration process.
6861 Sets up the newly transferred disk and turns on the instance at its
6862 new host machine.
6864 """
6865 try:
6866 self._finish_resize_helper(context, disk_info, image, instance,
6867 migration, request_spec)
6868 except Exception:
6869 with excutils.save_and_reraise_exception():
6870 # At this point, resize_instance (which runs on the source) has
6871 # already updated the instance host/node values to point to
6872 # this (the dest) compute, so we need to leave the allocations
6873 # against the dest node resource provider intact and drop the
6874 # allocations against the source node resource provider. If the
6875 # user tries to recover the server by hard rebooting it, it
6876 # will happen on this host so that's where the allocations
6877 # should go. Note that this is the same method called from
6878 # confirm_resize to cleanup the source node allocations held
6879 # by the migration record.
6880 LOG.info('Deleting allocations for old flavor on source node '
6881 '%s after finish_resize failure. You may be able to '
6882 'recover the instance by hard rebooting it.',
6883 migration.source_compute, instance=instance)
6884 self._delete_allocation_after_move(
6885 context, instance, migration)
6887 def _finish_resize_helper(self, context, disk_info, image, instance,
6888 migration, request_spec):
6889 """Completes the migration process.
6891 The caller must revert the instance's allocations if the migration
6892 process failed.
6893 """
6894 bdms = self._update_bdm_for_swap_to_finish_resize(context, instance)
6896 with self._error_out_instance_on_exception(context, instance):
6897 image_meta = objects.ImageMeta.from_dict(image)
6898 network_info = self._finish_resize(context, instance, migration,
6899 disk_info, image_meta, bdms,
6900 request_spec)
6902 # TODO(melwitt): We should clean up instance console tokens here. The
6903 # instance is on a new host and will need to establish a new console
6904 # connection.
6905 self._update_scheduler_instance_info(context, instance)
6906 self._send_finish_resize_notifications(
6907 context, instance, bdms, network_info,
6908 fields.NotificationPhase.END)
6910 def _send_finish_resize_notifications(
6911 self, context, instance, bdms, network_info, phase):
6912 """Send notifications for the finish_resize flow.
6914 :param context: nova auth request context
6915 :param instance: The instance being resized
6916 :param bdms: BlockDeviceMappingList for the BDMs associated with the
6917 instance
6918 :param network_info: NetworkInfo for the instance info cache of ports
6919 :param phase: The phase of the action (NotificationPhase enum, either
6920 ``start`` or ``end``)
6921 """
6922 # Send the legacy unversioned notification.
6923 self._notify_about_instance_usage(
6924 context, instance, "finish_resize.%s" % phase,
6925 network_info=network_info)
6926 # Send the versioned notification.
6927 compute_utils.notify_about_instance_action(
6928 context, instance, self.host,
6929 action=fields.NotificationAction.RESIZE_FINISH, phase=phase,
6930 bdms=bdms)
6932 @wrap_exception()
6933 @reverts_task_state
6934 @wrap_instance_event(prefix='compute')
6935 @errors_out_migration
6936 @wrap_instance_fault
6937 def finish_snapshot_based_resize_at_dest(
6938 self, ctxt, instance, migration, snapshot_id):
6939 """Finishes the snapshot-based resize at the destination compute.
6941 Sets up block devices and networking on the destination compute and
6942 spawns the guest.
6944 :param ctxt: nova auth request context targeted at the target cell DB
6945 :param instance: The Instance object being resized with the
6946 ``migration_context`` field set. Upon successful completion of this
6947 method the vm_state should be "resized", the task_state should be
6948 None, and migration context, host/node and flavor-related fields
6949 should be set on the instance.
6950 :param migration: The Migration object for this resize operation. Upon
6951 successful completion of this method the migration status should
6952 be "finished".
6953 :param snapshot_id: ID of the image snapshot created for a
6954 non-volume-backed instance, else None.
6955 """
6956 LOG.info('Finishing snapshot based resize on destination host %s.',
6957 self.host, instance=instance)
6958 with self._error_out_instance_on_exception(ctxt, instance):
6959 # Note that if anything fails here, the migration-based allocations
6960 # created in conductor should be reverted by conductor as well,
6961 # see MigrationTask.rollback.
6962 self._finish_snapshot_based_resize_at_dest(
6963 ctxt, instance, migration, snapshot_id)
6965 def _finish_snapshot_based_resize_at_dest(
6966 self, ctxt, instance, migration, snapshot_id):
6967 """Private variant of finish_snapshot_based_resize_at_dest so the
6968 caller can handle reverting resource allocations on failure and perform
6969 other generic error handling.
6970 """
6971 # Figure out the image metadata to use when spawning the guest.
6972 origin_image_ref = instance.image_ref
6973 if snapshot_id:
6974 instance.image_ref = snapshot_id
6975 image_meta = objects.ImageMeta.from_image_ref(
6976 ctxt, self.image_api, snapshot_id)
6977 else:
6978 # Just use what is already on the volume-backed instance.
6979 image_meta = instance.image_meta
6981 resize = migration.migration_type == 'resize'
6982 instance.old_flavor = instance.flavor
6983 if resize:  # coverage: line 6983 didn't jump to line 6991 because the condition on line 6983 was always true
6984 flavor = instance.new_flavor
6985 # If we are resizing to a new flavor we need to set the
6986 # flavor-related fields on the instance.
6987 # NOTE(mriedem): This is likely where storing old/new_flavor on
6988 # the MigrationContext would make this cleaner.
6989 self._set_instance_info(instance, flavor)
6991 instance.apply_migration_context()
6992 instance.task_state = task_states.RESIZE_FINISH
6993 instance.save(expected_task_state=task_states.RESIZE_MIGRATED)
6995 # This seems a bit late to be sending the start notification but
6996 # it is what traditional resize has always done as well and it does
6997 # contain the changes to the instance with the new_flavor and
6998 # task_state.
6999 bdms = instance.get_bdms()
7000 network_info = instance.get_network_info()
7001 self._send_finish_resize_notifications(
7002 ctxt, instance, bdms, network_info,
7003 fields.NotificationPhase.START)
7005 # Setup volumes and networking and spawn the guest in the hypervisor.
7006 self._finish_snapshot_based_resize_at_dest_spawn(
7007 ctxt, instance, migration, image_meta, bdms)
7009 # If we spawned from a temporary snapshot image we can delete that now,
7010 # similar to how unshelve works.
7011 if snapshot_id:
7012 instance.image_ref = origin_image_ref
7013 compute_utils.delete_image(
7014 ctxt, instance, self.image_api, snapshot_id)
7016 migration.status = 'finished'
7017 migration.save()
7019 self._update_instance_after_spawn(instance, vm_state=vm_states.RESIZED)
7020 # Setting the host/node values will make the ResourceTracker continue
7021 # to track usage for this instance on this host.
7022 instance.host = migration.dest_compute
7023 # NOTE(danms): This could fail if we somehow ended up on the wrong
7024 # host without the desired node. That's a big problem, so let the
7025 # ComputeHostNotFound raise from here.
7026 cn = self.rt.get_node_by_name(migration.dest_node)
7027 instance.compute_id = cn.id
7028 instance.node = cn.hypervisor_hostname
7029 instance.save(expected_task_state=task_states.RESIZE_FINISH)
7031 # Broadcast to all schedulers that the instance is on this host.
7032 self._update_scheduler_instance_info(ctxt, instance)
7033 self._send_finish_resize_notifications(
7034 ctxt, instance, bdms, network_info,
7035 fields.NotificationPhase.END)
7037 def _finish_snapshot_based_resize_at_dest_spawn(
7038 self, ctxt, instance, migration, image_meta, bdms):
7039 """Sets up volumes and networking and spawns the guest on the dest host
7041 If the instance was stopped when the resize was initiated the guest
7042 will be created but remain in a shutdown power state.
7044 If the spawn fails, port bindings are rolled back to the source host
7045 and volume connections are terminated for this dest host.
7047 :param ctxt: nova auth request context
7048 :param instance: Instance object being migrated
7049 :param migration: Migration object for the operation
7050 :param image_meta: ImageMeta object used during driver.spawn
7051 :param bdms: BlockDeviceMappingList of BDMs for the instance
7052 """
7053 # Update the volume attachments using this host's connector.
7054 # That will update the BlockDeviceMapping.connection_info which
7055 # will be used to connect the volumes on this host during spawn().
7056 block_device_info = self._prep_block_device(ctxt, instance, bdms)
7058 allocations = self.reportclient.get_allocations_for_consumer(
7059 ctxt, instance.uuid)
7061 # We do not call self.network_api.setup_networks_on_host here because
7062 # for neutron that sets up the port migration profile which is only
7063 # used during live migration with DVR. Yes it is gross knowing what
7064 # that method does internally. We could change this when bug 1814837
7065 # is fixed if setup_networks_on_host is made smarter by passing the
7066 # migration record and the method checks the migration_type.
7068 # Activate the port bindings for this host.
7069 # FIXME(mriedem): We're going to have the same issue as bug 1813789
7070 # here because this will update the port bindings and send the
7071 # network-vif-plugged event and that means when driver.spawn waits for
7072 # it we might have already gotten the event and neutron won't send
7073 # another one so we could timeout.
7074 # TODO(mriedem): Calculate provider mappings when we support cross-cell
7075 # resize/migrate with ports having resource requests.
7076 self.network_api.migrate_instance_finish(
7077 ctxt, instance, migration, provider_mappings=None)
7078 network_info = self.network_api.get_instance_nw_info(ctxt, instance)
7080 # If the original vm_state was STOPPED, we do not automatically
7081 # power on the instance after it is migrated.
7082 power_on = instance.system_metadata['old_vm_state'] == vm_states.ACTIVE
7083 try:
7084 # NOTE(mriedem): If this instance uses a config drive, it will get
7085 # rebuilt here which means any personality files will be lost,
7086 # similar to unshelve. If the instance is not using a config drive
7087 # and getting metadata from the metadata API service, personality
7088 # files would be lost regardless of the move operation.
7089 self.driver.spawn(
7090 ctxt, instance, image_meta, injected_files=[],
7091 admin_password=None, allocations=allocations,
7092 network_info=network_info, block_device_info=block_device_info,
7093 power_on=power_on)
7094 except Exception:
7095 with excutils.save_and_reraise_exception(logger=LOG):
7096 # Rollback port bindings to the source host.
7097 try:
7098 # This is gross but migrate_instance_start looks at the
7099 # migration.dest_compute to determine where to activate the
7100 # port bindings and we want the source compute port
7101 # bindings to be re-activated. Remember at this point the
7102 # instance.host is still pointing at the source compute.
7103 # TODO(mriedem): Maybe we should be calling
7104 # setup_instance_network_on_host here to deal with pci
7105 # devices?
7106 with utils.temporary_mutation(
7107 migration, dest_compute=migration.source_compute):
7108 self.network_api.migrate_instance_start(
7109 ctxt, instance, migration)
7110 except Exception:
7111 LOG.exception(
7112 'Failed to activate port bindings on the source '
7113 'host: %s', migration.source_compute,
7114 instance=instance)
7116 # Rollback volume connections on this host.
7117 for bdm in bdms:
7118 if bdm.is_volume:
7119 try:
7120 self._remove_volume_connection(
7121 ctxt, bdm, instance, delete_attachment=True)
7122 except Exception:
7123 LOG.exception('Failed to remove volume connection '
7124 'on this host %s for volume %s.',
7125 self.host, bdm.volume_id,
7126 instance=instance)
7128 @wrap_exception()
7129 @wrap_instance_fault
7130 def add_fixed_ip_to_instance(self, context, network_id, instance):
7131 """Calls network_api to add new fixed_ip to instance
7132 then injects the new network info and resets instance networking.
7134 """
7135 self._notify_about_instance_usage(
7136 context, instance, "create_ip.start")
7138 network_info = self.network_api.add_fixed_ip_to_instance(context,
7139 instance,
7140 network_id)
7141 self._inject_network_info(instance, network_info)
7143 # NOTE(russellb) We just want to bump updated_at. See bug 1143466.
7144 instance.updated_at = timeutils.utcnow()
7145 instance.save()
7147 self._notify_about_instance_usage(
7148 context, instance, "create_ip.end", network_info=network_info)
7150 @wrap_exception()
7151 @wrap_instance_fault
7152 def remove_fixed_ip_from_instance(self, context, address, instance):
7153 """Calls network_api to remove existing fixed_ip from instance
7154 by injecting the altered network info and resetting
7155 instance networking.
7156 """
7157 self._notify_about_instance_usage(
7158 context, instance, "delete_ip.start")
7160 network_info = self.network_api.remove_fixed_ip_from_instance(context,
7161 instance,
7162 address)
7163 self._inject_network_info(instance, network_info)
7165 # NOTE(russellb) We just want to bump updated_at. See bug 1143466.
7166 instance.updated_at = timeutils.utcnow()
7167 instance.save()
7169 self._notify_about_instance_usage(
7170 context, instance, "delete_ip.end", network_info=network_info)
7172 @wrap_exception()
7173 @reverts_task_state
7174 @wrap_instance_event(prefix='compute')
7175 @wrap_instance_fault
7176 def pause_instance(self, context, instance):
7177 """Pause an instance on this host."""
7178 context = context.elevated()
7179 LOG.info('Pausing', instance=instance)
7180 self._notify_about_instance_usage(context, instance, 'pause.start')
7181 compute_utils.notify_about_instance_action(context, instance,
7182 self.host, action=fields.NotificationAction.PAUSE,
7183 phase=fields.NotificationPhase.START)
7184 self.driver.pause(instance)
7185 instance.power_state = self._get_power_state(instance)
7186 instance.vm_state = vm_states.PAUSED
7187 instance.task_state = None
7188 instance.save(expected_task_state=task_states.PAUSING)
7189 self._notify_about_instance_usage(context, instance, 'pause.end')
7190 compute_utils.notify_about_instance_action(context, instance,
7191 self.host, action=fields.NotificationAction.PAUSE,
7192 phase=fields.NotificationPhase.END)
7194 @wrap_exception()
7195 @reverts_task_state
7196 @wrap_instance_event(prefix='compute')
7197 @wrap_instance_fault
7198 def unpause_instance(self, context, instance):
7199 """Unpause a paused instance on this host."""
7200 context = context.elevated()
7201 LOG.info('Unpausing', instance=instance)
7202 self._notify_about_instance_usage(context, instance, 'unpause.start')
7203 compute_utils.notify_about_instance_action(context, instance,
7204 self.host, action=fields.NotificationAction.UNPAUSE,
7205 phase=fields.NotificationPhase.START)
7206 self.driver.unpause(instance)
7207 instance.power_state = self._get_power_state(instance)
7208 instance.vm_state = vm_states.ACTIVE
7209 instance.task_state = None
7210 instance.save(expected_task_state=task_states.UNPAUSING)
7211 self._notify_about_instance_usage(context, instance, 'unpause.end')
7212 compute_utils.notify_about_instance_action(context, instance,
7213 self.host, action=fields.NotificationAction.UNPAUSE,
7214 phase=fields.NotificationPhase.END)
7216 @wrap_exception()
7217 def host_power_action(self, context, action):
7218 """Reboots, shuts down or powers up the host."""
7219 return self.driver.host_power_action(action)
7221 @wrap_exception()
7222 def host_maintenance_mode(self, context, host, mode):
7223 """Start/Stop host maintenance window. On start, it triggers
7224 guest VMs evacuation.
7225 """
7226 return self.driver.host_maintenance_mode(host, mode)
7228 def _update_compute_provider_status(self, context, enabled):
7229 """Adds or removes the COMPUTE_STATUS_DISABLED trait for this host.
7231 For each ComputeNode managed by this service, adds or removes the
7232 COMPUTE_STATUS_DISABLED trait to/from the associated resource provider
7233 in Placement.
7235 :param context: nova auth RequestContext
7236 :param enabled: True if the node is enabled in which case the trait
7237 would be removed, False if the node is disabled in which case
7238 the trait would be added.
7239 :raises: ComputeHostNotFound if there are no compute nodes found in
7240 the ResourceTracker for this service.
7241 """
7242 # Get the compute node(s) on this host. Remember that ironic can be
7243 # managing more than one compute node.
7244 nodes = self.rt.compute_nodes.values()
7245 if not nodes:
7246 raise exception.ComputeHostNotFound(host=self.host)
7247 # For each node, we want to add (or remove) the COMPUTE_STATUS_DISABLED
7248 # trait on the related resource provider in placement so the scheduler
7249 # (pre-)filters the provider based on its status.
7250 for node in nodes:
7251 try:
7252 self.virtapi.update_compute_provider_status(
7253 context, node.uuid, enabled)
7254 except (exception.ResourceProviderTraitRetrievalFailed,
7255 exception.ResourceProviderUpdateConflict,
7256 exception.ResourceProviderUpdateFailed,
7257 exception.TraitRetrievalFailed) as e:
7258 # This is best effort so just log a warning and continue.
7259 LOG.warning('An error occurred while updating '
7260 'COMPUTE_STATUS_DISABLED trait on compute node '
7261 'resource provider %s. The trait will be '
7262 'synchronized when the update_available_resource '
7263 'periodic task runs. Error: %s',
7264 node.uuid, e.format_message())
7265 except Exception:
7266 LOG.exception('An error occurred while updating '
7267 'COMPUTE_STATUS_DISABLED trait on compute node '
7268 'resource provider %s. The trait will be '
7269 'synchronized when the '
7270 'update_available_resource periodic task runs.',
7271 node.uuid)
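# Summary of the toggle performed in the loop above (illustrative note, not
# part of the original source): for every compute node resource provider
# managed by this host,
#   enabled=True  -> the COMPUTE_STATUS_DISABLED trait is removed
#   enabled=False -> the COMPUTE_STATUS_DISABLED trait is added
# so the scheduler pre-filter can skip disabled providers.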
7273 @wrap_exception()
7274 def set_host_enabled(self, context, enabled):
7275 """Sets the specified host's ability to accept new instances.
7277 This method will add or remove the COMPUTE_STATUS_DISABLED trait
7278 to/from the associated compute node resource provider(s) for this
7279 compute service.
7280 """
7281 try:
7282 self._update_compute_provider_status(context, enabled)
7283 except exception.ComputeHostNotFound:
7284 LOG.warning('Unable to add/remove trait COMPUTE_STATUS_DISABLED. '
7285 'No ComputeNode(s) found for host: %s', self.host)
7287 try:
7288 return self.driver.set_host_enabled(enabled)
7289 except NotImplementedError:
7290 # Only the xenapi driver implements set_host_enabled but we don't
7291 # want NotImplementedError to get raised back to the API. We still
7292 # need to honor the compute RPC API contract and return 'enabled'
7293 # or 'disabled' though.
7294 return 'enabled' if enabled else 'disabled'
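# Illustrative RPC contract for set_host_enabled (sketch with hypothetical
# arguments, not part of the original source): the caller always receives a
# string, e.g.
#   self.set_host_enabled(ctxt, True)   # -> 'enabled'
#   self.set_host_enabled(ctxt, False)  # -> 'disabled'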
7296 @wrap_exception()
7297 def get_host_uptime(self, context):
7298 """Returns the result of calling "uptime" on the target host."""
7299 return self.driver.get_host_uptime()
7301 @wrap_exception()
7302 @wrap_instance_fault
7303 def get_diagnostics(self, context, instance):
7304 """Retrieve diagnostics for an instance on this host."""
7305 current_power_state = self._get_power_state(instance)
7306 if current_power_state == power_state.RUNNING: 7306 ↛ 7310 line 7306 didn't jump to line 7310 because the condition on line 7306 was always true
7307 LOG.info("Retrieving diagnostics", instance=instance)
7308 return self.driver.get_diagnostics(instance)
7309 else:
7310 raise exception.InstanceInvalidState(
7311 attr='power state',
7312 instance_uuid=instance.uuid,
7313 state=power_state.STATE_MAP[instance.power_state],
7314 method='get_diagnostics')
7316 @wrap_exception()
7317 @wrap_instance_fault
7318 def get_instance_diagnostics(self, context, instance):
7319 """Retrieve diagnostics for an instance on this host."""
7320 current_power_state = self._get_power_state(instance)
7321 if current_power_state == power_state.RUNNING: 7321 ↛ 7325 line 7321 didn't jump to line 7325 because the condition on line 7321 was always true
7322 LOG.info("Retrieving diagnostics", instance=instance)
7323 return self.driver.get_instance_diagnostics(instance)
7324 else:
7325 raise exception.InstanceInvalidState(
7326 attr='power state',
7327 instance_uuid=instance.uuid,
7328 state=power_state.STATE_MAP[instance.power_state],
7329 method='get_instance_diagnostics')
7331 @wrap_exception()
7332 @reverts_task_state
7333 @wrap_instance_event(prefix='compute')
7334 @wrap_instance_fault
7335 def suspend_instance(self, context, instance):
7336 """Suspend the given instance."""
7337 context = context.elevated()
7339 # Store the old state
7340 instance.system_metadata['old_vm_state'] = instance.vm_state
7341 self._notify_about_instance_usage(context, instance, 'suspend.start')
7342 compute_utils.notify_about_instance_action(context, instance,
7343 self.host, action=fields.NotificationAction.SUSPEND,
7344 phase=fields.NotificationPhase.START)
7345 with self._error_out_instance_on_exception(context, instance,
7346 instance_state=instance.vm_state):
7347 self.driver.suspend(context, instance)
7348 instance.power_state = self._get_power_state(instance)
7349 instance.vm_state = vm_states.SUSPENDED
7350 instance.task_state = None
7351 instance.save(expected_task_state=task_states.SUSPENDING)
7352 self._notify_about_instance_usage(context, instance, 'suspend.end')
7353 compute_utils.notify_about_instance_action(context, instance,
7354 self.host, action=fields.NotificationAction.SUSPEND,
7355 phase=fields.NotificationPhase.END)
7357 @wrap_exception()
7358 @reverts_task_state
7359 @wrap_instance_event(prefix='compute')
7360 @wrap_instance_fault
7361 def resume_instance(self, context, instance):
7362 """Resume the given suspended instance."""
7363 context = context.elevated()
7364 LOG.info('Resuming', instance=instance)
7366 self._notify_about_instance_usage(context, instance, 'resume.start')
7368 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
7369 context, instance.uuid)
7370 block_device_info = self._get_instance_block_device_info(
7371 context, instance, bdms=bdms)
7373 # This allows passing share_info to the resume operation for
7374 # future use. However, this scenario is currently not possible
7375 # because suspending an instance with a share is not permitted
7376 # by libvirt. As a result, the suspend action involving a share
7377 # is blocked by the API.
7378 share_info = self._get_share_info(context, instance)
7380 compute_utils.notify_about_instance_action(context, instance,
7381 self.host, action=fields.NotificationAction.RESUME,
7382 phase=fields.NotificationPhase.START, bdms=bdms)
7384 network_info = self.network_api.get_instance_nw_info(context, instance)
7386 with self._error_out_instance_on_exception(context, instance,
7387 instance_state=instance.vm_state):
7388 self.driver.resume(context, instance, network_info,
7389 block_device_info, share_info)
7391 instance.power_state = self._get_power_state(instance)
7393 # We default to the ACTIVE state for backwards compatibility
7394 instance.vm_state = instance.system_metadata.pop('old_vm_state',
7395 vm_states.ACTIVE)
7397 instance.task_state = None
7398 instance.save(expected_task_state=task_states.RESUMING)
7399 self._notify_about_instance_usage(context, instance, 'resume.end')
7400 compute_utils.notify_about_instance_action(context, instance,
7401 self.host, action=fields.NotificationAction.RESUME,
7402 phase=fields.NotificationPhase.END, bdms=bdms)
7404 @wrap_exception()
7405 @reverts_task_state
7406 @wrap_instance_event(prefix='compute')
7407 @wrap_instance_fault
7408 def shelve_instance(self, context, instance, image_id,
7409 clean_shutdown, accel_uuids):
7410 """Shelve an instance.
7412 This should be used when you want to take a snapshot of the instance.
7413 It also adds system_metadata that can be used by a periodic task to
7414 offload the shelved instance after a period of time.
7416 :param context: request context
7417 :param instance: an Instance object
7418 :param image_id: an image id to snapshot to.
7419 :param clean_shutdown: give the GuestOS a chance to stop
7420 :param accel_uuids: the accelerators uuids for the instance
7421 """
7423 @utils.synchronized(instance.uuid)
7424 def do_shelve_instance():
7425 self._shelve_instance(context, instance, image_id, clean_shutdown,
7426 accel_uuids)
7427 do_shelve_instance()
7429 def _shelve_instance(self, context, instance, image_id,
7430 clean_shutdown, accel_uuids=None):
7431 LOG.info('Shelving', instance=instance)
7432 offload = CONF.shelved_offload_time == 0
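# Illustrative note (not part of the original source): shelved_offload_time
# of 0 means the instance is offloaded from the hypervisor immediately after
# the shelve snapshot completes (see the _shelve_offload_instance call at the
# end of this method); any other value defers offloading, which is not
# handled here.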
7433 if offload:
7434 # Get the BDMs early so we can pass them into versioned
7435 # notifications since _shelve_offload_instance needs the
7436 # BDMs anyway.
7437 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
7438 context, instance.uuid)
7439 else:
7440 bdms = None
7441 compute_utils.notify_usage_exists(self.notifier, context, instance,
7442 self.host, current_period=True)
7443 self._notify_about_instance_usage(context, instance, 'shelve.start')
7444 compute_utils.notify_about_instance_action(context, instance,
7445 self.host, action=fields.NotificationAction.SHELVE,
7446 phase=fields.NotificationPhase.START, bdms=bdms)
7448 def update_task_state(task_state, expected_state=task_states.SHELVING):
7449 shelving_state_map = {
7450 task_states.IMAGE_PENDING_UPLOAD:
7451 task_states.SHELVING_IMAGE_PENDING_UPLOAD,
7452 task_states.IMAGE_UPLOADING:
7453 task_states.SHELVING_IMAGE_UPLOADING,
7454 task_states.SHELVING: task_states.SHELVING}
7455 task_state = shelving_state_map[task_state]
7456 expected_state = shelving_state_map[expected_state]
7457 instance.task_state = task_state
7458 instance.save(expected_task_state=expected_state)
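# Worked example of the mapping applied by update_task_state above
# (illustrative, not part of the original source): when the driver reports
# IMAGE_PENDING_UPLOAD during the snapshot, the instance task_state is saved
# as SHELVING_IMAGE_PENDING_UPLOAD with SHELVING as the expected prior state.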
7459 # Do not attempt a clean shutdown of a paused guest since some
7460 # hypervisors will fail the clean shutdown if the guest is not
7461 # running.
7462 if instance.power_state == power_state.PAUSED:
7463 clean_shutdown = False
7464 self._power_off_instance(context, instance, clean_shutdown)
7465 self.driver.snapshot(context, instance, image_id, update_task_state)
7467 instance.system_metadata['shelved_at'] = timeutils.utcnow().isoformat()
7468 instance.system_metadata['shelved_image_id'] = image_id
7469 instance.system_metadata['shelved_host'] = self.host
7470 instance.vm_state = vm_states.SHELVED
7471 instance.task_state = None
7472 if offload:
7473 instance.task_state = task_states.SHELVING_OFFLOADING
7474 instance.power_state = self._get_power_state(instance)
7475 instance.save(expected_task_state=[
7476 task_states.SHELVING,
7477 task_states.SHELVING_IMAGE_UPLOADING])
7479 self._notify_about_instance_usage(context, instance, 'shelve.end')
7480 compute_utils.notify_about_instance_action(context, instance,
7481 self.host, action=fields.NotificationAction.SHELVE,
7482 phase=fields.NotificationPhase.END, bdms=bdms)
7484 if offload:
7485 self._shelve_offload_instance(
7486 context, instance, clean_shutdown=False, bdms=bdms,
7487 accel_uuids=accel_uuids)
7489 @wrap_exception()
7490 @reverts_task_state
7491 @wrap_instance_event(prefix='compute')
7492 @wrap_instance_fault
7493 def shelve_offload_instance(self, context, instance, clean_shutdown,
7494 accel_uuids):
7495 """Remove a shelved instance from the hypervisor.
7497 This frees up those resources for use by other instances, but may lead
7498 to slower unshelve times for this instance. This method is used by
7499 volume backed instances since restoring them doesn't involve the
7500 potentially large download of an image.
7502 :param context: request context
7503 :param instance: nova.objects.instance.Instance
7504 :param clean_shutdown: give the GuestOS a chance to stop
7505 :param accel_uuids: the accelerators uuids for the instance
7506 """
7508 @utils.synchronized(instance.uuid)
7509 def do_shelve_offload_instance():
7510 self._shelve_offload_instance(context, instance, clean_shutdown,
7511 accel_uuids=accel_uuids)
7512 do_shelve_offload_instance()
7514 def _shelve_offload_instance(self, context, instance, clean_shutdown,
7515 bdms=None, accel_uuids=None):
7516 LOG.info('Shelve offloading', instance=instance)
7517 if bdms is None:
7518 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
7519 context, instance.uuid)
7520 self._notify_about_instance_usage(context, instance,
7521 'shelve_offload.start')
7522 compute_utils.notify_about_instance_action(context, instance,
7523 self.host, action=fields.NotificationAction.SHELVE_OFFLOAD,
7524 phase=fields.NotificationPhase.START, bdms=bdms)
7526 self._power_off_instance(context, instance, clean_shutdown)
7527 current_power_state = self._get_power_state(instance)
7528 network_info = self.network_api.get_instance_nw_info(context, instance)
7530 ports_id = [vif['id'] for vif in network_info]
7531 self.network_api.unbind_ports(context, ports_id, detach=False)
7533 block_device_info = self._get_instance_block_device_info(context,
7534 instance,
7535 bdms=bdms)
7536 self.driver.destroy(context, instance, network_info,
7537 block_device_info)
7539 # the instance is going to be removed from the host so we want to
7540 # terminate all the connections with the volume server and the host
7541 self._terminate_volume_connections(context, instance, bdms)
7543 # NOTE(brinzhang): Free up the accelerator resource occupied
7544 # in the cyborg service.
7545 if accel_uuids:
7546 cyclient = cyborg.get_client(context)
7547 cyclient.delete_arqs_for_instance(instance.uuid)
7549 # Free up the resource allocations in the placement service.
7550 # This should happen *before* the vm_state is changed to
7551 # SHELVED_OFFLOADED in case client-side code is polling the API to
7552 # schedule more instances (or unshelve) once this server is offloaded.
7553 self.rt.delete_allocation_for_shelve_offloaded_instance(context,
7554 instance)
7556 instance.power_state = current_power_state
7557 # NOTE(mriedem): The vm_state has to be set before updating the
7558 # resource tracker, see vm_states.allow_resource_removal(). The
7559 # host/node values cannot be nulled out until after updating the
7560 # resource tracker though.
7561 instance.vm_state = vm_states.SHELVED_OFFLOADED
7562 instance.task_state = None
7563 instance.save(expected_task_state=[task_states.SHELVING,
7564 task_states.SHELVING_OFFLOADING])
7566 # NOTE(ndipanov): Free resources from the resource tracker
7567 self._update_resource_tracker(context, instance)
7569 # NOTE(sfinucan): RPC calls should no longer be attempted against this
7570 # instance, so ensure any calls result in errors
7571 self._nil_out_instance_obj_host_and_node(instance)
7572 instance.save(expected_task_state=None)
7574 # TODO(melwitt): We should clean up instance console tokens here. The
7575 # instance has no host at this point and will need to establish a new
7576 # console connection in the future after it is unshelved.
7577 self._delete_scheduler_instance_info(context, instance.uuid)
7578 self._notify_about_instance_usage(context, instance,
7579 'shelve_offload.end')
7580 compute_utils.notify_about_instance_action(context, instance,
7581 self.host, action=fields.NotificationAction.SHELVE_OFFLOAD,
7582 phase=fields.NotificationPhase.END, bdms=bdms)
7584 @wrap_exception()
7585 @reverts_task_state
7586 @wrap_instance_event(prefix='compute')
7587 @wrap_instance_fault
7588 def unshelve_instance(self, context, instance, image,
7589 filter_properties, node, request_spec, accel_uuids):
7590 """Unshelve the instance.
7592 :param context: request context
7593 :param instance: a nova.objects.instance.Instance object
7594 :param image: an image to build from. If None we assume a
7595 volume backed instance.
7596 :param filter_properties: dict containing limits, retry info etc.
7597 :param node: target compute node
7598 :param request_spec: the RequestSpec object used to schedule the
7599 instance
7600 :param accel_uuids: the accelerators uuids for the instance
7601 """
7602 if filter_properties is None: 7602 ↛ 7603 line 7602 didn't jump to line 7603 because the condition on line 7602 was never true
7603 filter_properties = {}
7605 @utils.synchronized(instance.uuid)
7606 def do_unshelve_instance():
7607 self._unshelve_instance(
7608 context, instance, image, filter_properties, node,
7609 request_spec, accel_uuids)
7610 do_unshelve_instance()
7612 def _unshelve_instance_key_scrub(self, instance):
7613 """Remove data from the instance that may cause side effects."""
7614 cleaned_keys = dict(
7615 key_data=instance.key_data,
7616 auto_disk_config=instance.auto_disk_config)
7617 instance.key_data = None
7618 instance.auto_disk_config = False
7619 return cleaned_keys
7621 def _unshelve_instance_key_restore(self, instance, keys):
7622 """Restore previously scrubbed keys before saving the instance."""
7623 instance.update(keys)
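# Illustrative round trip for the two helpers above (sketch with hypothetical
# values, not part of the original source):
#   keys = self._unshelve_instance_key_scrub(instance)
#   # instance.key_data is now None and instance.auto_disk_config is False
#   ... spawn the guest ...
#   self._unshelve_instance_key_restore(instance, keys)
#   # the original key_data / auto_disk_config values are back on the object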
7625 def _unshelve_instance(self, context, instance, image, filter_properties,
7626 node, request_spec, accel_uuids):
7627 LOG.info('Unshelving', instance=instance)
7628 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
7629 context, instance.uuid)
7631 self._notify_about_instance_usage(context, instance, 'unshelve.start')
7632 compute_utils.notify_about_instance_action(context, instance,
7633 self.host, action=fields.NotificationAction.UNSHELVE,
7634 phase=fields.NotificationPhase.START, bdms=bdms)
7636 instance.task_state = task_states.SPAWNING
7637 instance.save()
7639 block_device_info = self._prep_block_device(context, instance, bdms)
7640 scrubbed_keys = self._unshelve_instance_key_scrub(instance)
7642 if node is None: 7642 ↛ 7643 line 7642 didn't jump to line 7643 because the condition on line 7642 was never true
7643 node = self._get_nodename(instance)
7645 limits = filter_properties.get('limits', {})
7647 allocations = self.reportclient.get_allocations_for_consumer(
7648 context, instance.uuid)
7650 shelved_image_ref = instance.image_ref
7651 if image:
7652 instance.image_ref = image['id']
7653 image_meta = objects.ImageMeta.from_dict(image)
7654 else:
7655 image_meta = objects.ImageMeta.from_dict(
7656 utils.get_image_from_system_metadata(
7657 instance.system_metadata))
7659 provider_mappings = self._get_request_group_mapping(request_spec)
7661 try:
7662 if provider_mappings:
7663 compute_utils.update_pci_request_with_placement_allocations(
7664 context,
7665 self.reportclient,
7666 instance.pci_requests.requests,
7667 provider_mappings,
7668 )
7670 accel_info = []
7671 if accel_uuids:
7672 try:
7673 accel_info = self._get_bound_arq_resources(
7674 context, instance, accel_uuids)
7675 except (Exception, eventlet.timeout.Timeout) as exc:
7676 LOG.exception('Failure getting accelerator requests '
7677 'with the exception: %s', exc,
7678 instance=instance)
7679 self._build_resources_cleanup(instance, None)
7680 raise
7682 with self.rt.instance_claim(context, instance, node, allocations,
7683 limits):
7684 self.network_api.setup_instance_network_on_host(
7685 context, instance, self.host,
7686 provider_mappings=provider_mappings)
7687 network_info = self.network_api.get_instance_nw_info(
7688 context, instance)
7690 self.driver.spawn(context, instance, image_meta,
7691 injected_files=[],
7692 admin_password=None,
7693 allocations=allocations,
7694 network_info=network_info,
7695 block_device_info=block_device_info,
7696 accel_info=accel_info)
7697 except Exception:
7698 with excutils.save_and_reraise_exception(logger=LOG):
7699 LOG.exception('Instance failed to spawn',
7700 instance=instance)
7701 # Set the image_ref back to initial image_ref because instance
7702 # object might have been saved with image['id']
7703 # https://bugs.launchpad.net/nova/+bug/1934094
7704 instance.image_ref = shelved_image_ref
7705 # Cleanup allocations created by the scheduler on this host
7706 # since we failed to spawn the instance. We do this both if
7707 # the instance claim failed with ComputeResourcesUnavailable
7708 # or if we did claim but the spawn failed, because aborting the
7709 # instance claim will not remove the allocations.
7710 self.reportclient.delete_allocation_for_instance(
7711 context, instance.uuid, force=True)
7712 # FIXME: Umm, shouldn't we be rolling back port bindings too?
7713 self._terminate_volume_connections(context, instance, bdms)
7714 # The reverts_task_state decorator on unshelve_instance will
7715 # eventually save these updates.
7716 self._nil_out_instance_obj_host_and_node(instance)
7718 if image:
7719 instance.image_ref = shelved_image_ref
7720 self._delete_snapshot_of_shelved_instance(context, instance,
7721 image['id'])
7723 self._unshelve_instance_key_restore(instance, scrubbed_keys)
7724 self._update_instance_after_spawn(instance)
7725 # Delete system_metadata for a shelved instance
7726 compute_utils.remove_shelved_keys_from_system_metadata(instance)
7728 instance.save(expected_task_state=task_states.SPAWNING)
7729 self._update_scheduler_instance_info(context, instance)
7730 self._notify_about_instance_usage(context, instance, 'unshelve.end')
7731 compute_utils.notify_about_instance_action(context, instance,
7732 self.host, action=fields.NotificationAction.UNSHELVE,
7733 phase=fields.NotificationPhase.END, bdms=bdms)
7735 def _inject_network_info(self, instance, network_info):
7736 """Inject network info for the given instance."""
7737 LOG.debug('Inject network info', instance=instance)
7738 LOG.debug('network_info to inject: |%s|', network_info,
7739 instance=instance)
7741 self.driver.inject_network_info(instance, network_info)
7743 @wrap_instance_fault
7744 def inject_network_info(self, context, instance):
7745 """Inject network info, but don't return the info."""
7746 network_info = self.network_api.get_instance_nw_info(context, instance)
7747 self._inject_network_info(instance, network_info)
7749 @messaging.expected_exceptions(NotImplementedError,
7750 exception.ConsoleNotAvailable,
7751 exception.InstanceNotFound)
7752 @wrap_exception()
7753 @wrap_instance_fault
7754 def get_console_output(self, context, instance, tail_length):
7755 """Send the console output for the given instance."""
7756 context = context.elevated()
7757 LOG.info("Get console output", instance=instance)
7758 output = self.driver.get_console_output(context, instance)
7760 if type(output) is str: 7760 ↛ 7763 line 7760 didn't jump to line 7763 because the condition on line 7760 was always true
7761 output = output.encode("latin-1")
7763 if tail_length is not None:
7764 output = self._tail_log(output, tail_length)
7766 return output.decode('ascii', 'replace')
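# Illustrative sketch of the encode/decode handling above (hypothetical
# console text, not part of the original source): non-ASCII bytes survive the
# latin-1 encode but are substituted on the final ascii decode, e.g.
#   'h\xe9llo'.encode('latin-1').decode('ascii', 'replace')  # -> 'h\ufffdllo'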
7768 def _tail_log(self, log, length):
7769 try:
7770 length = int(length)
7771 except ValueError:
7772 length = 0
7774 if length == 0: 7774 ↛ 7775 line 7774 didn't jump to line 7775 because the condition on line 7774 was never true
7775 return b''
7776 else:
7777 return b'\n'.join(log.split(b'\n')[-int(length):])
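# Illustrative behaviour of _tail_log above (hypothetical data, not part of
# the original source): only the last `length` newline-separated entries are
# kept, and a non-numeric length falls back to an empty result, e.g.
#   self._tail_log(b'one\ntwo\nthree\nfour', 2)  # -> b'three\nfour'
#   self._tail_log(b'one\ntwo', 'garbage')       # -> b''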
7779 @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
7780 exception.InstanceNotReady,
7781 exception.InstanceNotFound,
7782 exception.ConsoleTypeUnavailable,
7783 NotImplementedError)
7784 @wrap_exception()
7785 @wrap_instance_fault
7786 def get_vnc_console(self, context, console_type, instance):
7787 """Return connection information for a vnc console."""
7788 context = context.elevated()
7789 LOG.debug("Getting vnc console", instance=instance)
7791 if not CONF.vnc.enabled:
7792 raise exception.ConsoleTypeUnavailable(console_type=console_type)
7794 if console_type == 'novnc':
7795 # For essex, novncproxy_base_url must include the full path
7796 # including the html file (like http://myhost/vnc_auto.html)
7797 access_url_base = CONF.vnc.novncproxy_base_url
7798 else:
7799 raise exception.ConsoleTypeInvalid(console_type=console_type)
7801 try:
7802 # Retrieve connect info from driver, and then decorate with our
7803 # access info token
7804 console = self.driver.get_vnc_console(context, instance)
7805 console_auth = objects.ConsoleAuthToken(
7806 context=context,
7807 console_type=console_type,
7808 host=console.host,
7809 port=console.port,
7810 internal_access_path=console.internal_access_path,
7811 instance_uuid=instance.uuid,
7812 access_url_base=access_url_base,
7813 )
7814 console_auth.authorize(CONF.consoleauth.token_ttl)
7815 connect_info = console.get_connection_info(
7816 console_auth.token, console_auth.access_url)
7818 except exception.InstanceNotFound:
7819 if instance.vm_state != vm_states.BUILDING: 7819 ↛ 7820 line 7819 didn't jump to line 7820 because the condition on line 7819 was never true
7820 raise
7821 raise exception.InstanceNotReady(instance_id=instance.uuid)
7823 return connect_info
7825 @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
7826 exception.InstanceNotReady,
7827 exception.InstanceNotFound,
7828 exception.ConsoleTypeUnavailable,
7829 NotImplementedError)
7830 @wrap_exception()
7831 @wrap_instance_fault
7832 def get_spice_console(self, context, console_type, instance):
7833 """Return connection information for a spice console."""
7834 context = context.elevated()
7835 LOG.debug("Getting spice console", instance=instance)
7837 if not CONF.spice.enabled:
7838 raise exception.ConsoleTypeUnavailable(console_type=console_type)
7840 if console_type not in ['spice-html5', 'spice-direct']:
7841 raise exception.ConsoleTypeInvalid(console_type=console_type)
7843 try:
7844 # Retrieve connect info from driver, and then decorate with our
7845 # access info token
7846 console = self.driver.get_spice_console(context, instance)
7847 fields = {
7848 'context': context,
7849 'console_type': console_type,
7850 'host': console.host,
7851 'port': console.port,
7852 'tls_port': console.tlsPort,
7853 'instance_uuid': instance.uuid
7854 }
7855 if console_type == 'spice-html5': 7855 ↛ 7858 line 7855 didn't jump to line 7858 because the condition on line 7855 was always true
7856 fields['internal_access_path'] = console.internal_access_path
7857 fields['access_url_base'] = CONF.spice.html5proxy_base_url
7858 if console_type == 'spice-direct': 7858 ↛ 7859 line 7858 didn't jump to line 7859 because the condition on line 7858 was never true
7859 fields['internal_access_path'] = None
7860 fields['access_url_base'] = \
7861 CONF.spice.spice_direct_proxy_base_url
7863 console_auth = objects.ConsoleAuthToken(**fields)
7864 console_auth.authorize(CONF.consoleauth.token_ttl)
7865 connect_info = console.get_connection_info(
7866 console_auth.token, console_auth.access_url)
7868 except exception.InstanceNotFound:
7869 if instance.vm_state != vm_states.BUILDING: 7869 ↛ 7870 line 7869 didn't jump to line 7870 because the condition on line 7869 was never true
7870 raise
7871 raise exception.InstanceNotReady(instance_id=instance.uuid)
7873 return connect_info
7875 # TODO(gmann): HyperV virt driver has been removed in Nova 29.0.0
7876 # but we need to keep this method to avoid an RPC error when an old
7877 # controller is used with a new compute. This can be removed in RPC API 7.0
7878 @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
7879 exception.InstanceNotReady,
7880 exception.InstanceNotFound,
7881 exception.ConsoleTypeUnavailable,
7882 NotImplementedError)
7883 @wrap_exception()
7884 @wrap_instance_fault
7885 def get_rdp_console(self, context, console_type, instance):
7886 """Return connection information for a RDP console."""
7888 msg = ("RDP console is only applicable to the HyperV virt driver, "
7889 "which has been removed in Nova 29.0.0")
7890 raise NotImplementedError(msg)
7892 @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
7893 exception.InstanceNotReady,
7894 exception.InstanceNotFound,
7895 exception.ConsoleTypeUnavailable,
7896 NotImplementedError)
7897 @wrap_exception()
7898 @wrap_instance_fault
7899 def get_mks_console(self, context, console_type, instance):
7900 """Return connection information for a MKS console."""
7901 context = context.elevated()
7902 LOG.debug("Getting MKS console", instance=instance)
7904 if not CONF.mks.enabled: 7904 ↛ 7905 line 7904 didn't jump to line 7905 because the condition on line 7904 was never true
7905 raise exception.ConsoleTypeUnavailable(console_type=console_type)
7907 if console_type != 'webmks': 7907 ↛ 7908 line 7907 didn't jump to line 7908 because the condition on line 7907 was never true
7908 raise exception.ConsoleTypeInvalid(console_type=console_type)
7910 try:
7911 # Retrieve connect info from driver, and then decorate with our
7912 # access info token
7913 console = self.driver.get_mks_console(context, instance)
7914 console_auth = objects.ConsoleAuthToken(
7915 context=context,
7916 console_type=console_type,
7917 host=console.host,
7918 port=console.port,
7919 internal_access_path=console.internal_access_path,
7920 instance_uuid=instance.uuid,
7921 access_url_base=CONF.mks.mksproxy_base_url,
7922 )
7923 console_auth.authorize(CONF.consoleauth.token_ttl)
7924 connect_info = console.get_connection_info(
7925 console_auth.token, console_auth.access_url)
7927 except exception.InstanceNotFound:
7928 if instance.vm_state != vm_states.BUILDING:
7929 raise
7930 raise exception.InstanceNotReady(instance_id=instance.uuid)
7932 return connect_info
7934 @messaging.expected_exceptions(
7935 exception.ConsoleTypeInvalid,
7936 exception.InstanceNotReady,
7937 exception.InstanceNotFound,
7938 exception.ConsoleTypeUnavailable,
7939 exception.SocketPortRangeExhaustedException,
7940 exception.ImageSerialPortNumberInvalid,
7941 exception.ImageSerialPortNumberExceedFlavorValue,
7942 NotImplementedError)
7943 @wrap_exception()
7944 @wrap_instance_fault
7945 def get_serial_console(self, context, console_type, instance):
7946 """Returns connection information for a serial console."""
7948 LOG.debug("Getting serial console", instance=instance)
7950 if not CONF.serial_console.enabled: 7950 ↛ 7951 line 7950 didn't jump to line 7951 because the condition on line 7950 was never true
7951 raise exception.ConsoleTypeUnavailable(console_type=console_type)
7953 context = context.elevated()
7955 try:
7956 # Retrieve connect info from driver, and then decorate with our
7957 # access info token
7958 console = self.driver.get_serial_console(context, instance)
7959 console_auth = objects.ConsoleAuthToken(
7960 context=context,
7961 console_type=console_type,
7962 host=console.host,
7963 port=console.port,
7964 internal_access_path=console.internal_access_path,
7965 instance_uuid=instance.uuid,
7966 access_url_base=CONF.serial_console.base_url,
7967 )
7968 console_auth.authorize(CONF.consoleauth.token_ttl)
7969 connect_info = console.get_connection_info(
7970 console_auth.token, console_auth.access_url)
7972 except exception.InstanceNotFound:
7973 if instance.vm_state != vm_states.BUILDING:
7974 raise
7975 raise exception.InstanceNotReady(instance_id=instance.uuid)
7977 return connect_info
7979 @messaging.expected_exceptions(exception.ConsoleTypeInvalid,
7980 exception.InstanceNotReady,
7981 exception.InstanceNotFound)
7982 @wrap_exception()
7983 @wrap_instance_fault
7984 def validate_console_port(self, ctxt, instance, port, console_type):
7985 if console_type in ["spice-html5", "spice-direct"]:
7986 console_info = self.driver.get_spice_console(ctxt, instance)
7987 elif console_type == "serial":
7988 console_info = self.driver.get_serial_console(ctxt, instance)
7989 elif console_type == "webmks":
7990 console_info = self.driver.get_mks_console(ctxt, instance)
7991 else:
7992 console_info = self.driver.get_vnc_console(ctxt, instance)
7994 # Some drivers may return an int on console_info.port but the port
7995 # variable in this method is a string, so cast to be sure we are
7996 # comparing the correct types.
7997 return str(console_info.port) == port
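# Illustrative sketch (hypothetical values, not part of the original source):
# a driver may report the port as an int while the API passes it as a string
# over RPC, so the comparison above normalises both sides, e.g.
#   str(5900) == "5900"   # -> True, the console port validates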
7999 @wrap_exception()
8000 @reverts_task_state
8001 @wrap_instance_fault
8002 def reserve_block_device_name(self, context, instance, device,
8003 volume_id, disk_bus, device_type, tag,
8004 multiattach):
8005 if (tag and not
8006 self.driver.capabilities.get('supports_tagged_attach_volume',
8007 False)):
8008 raise exception.VolumeTaggedAttachNotSupported()
8010 if (multiattach and not
8011 self.driver.capabilities.get('supports_multiattach', False)):
8012 raise exception.MultiattachNotSupportedByVirtDriver(
8013 volume_id=volume_id)
8015 @utils.synchronized(instance.uuid)
8016 def do_reserve():
8017 bdms = (
8018 objects.BlockDeviceMappingList.get_by_instance_uuid(
8019 context, instance.uuid))
8021 # Now that we have the lock check that we haven't raced another
8022 # request and ensure there is no existing attachment
8023 if any(b for b in bdms if b.volume_id == volume_id):
8024 msg = _("volume %s already attached") % volume_id
8025 raise exception.InvalidVolume(reason=msg)
8027 # NOTE(ndipanov): We need to explicitly set all the fields on the
8028 # object so that obj_load_attr does not fail
8029 new_bdm = objects.BlockDeviceMapping(
8030 context=context,
8031 source_type='volume', destination_type='volume',
8032 instance_uuid=instance.uuid, boot_index=None,
8033 volume_id=volume_id,
8034 device_name=device, guest_format=None,
8035 disk_bus=disk_bus, device_type=device_type, tag=tag)
8037 new_bdm.device_name = self._get_device_name_for_instance(
8038 instance, bdms, new_bdm)
8040 # NOTE(vish): create bdm here to avoid race condition
8041 new_bdm.create()
8042 return new_bdm
8044 return do_reserve()
8046 @wrap_exception()
8047 @wrap_instance_event(prefix='compute')
8048 @wrap_instance_fault
8049 def attach_volume(self, context, instance, bdm):
8050 """Attach a volume to an instance."""
8051 driver_bdm = driver_block_device.convert_volume(bdm)
8053 @utils.synchronized(instance.uuid)
8054 def do_attach_volume(context, instance, driver_bdm):
8055 try:
8056 return self._attach_volume(context, instance, driver_bdm)
8057 except Exception:
8058 with excutils.save_and_reraise_exception():
8059 bdm.destroy()
8061 do_attach_volume(context, instance, driver_bdm)
8063 def _attach_volume(self, context, instance, bdm):
8064 context = context.elevated()
8065 LOG.info('Attaching volume %(volume_id)s to %(mountpoint)s',
8066 {'volume_id': bdm.volume_id,
8067 'mountpoint': bdm['mount_device']},
8068 instance=instance)
8069 compute_utils.notify_about_volume_attach_detach(
8070 context, instance, self.host,
8071 action=fields.NotificationAction.VOLUME_ATTACH,
8072 phase=fields.NotificationPhase.START,
8073 volume_id=bdm.volume_id)
8074 try:
8075 bdm.attach(context, instance, self.volume_api, self.driver,
8076 do_driver_attach=True)
8077 except Exception as e:
8078 with excutils.save_and_reraise_exception():
8079 LOG.exception("Failed to attach %(volume_id)s "
8080 "at %(mountpoint)s",
8081 {'volume_id': bdm.volume_id,
8082 'mountpoint': bdm['mount_device']},
8083 instance=instance)
8084 if bdm['attachment_id']:
8085 # Try to delete the attachment to make the volume
8086 # available again. Note that DriverVolumeBlockDevice
8087 # may have already deleted the attachment so ignore
8088 # VolumeAttachmentNotFound.
8089 try:
8090 self.volume_api.attachment_delete(
8091 context, bdm['attachment_id'])
8092 except exception.VolumeAttachmentNotFound as exc:
8093 LOG.debug('Ignoring VolumeAttachmentNotFound: %s',
8094 exc, instance=instance)
8095 else:
8096 self.volume_api.unreserve_volume(context, bdm.volume_id)
8097 compute_utils.notify_about_volume_attach_detach(
8098 context, instance, self.host,
8099 action=fields.NotificationAction.VOLUME_ATTACH,
8100 phase=fields.NotificationPhase.ERROR,
8101 exception=e,
8102 volume_id=bdm.volume_id)
8104 info = {'volume_id': bdm.volume_id}
8105 self._notify_about_instance_usage(
8106 context, instance, "volume.attach", extra_usage_info=info)
8107 compute_utils.notify_about_volume_attach_detach(
8108 context, instance, self.host,
8109 action=fields.NotificationAction.VOLUME_ATTACH,
8110 phase=fields.NotificationPhase.END,
8111 volume_id=bdm.volume_id)
8113 def _notify_volume_usage_detach(self, context, instance, bdm):
8114 if CONF.volume_usage_poll_interval <= 0:
8115 return
8117 mp = bdm.device_name
8118 # Handle bootable volumes which will not contain /dev/
8119 if '/dev/' in mp: 8119 ↛ 8121 line 8119 didn't jump to line 8121 because the condition on line 8119 was always true
8120 mp = mp[5:]
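# e.g. a device name of '/dev/vdb' becomes 'vdb' here (illustrative example,
# not part of the original source)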
8121 try:
8122 vol_stats = self.driver.block_stats(instance, mp)
8123 if vol_stats is None:
8124 return
8125 except NotImplementedError:
8126 return
8128 LOG.debug("Updating volume usage cache with totals", instance=instance)
8129 rd_req, rd_bytes, wr_req, wr_bytes, flush_ops = vol_stats
8130 vol_usage = objects.VolumeUsage(context)
8131 vol_usage.volume_id = bdm.volume_id
8132 vol_usage.instance_uuid = instance.uuid
8133 vol_usage.project_id = instance.project_id
8134 vol_usage.user_id = instance.user_id
8135 vol_usage.availability_zone = instance.availability_zone
8136 vol_usage.curr_reads = rd_req
8137 vol_usage.curr_read_bytes = rd_bytes
8138 vol_usage.curr_writes = wr_req
8139 vol_usage.curr_write_bytes = wr_bytes
8140 vol_usage.save(update_totals=True)
8141 self.notifier.info(context, 'volume.usage', vol_usage.to_dict())
8142 compute_utils.notify_about_volume_usage(context, vol_usage, self.host)
8144 def _detach_volume(self, context, bdm, instance, destroy_bdm=True,
8145 attachment_id=None):
8146 """Detach a volume from an instance.
8148 :param context: security context
8149 :param bdm: nova.objects.BlockDeviceMapping volume bdm to detach
8150 :param instance: the Instance object to detach the volume from
8151 :param destroy_bdm: if True, the corresponding BDM entry will be marked
8152 as deleted. Disabling this is useful for operations
8153 like rebuild, when we don't want to destroy BDM
8154 :param attachment_id: The volume attachment_id for the given instance
8155 and volume.
8156 """
8157 volume_id = bdm.volume_id
8158 compute_utils.notify_about_volume_attach_detach(
8159 context, instance, self.host,
8160 action=fields.NotificationAction.VOLUME_DETACH,
8161 phase=fields.NotificationPhase.START,
8162 volume_id=volume_id)
8164 self._notify_volume_usage_detach(context, instance, bdm)
8166 LOG.info('Detaching volume %(volume_id)s',
8167 {'volume_id': volume_id}, instance=instance)
8169 driver_bdm = driver_block_device.convert_volume(bdm)
8170 driver_bdm.detach(context, instance, self.volume_api, self.driver,
8171 attachment_id=attachment_id, destroy_bdm=destroy_bdm)
8173 info = dict(volume_id=volume_id)
8174 self._notify_about_instance_usage(
8175 context, instance, "volume.detach", extra_usage_info=info)
8176 compute_utils.notify_about_volume_attach_detach(
8177 context, instance, self.host,
8178 action=fields.NotificationAction.VOLUME_DETACH,
8179 phase=fields.NotificationPhase.END,
8180 volume_id=volume_id)
8182 if 'tag' in bdm and bdm.tag:
8183 self._delete_disk_metadata(instance, bdm)
8184 if destroy_bdm:
8185 bdm.destroy()
8187 def _delete_disk_metadata(self, instance, bdm):
8188 for device in instance.device_metadata.devices:
8189 if isinstance(device, objects.DiskMetadata): 8189 ↛ 8188 line 8189 didn't jump to line 8188 because the condition on line 8189 was always true
8190 if 'serial' in device:
8191 if device.serial == bdm.volume_id: 8191 ↛ 8188 line 8191 didn't jump to line 8188 because the condition on line 8191 was always true
8192 instance.device_metadata.devices.remove(device)
8193 instance.save()
8194 break
8195 else:
8196 # NOTE(artom) We log the entire device object because all
8197 # fields are nullable and may not be set
8198 LOG.warning('Unable to determine whether to clean up '
8199 'device metadata for disk %s', device,
8200 instance=instance)
8202 @wrap_exception()
8203 @wrap_instance_event(prefix='compute')
8204 @wrap_instance_fault
8205 def detach_volume(self, context, volume_id, instance, attachment_id):
8206 """Detach a volume from an instance.
8208 :param context: security context
8209 :param volume_id: the volume id
8210 :param instance: the Instance object to detach the volume from
8211 :param attachment_id: The volume attachment_id for the given instance
8212 and volume.
8214 """
8215 @utils.synchronized(instance.uuid)
8216 def do_detach_volume(context, volume_id, instance, attachment_id):
8217 bdm = objects.BlockDeviceMapping.get_by_volume_and_instance(
8218 context, volume_id, instance.uuid)
8219 self._detach_volume(context, bdm, instance,
8220 attachment_id=attachment_id)
8222 do_detach_volume(context, volume_id, instance, attachment_id)
8224 def _init_volume_connection(self, context, new_volume,
8225 old_volume_id, connector, bdm,
8226 new_attachment_id, mountpoint):
8227 new_volume_id = new_volume['id']
8228 if new_attachment_id is None:
8229 # We're dealing with an old-style attachment so initialize the
8230 # connection so we can get the connection_info.
8231 new_cinfo = self.volume_api.initialize_connection(context,
8232 new_volume_id,
8233 connector)
8234 else:
8235 # Check for multiattach on the new volume and if True, check to
8236 # see if the virt driver supports multiattach.
8237 # TODO(mriedem): This is copied from DriverVolumeBlockDevice
8238 # and should be consolidated into some common code at some point.
8239 vol_multiattach = new_volume.get('multiattach', False)
8240 virt_multiattach = self.driver.capabilities.get(
8241 'supports_multiattach', False)
8242 if vol_multiattach and not virt_multiattach:
8243 raise exception.MultiattachNotSupportedByVirtDriver(
8244 volume_id=new_volume_id)
8246 # This is a new style attachment and the API created the new
8247 # volume attachment and passed the id to the compute over RPC.
8248 # At this point we need to update the new volume attachment with
8249 # the host connector, which will give us back the new attachment
8250 # connection_info.
8251 new_cinfo = self.volume_api.attachment_update(
8252 context, new_attachment_id, connector,
8253 mountpoint)['connection_info']
8255 if vol_multiattach:
8256 # This will be used by the volume driver to determine the
8257 # proper disk configuration.
8258 new_cinfo['multiattach'] = True
8260 old_cinfo = jsonutils.loads(bdm['connection_info'])
8261 if old_cinfo and 'serial' not in old_cinfo: 8261 ↛ 8266 line 8261 didn't jump to line 8266 because the condition on line 8261 was always true
8262 old_cinfo['serial'] = old_volume_id
8263 # NOTE(lyarwood): serial is not always present in the returned
8264 # connection_info so set it if it is missing as we do in
8265 # DriverVolumeBlockDevice.attach().
8266 if 'serial' not in new_cinfo: 8266 ↛ 8268 line 8266 didn't jump to line 8268 because the condition on line 8266 was always true
8267 new_cinfo['serial'] = new_volume_id
8268 return (old_cinfo, new_cinfo)
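# Decision summary for _init_volume_connection above (illustrative note, not
# part of the original source):
#   new_attachment_id is None -> legacy flow: initialize_connection() is
#     called with this host's connector to obtain new_cinfo.
#   new_attachment_id is set  -> new-style (v3.44) flow: attachment_update()
#     is called with the connector and mountpoint, after checking that the
#     virt driver supports multiattach when the new volume is multiattach.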
8270 def _swap_volume(self, context, instance, bdm, connector,
8271 old_volume_id, new_volume, resize_to,
8272 new_attachment_id, is_cinder_migration):
8273 new_volume_id = new_volume['id']
8274 mountpoint = bdm['device_name']
8275 failed = False
8276 new_cinfo = None
8277 try:
8278 old_cinfo, new_cinfo = self._init_volume_connection(
8279 context, new_volume, old_volume_id, connector,
8280 bdm, new_attachment_id, mountpoint)
8281 # NOTE(lyarwood): The Libvirt driver, the only virt driver
8282 # currently implementing swap_volume, will modify the contents of
8283 # new_cinfo when connect_volume is called. This is then saved to
8284 # the BDM in swap_volume for future use outside of this flow.
8285 msg = ("swap_volume: Calling driver volume swap with "
8286 "connection infos: new: %(new_cinfo)s; "
8287 "old: %(old_cinfo)s" %
8288 {'new_cinfo': new_cinfo, 'old_cinfo': old_cinfo})
8289 # Both new and old info might contain password
8290 LOG.debug(strutils.mask_password(msg), instance=instance)
8292 self.driver.swap_volume(context, old_cinfo, new_cinfo, instance,
8293 mountpoint, resize_to)
8294 if new_attachment_id:
8295 self.volume_api.attachment_complete(context, new_attachment_id)
8296 msg = ("swap_volume: Driver volume swap returned, new "
8297 "connection_info is now : %(new_cinfo)s" %
8298 {'new_cinfo': new_cinfo})
8299 LOG.debug(strutils.mask_password(msg))
8300 except Exception as ex:
8301 failed = True
8302 with excutils.save_and_reraise_exception():
8303 compute_utils.notify_about_volume_swap(
8304 context, instance, self.host,
8305 fields.NotificationPhase.ERROR,
8306 old_volume_id, new_volume_id, ex)
8307 if new_cinfo:
8308 msg = ("Failed to swap volume %(old_volume_id)s "
8309 "for %(new_volume_id)s")
8310 LOG.exception(msg, {'old_volume_id': old_volume_id,
8311 'new_volume_id': new_volume_id},
8312 instance=instance)
8313 else:
8314 msg = ("Failed to connect to volume %(volume_id)s "
8315 "with volume at %(mountpoint)s")
8316 LOG.exception(msg, {'volume_id': new_volume_id,
8317 'mountpoint': bdm['device_name']},
8318 instance=instance)
8320 # The API marked the volume as 'detaching' for the old volume
8321 # so we need to roll that back so the volume goes back to
8322 # 'in-use' state.
8323 self.volume_api.roll_detaching(context, old_volume_id)
8325 if new_attachment_id is None:
8326 # The API reserved the new volume so it would be in
8327 # 'attaching' status, so we need to unreserve it so it
8328 # goes back to 'available' status.
8329 self.volume_api.unreserve_volume(context, new_volume_id)
8330 else:
8331 # This is a new style attachment for the new volume, which
8332 # was created in the API. We just need to delete it here
8333 # to put the new volume back into 'available' status.
8334 self.volume_api.attachment_delete(
8335 context, new_attachment_id)
8336 finally:
8337 # TODO(mriedem): This finally block is terribly confusing and is
8338 # trying to do too much. We should consider removing the finally
8339 # block and move whatever needs to happen on success and failure
8340 # into the blocks above for clarity, even if it means a bit of
8341 # redundant code.
8342 conn_volume = new_volume_id if failed else old_volume_id
8343 if new_cinfo:
8344 LOG.debug("swap_volume: removing Cinder connection "
8345 "for volume %(volume)s", {'volume': conn_volume},
8346 instance=instance)
8347 if bdm.attachment_id is None:
8348 # This is the pre-3.44 flow (i.e. no new-style volume
8349 # attachment record) so just terminate the connection.
8350 self.volume_api.terminate_connection(context,
8351 conn_volume,
8352 connector)
8353 else:
8354 # This is a new style volume attachment. If we failed, then
8355 # the new attachment was already deleted above in the
8356 # exception block and we have nothing more to do here. If
8357 # swap_volume was successful in the driver, then we need to
8358 # "detach" the original attachment by deleting it.
8359 if not failed:
8360 self.volume_api.attachment_delete(
8361 context, bdm.attachment_id)
8363 # Need to make some decisions based on whether this was
8364 # a Cinder initiated migration or not. The callback to
8365 # migration completion isn't needed in the case of a
8366 # nova initiated simple swap of two volumes via the
8367 # "volume-update" call, so skip that. The new attachment
8368 # scenarios will give us a new attachment record and
8369 # that's what we want.
8370 if bdm.attachment_id and not is_cinder_migration:
8371 # we don't callback to cinder
8372 comp_ret = {'save_volume_id': new_volume_id}
8373 else:
8374 # NOTE(lyarwood): The following call to
8375 # os-migrate-volume-completion returns a dict containing
8376 # save_volume_id, this volume id has two possible values :
8377 # 1. old_volume_id if we are migrating (retyping) volumes
8378 # 2. new_volume_id if we are swapping between two existing
8379 # volumes
8380 # This volume id is later used to update the volume_id and
8381 # connection_info['serial'] of the BDM.
8382 comp_ret = self.volume_api.migrate_volume_completion(
8383 context,
8384 old_volume_id,
8385 new_volume_id,
8386 error=failed)
8387 LOG.debug("swap_volume: Cinder migrate_volume_completion "
8388 "returned: %(comp_ret)s", {'comp_ret': comp_ret},
8389 instance=instance)
8391 return (comp_ret, new_cinfo)
8393 @wrap_exception()
8394 @wrap_instance_event(prefix='compute')
8395 @wrap_instance_fault
8396 def swap_volume(self, context, old_volume_id, new_volume_id, instance,
8397 new_attachment_id):
8398 """Replace the old volume with the new volume within the active server
8400 :param context: User request context
8401 :param old_volume_id: Original volume id
8402 :param new_volume_id: New volume id being swapped to
8403 :param instance: Instance with original_volume_id attached
8404 :param new_attachment_id: ID of the new attachment for new_volume_id
8405 """
8406 @utils.synchronized(instance.uuid)
8407 def _do_locked_swap_volume(context, old_volume_id, new_volume_id,
8408 instance, new_attachment_id):
8409 self._do_swap_volume(context, old_volume_id, new_volume_id,
8410 instance, new_attachment_id)
8411 _do_locked_swap_volume(context, old_volume_id, new_volume_id, instance,
8412 new_attachment_id)
8414 def _do_swap_volume(self, context, old_volume_id, new_volume_id,
8415 instance, new_attachment_id):
8416 """Replace the old volume with the new volume within the active server
8418 :param context: User request context
8419 :param old_volume_id: Original volume id
8420 :param new_volume_id: New volume id being swapped to
8421 :param instance: Instance with original_volume_id attached
8422 :param new_attachment_id: ID of the new attachment for new_volume_id
8423 """
8424 context = context.elevated()
8425 compute_utils.notify_about_volume_swap(
8426 context, instance, self.host,
8427 fields.NotificationPhase.START,
8428 old_volume_id, new_volume_id)
8430 bdm = objects.BlockDeviceMapping.get_by_volume_and_instance(
8431 context, old_volume_id, instance.uuid)
8432 connector = self.driver.get_volume_connector(instance)
8434 resize_to = 0
8435 old_volume = self.volume_api.get(context, old_volume_id)
8436 # Yes this is a tightly-coupled state check of what's going on inside
8437 # cinder, but we need this while we still support old (v1/v2) and
8438 # new style attachments (v3.44). Once we drop support for old style
8439 # attachments we could think about cleaning up the cinder-initiated
8440 # swap volume API flows.
8441 is_cinder_migration = False
8442 if 'migration_status' in old_volume:
8443 is_cinder_migration = old_volume['migration_status'] == 'migrating'
8444 old_vol_size = old_volume['size']
8445 new_volume = self.volume_api.get(context, new_volume_id)
8446 new_vol_size = new_volume['size']
8447 if new_vol_size > old_vol_size:
8448 resize_to = new_vol_size
8450 LOG.info('Swapping volume %(old_volume)s for %(new_volume)s',
8451 {'old_volume': old_volume_id, 'new_volume': new_volume_id},
8452 instance=instance)
8453 comp_ret, new_cinfo = self._swap_volume(context,
8454 instance,
8455 bdm,
8456 connector,
8457 old_volume_id,
8458 new_volume,
8459 resize_to,
8460 new_attachment_id,
8461 is_cinder_migration)
8463 # NOTE(lyarwood): Update the BDM with the modified new_cinfo and
8464 # correct volume_id returned by Cinder.
8465 save_volume_id = comp_ret['save_volume_id']
8467 # NOTE(lyarwood): Overwrite the possibly stale serial and volume_id in
8468 # the connection_info with the volume_id returned from Cinder. This
8469 # could be the case during a volume migration where the new_cinfo here
8470 # refers to the temporary volume *before* Cinder renames it to the
8471 # original volume UUID at the end of the migration.
8472 new_cinfo['serial'] = save_volume_id
8473 new_cinfo['volume_id'] = save_volume_id
8474 if 'data' in new_cinfo:
8475 new_cinfo['data']['volume_id'] = save_volume_id
8476 values = {
8477 'connection_info': jsonutils.dumps(new_cinfo),
8478 'source_type': 'volume',
8479 'destination_type': 'volume',
8480 'snapshot_id': None,
8481 'volume_id': save_volume_id,
8482 'no_device': None}
8484 if resize_to:
8485 values['volume_size'] = resize_to
8487 if new_attachment_id is not None:
8488 # This was a volume swap for a new-style attachment so we
8489 # need to update the BDM attachment_id for the new attachment.
8490 values['attachment_id'] = new_attachment_id
8492 LOG.debug("swap_volume: Updating volume %(volume_id)s BDM record with "
8493 "%(updates)s", {'volume_id': bdm.volume_id,
8494 'updates': values},
8495 instance=instance)
8496 bdm.update(values)
8497 bdm.save()
8499 compute_utils.notify_about_volume_swap(
8500 context, instance, self.host,
8501 fields.NotificationPhase.END,
8502 old_volume_id, new_volume_id)
8504 @wrap_exception()
8505 def remove_volume_connection(
8506 self, context, volume_id, instance,
8507 delete_attachment=False):
8508 """Remove the volume connection on this host
8510 Detach the volume from this instance on this host, and if this is
8511 the cinder v2 flow, call cinder to terminate the connection.
8512 """
8513 try:
8514 # NOTE(mriedem): If the BDM was just passed directly we would not
8515 # need to do this DB query, but this is an RPC interface so
8516 # changing that requires some care.
8517 bdm = objects.BlockDeviceMapping.get_by_volume_and_instance(
8518 context, volume_id, instance.uuid)
8519 # NOTE(mriedem): Normally we would pass delete_attachment=True to
8520 # _remove_volume_connection to delete a v3 style volume attachment,
8521 # but this method is RPC called from _rollback_live_migration which
8522 # already deletes the attachment, so because of that tight coupling
8523 # we cannot simply delete a v3 style attachment here without
8524 # needing to do some behavior modification of that
8525 # _rollback_live_migration flow which gets messy.
8526 self._remove_volume_connection(
8527 context, bdm, instance, delete_attachment)
8528 except exception.NotFound:
8529 pass
8531 def _remove_volume_connection(self, context, bdm, instance,
8532 delete_attachment=False):
8533 """Remove the volume connection on this host
8535 Detach the volume from this instance on this host.
8537 :param context: nova auth request context
8538 :param bdm: BlockDeviceMapping object for a volume attached to the
8539 instance
8540 :param instance: Instance object with a volume attached represented
8541 by ``bdm``
8542 :param delete_attachment: If ``bdm.attachment_id`` is not None the
8543 attachment was made as a cinder v3 style attachment and if True,
8544 then deletes the volume attachment, otherwise just terminates
8545 the connection for a cinder legacy style connection.
8546 """
8547 driver_bdm = driver_block_device.convert_volume(bdm)
8548 driver_bdm.driver_detach(context, instance,
8549 self.volume_api, self.driver)
8550 if bdm.attachment_id is None:
8551 # cinder v2 api flow
8552 connector = self.driver.get_volume_connector(instance)
8553 self.volume_api.terminate_connection(context, bdm.volume_id,
8554 connector)
8555 elif delete_attachment:
8556 # cinder v3 api flow
8557 self.volume_api.attachment_delete(context, bdm.attachment_id)
8559 def _deallocate_port_resource_for_instance(
8560 self,
8561 context: nova.context.RequestContext,
8562 instance: 'objects.Instance',
8563 port_id: str,
8564 port_allocation: ty.Dict[str, ty.Dict[str, ty.Dict[str, int]]],
8565 ) -> None:
8567 if not port_allocation:
8568 return
8570 try:
8571 client = self.reportclient
8572 client.remove_resources_from_instance_allocation(
8573 context, instance.uuid, port_allocation)
8574 except Exception as ex:
8575 # We always raise here as it is not a race condition where
8576            # somebody has already deleted the port we want to clean up.
8577 # Here we see that the port exists, the allocation exists,
8578 # but we cannot clean it up so we will actually leak
8579 # allocations.
8580 with excutils.save_and_reraise_exception():
8581 LOG.warning(
8582 'Failed to remove resource allocation of port %(port_id)s '
8583 'for instance. Error: %(error)s',
8584 {'port_id': port_id, 'error': ex},
8585 instance=instance)
8587 def _deallocate_port_for_instance(
8588 self, context, instance, port_id, raise_on_failure=False,
8589 pci_device=None):
8590 try:
8591 result = self.network_api.deallocate_port_for_instance(
8592 context, instance, port_id)
8593 __, port_allocation = result
8594 except Exception as ex:
8595 with excutils.save_and_reraise_exception(
8596 reraise=raise_on_failure):
8597 LOG.warning('Failed to deallocate port %(port_id)s '
8598 'for instance. Error: %(error)s',
8599 {'port_id': port_id, 'error': ex},
8600 instance=instance)
8601 else:
8602            if pci_device:
8603 self.rt.unclaim_pci_devices(context, pci_device, instance)
8604 instance.remove_pci_device_and_request(pci_device)
8606 # Deallocate the resources in placement that were used by the
8607 # detached port.
8608 self._deallocate_port_resource_for_instance(
8609 context, instance, port_id, port_allocation)
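    # A minimal sketch (not part of this module) of the port_allocation
    # mapping handed to _deallocate_port_resource_for_instance above; the
    # provider UUID, resource class and amount are illustrative only:
    #
    #     port_allocation = {
    #         'rp-uuid-of-the-nic': {
    #             'resources': {'NET_BW_EGR_KILOBIT_PER_SEC': 1000,
    #                           'NET_BW_IGR_KILOBIT_PER_SEC': 1000},
    #         },
    #     }
    #
    # i.e. resource provider uuid -> 'resources' -> resource class -> amount,
    # matching the ty.Dict[str, ty.Dict[str, ty.Dict[str, int]]] annotation.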
8611 def _claim_pci_device_for_interface_attach(
8612 self,
8613 context: nova.context.RequestContext,
8614 instance: 'objects.Instance',
8615 pci_reqs: 'objects.InstancePCIRequests',
8616 ) -> ty.Optional['objects.PciDevice']:
8617 """Claim PCI devices if there are PCI requests
8619 :param context: nova.context.RequestContext
8620 :param instance: the objects.Instance to where the interface is being
8621 attached
8622 :param pci_reqs: A InstancePCIRequests object describing the
8623 needed PCI devices
8624 :raises InterfaceAttachPciClaimFailed: if the PCI device claim fails
8625 :returns: An objects.PciDevice describing the claimed PCI device for
8626 the interface or None if no device is requested
8627 """
8629        if not pci_reqs.requests:
8630 return None
8632 try:
8633 devices = self.rt.claim_pci_devices(
8634 context, pci_reqs, instance.numa_topology)
8635 except exception.PciDeviceRequestFailed:
8636 LOG.info('Failed to claim PCI devices during interface attach '
8637 'for PCI request %s', pci_reqs, instance=instance)
8638 raise exception.InterfaceAttachPciClaimFailed(
8639 instance_uuid=instance.uuid)
8641        # NOTE(gibi): We assume that at most one PCI device is attached per
8642 # interface attach request.
8643 device = devices[0]
8644 instance.pci_devices.objects.append(device)
8646 return device
8648 def _allocate_port_resource_for_instance(
8649 self,
8650 context: nova.context.RequestContext,
8651 instance: 'objects.Instance',
8652 pci_reqs: 'objects.InstancePCIRequests',
8653 request_groups: ty.List['objects.RequestGroup'],
8654 request_level_params: 'objects.RequestLevelParams',
8655 ) -> ty.Tuple[ty.Optional[ty.Dict[str, ty.List[str]]],
8656 ty.Optional[ty.Dict[str, ty.Dict[str, ty.Dict[str, int]]]]]:
8657 """Allocate resources for the request in placement
8659 :param context: nova.context.RequestContext
8660 :param instance: the objects.Instance to where the interface is being
8661 attached
8662 :param pci_reqs: A list of InstancePCIRequest objects describing the
8663 needed PCI devices
8664 :param request_groups: A list of RequestGroup objects describing the
8665 resources the port requests from placement
8666 :param request_level_params: A RequestLevelParams object describing the
8667            non-group-specific requests of the port.
8668 :raises InterfaceAttachResourceAllocationFailed: if we failed to
8669 allocate resource in placement for the request
8670 :returns: A tuple of provider mappings and allocated resources or
8671 (None, None) if no resource allocation was needed for the request
8672 """
8674 if not request_groups:
8675 return None, None
8677 # restrict the resource request to the current compute node. The
8678 # compute node uuid is the uuid of the root provider of the node in
8679 # placement
8680 compute_node_uuid = objects.ComputeNode.get_by_nodename(
8681 context, instance.node).uuid
8682        # We can have multiple request groups; it would be enough to restrict
8683        # only one of them to the compute tree but for symmetry we restrict
8684        # all of them.
8685 for request_group in request_groups:
8686 request_group.in_tree = compute_node_uuid
8688 # NOTE(gibi): group policy is mandatory in a resource request if there
8689 # are multiple groups. The policy can only come from the flavor today
8690        # and a new flavor is not provided with an interface attach request, and
8691        # the instance's current flavor might not have a policy. Still, we are
8692        # attaching a single port where currently the two possible groups
8693        # (one for bandwidth and one for packet rate) will always be allocated
8694        # from different providers. So both possible policies (none, isolated)
8695        # are always fulfilled for this single port. We still have to specify
8696        # one, so we specify the least restrictive now.
8697 rr = scheduler_utils.ResourceRequest.from_request_groups(
8698 request_groups, request_level_params, group_policy='none')
8699 res = self.reportclient.get_allocation_candidates(context, rr)
8700 alloc_reqs, provider_sums, version = res
8702 if not alloc_reqs:
8703 # no allocation candidates available, we run out of free resources
8704 raise exception.InterfaceAttachResourceAllocationFailed(
8705 instance_uuid=instance.uuid)
8707 # select one of the candidates and update the instance
8708 # allocation
8709 # TODO(gibi): We could loop over all possible candidates
8710 # if the first one selected here does not work due to race or due
8711 # to not having free PCI devices. However the latter is only
8712 # detected later in the interface attach code path.
8713 alloc_req = alloc_reqs[0]
8714 resources = alloc_req['allocations']
8715 provider_mappings = alloc_req['mappings']
8716 try:
8717 self.reportclient.add_resources_to_instance_allocation(
8718 context, instance.uuid, resources)
8719 except exception.AllocationUpdateFailed as e:
8720 # We lost a race. We could retry another candidate
8721 raise exception.InterfaceAttachResourceAllocationFailed(
8722 instance_uuid=instance.uuid) from e
8723 except (
8724 exception.ConsumerAllocationRetrievalFailed,
8725 keystone_exception.ClientException,
8726 ) as e:
8727 # These are non-recoverable errors so we should not retry
8728 raise exception.InterfaceAttachResourceAllocationFailed(
8729 instance_uuid=instance.uuid) from e
8731 try:
8732 compute_utils.update_pci_request_with_placement_allocations(
8733 context,
8734 self.reportclient,
8735 pci_reqs.requests,
8736 provider_mappings,
8737 )
8738 except (
8739 exception.AmbiguousResourceProviderForPCIRequest,
8740 exception.UnexpectedResourceProviderNameForPCIRequest
8741 ):
8742            # These are programming errors, so we clean up and re-raise to let
8743            # the request fail.
8744 with excutils.save_and_reraise_exception():
8745 self.reportclient.remove_resources_from_instance_allocation(
8746 context, instance.uuid, resources)
8748 return provider_mappings, resources
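    # A minimal sketch (not part of this module) of the tuple returned above,
    # with illustrative UUIDs; the shapes follow this method's type
    # annotations:
    #
    #     provider_mappings = {
    #         # request group id -> resource providers satisfying that group
    #         'port-uuid-group': ['rp-uuid-of-the-nic'],
    #     }
    #     resources = {
    #         # resource provider uuid -> resources allocated from it
    #         'rp-uuid-of-the-nic': {
    #             'resources': {'NET_BW_EGR_KILOBIT_PER_SEC': 1000},
    #         },
    #     }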
8750 # TODO(mriedem): There are likely race failures which can result in
8751 # NotFound and QuotaError exceptions getting traced as well.
8752 @messaging.expected_exceptions(
8753 # Do not log a traceback for user errors. We use Invalid generically
8754 # since this method can raise lots of different exceptions:
8755 # AttachInterfaceNotSupported
8756 # NetworkInterfaceTaggedAttachNotSupported
8757 # NetworkAmbiguous
8758 # PortNotUsable
8759 # PortInUse
8760 # PortNotUsableDNS
8761 # AttachSRIOVPortNotSupported
8762 # NetworksWithQoSPolicyNotSupported
8763 # InterfaceAttachResourceAllocationFailed
8764 exception.Invalid)
8765 @wrap_exception()
8766 @wrap_instance_event(prefix='compute')
8767 @wrap_instance_fault
8768 def attach_interface(self, context, instance, network_id, port_id,
8769 requested_ip, tag):
8770 """Use hotplug to add an network adapter to an instance."""
8771 lockname = 'interface-%s-%s' % (instance.uuid, port_id)
8773 @utils.synchronized(lockname)
8774 def do_attach_interface(context, instance, network_id, port_id,
8775 requested_ip, tag):
8776 return self._attach_interface(context, instance, network_id,
8777 port_id, requested_ip, tag)
8779 return do_attach_interface(context, instance, network_id, port_id,
8780 requested_ip, tag)
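    # Both attach_interface and detach_interface synchronize on the same
    # 'interface-<instance.uuid>-<port_id>' lock name, so concurrent attach
    # and detach requests for the same port on the same instance are
    # serialized. A minimal sketch of the pattern (names are illustrative):
    #
    #     lockname = 'interface-%s-%s' % (instance_uuid, port_id)
    #
    #     @utils.synchronized(lockname)
    #     def do_work():
    #         ...  # only one greenthread per lock name runs this at a time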
8782 def _attach_interface(self, context, instance, network_id, port_id,
8783 requested_ip, tag):
8784        if not self.driver.capabilities.get('supports_attach_interface',
8785 False):
8786 raise exception.AttachInterfaceNotSupported(
8787 instance_uuid=instance.uuid)
8788 if (tag and not
8789 self.driver.capabilities.get('supports_tagged_attach_interface',
8790 False)):
8791 raise exception.NetworkInterfaceTaggedAttachNotSupported()
8793 compute_utils.notify_about_instance_action(
8794 context, instance, self.host,
8795 action=fields.NotificationAction.INTERFACE_ATTACH,
8796 phase=fields.NotificationPhase.START)
8798 bind_host_id = self.driver.network_binding_host_id(context, instance)
8800 requested_networks = objects.NetworkRequestList(
8801 objects=[
8802 objects.NetworkRequest(
8803 network_id=network_id,
8804 port_id=port_id,
8805 address=requested_ip,
8806 tag=tag,
8807 )
8808 ]
8809 )
8811        if len(requested_networks) != 1:
8812 LOG.warning(
8813 "Interface attach only supports one interface per attach "
8814 "request", instance=instance)
8815 raise exception.InterfaceAttachFailed(instance_uuid=instance.uuid)
8817 pci_numa_affinity_policy = hardware.get_pci_numa_policy_constraint(
8818 instance.flavor, instance.image_meta)
8819 pci_reqs = objects.InstancePCIRequests(
8820 requests=[], instance_uuid=instance.uuid)
8821 _, request_groups, req_lvl_params = (
8822 self.network_api.create_resource_requests(
8823 context,
8824 requested_networks,
8825 pci_reqs,
8826 affinity_policy=pci_numa_affinity_policy
8827 )
8828 )
8830 result = self._allocate_port_resource_for_instance(
8831 context, instance, pci_reqs, request_groups, req_lvl_params)
8832 provider_mappings, resources = result
8834 try:
8835 pci_device = self._claim_pci_device_for_interface_attach(
8836 context, instance, pci_reqs)
8837 except exception.InterfaceAttachPciClaimFailed:
8838 with excutils.save_and_reraise_exception():
8839                if resources:
8840 # TODO(gibi): Instead of giving up we could try another
8841 # allocation candidate from _allocate_resources() if any
8842 self._deallocate_port_resource_for_instance(
8843 context, instance, port_id, resources)
8845 instance.pci_requests.requests.extend(pci_reqs.requests)
8847 network_info = self.network_api.allocate_for_instance(
8848 context,
8849 instance,
8850 requested_networks,
8851 bind_host_id=bind_host_id,
8852 resource_provider_mapping=provider_mappings,
8853 )
8855        if len(network_info) != 1:
8856 LOG.error('allocate_for_instance returned %(ports)s '
8857 'ports', {'ports': len(network_info)})
8858 # TODO(elod.illes): an instance.interface_attach.error notification
8859 # should be sent here
8860 raise exception.InterfaceAttachFailed(
8861 instance_uuid=instance.uuid)
8862 image_meta = objects.ImageMeta.from_instance(instance)
8864 try:
8865 self.driver.attach_interface(context, instance, image_meta,
8866 network_info[0])
8867 except exception.NovaException as ex:
8868 port_id = network_info[0].get('id')
8869 LOG.warning("attach interface failed , try to deallocate "
8870 "port %(port_id)s, reason: %(msg)s",
8871 {'port_id': port_id, 'msg': ex},
8872 instance=instance)
8873 self._deallocate_port_for_instance(
8874 context, instance, port_id, pci_device=pci_device)
8876 compute_utils.notify_about_instance_action(
8877 context, instance, self.host,
8878 action=fields.NotificationAction.INTERFACE_ATTACH,
8879 phase=fields.NotificationPhase.ERROR,
8880 exception=ex)
8882 raise exception.InterfaceAttachFailed(
8883 instance_uuid=instance.uuid)
8885 if pci_device:
8886 # NOTE(gibi): The _claim_pci_device_for_interface_attach() call
8887 # found a pci device but it only marked the device as claimed. The
8888 # periodic update_available_resource would move the device to
8889            # allocated state. But as driver.attach_interface() has
8890            # succeeded we now know that the interface is also allocated
8891            # to (used by) the instance. So make sure the pci tracker also
8892 # tracks this device as allocated. This way we can avoid a possible
8893 # race condition when a detach arrives for a device that is only
8894 # in claimed state.
8895 self.rt.allocate_pci_devices_for_instance(context, instance)
8897 instance.save()
8899 compute_utils.notify_about_instance_action(
8900 context, instance, self.host,
8901 action=fields.NotificationAction.INTERFACE_ATTACH,
8902 phase=fields.NotificationPhase.END)
8904 return network_info[0]
8906 @wrap_exception()
8907 @wrap_instance_event(prefix='compute')
8908 @wrap_instance_fault
8909 def detach_interface(self, context, instance, port_id):
8910 """Detach a network adapter from an instance."""
8911 lockname = 'interface-%s-%s' % (instance.uuid, port_id)
8913 @utils.synchronized(lockname)
8914 def do_detach_interface(context, instance, port_id):
8915 self._detach_interface(context, instance, port_id)
8917 do_detach_interface(context, instance, port_id)
8919 def _detach_interface(self, context, instance, port_id):
8920 # NOTE(aarents): we need to refresh info cache from DB here,
8921 # as previous detach/attach lock holder just updated it.
8922 compute_utils.refresh_info_cache_for_instance(context, instance)
8923 network_info = instance.info_cache.network_info
8924 condemned = None
8925 for vif in network_info:
8926            if vif['id'] == port_id:
8927 condemned = vif
8928 break
8929 if condemned is None:
8930 raise exception.PortNotFound(_("Port %s is not "
8931 "attached") % port_id)
8933 pci_req = pci_req_module.get_instance_pci_request_from_vif(
8934 context, instance, condemned)
8936 pci_device = None
8937 if pci_req:
8938 pci_devices = [pci_device
8939 for pci_device in instance.pci_devices.objects
8940 if pci_device.request_id == pci_req.request_id]
8942 if not pci_devices:
8943 LOG.warning(
8944 "Detach interface failed, port_id=%(port_id)s, "
8945 "reason: PCI device not found for PCI request %(pci_req)s",
8946 {'port_id': port_id, 'pci_req': pci_req})
8947 raise exception.InterfaceDetachFailed(
8948 instance_uuid=instance.uuid)
8950 pci_device = pci_devices[0]
8952 compute_utils.notify_about_instance_action(
8953 context, instance, self.host,
8954 action=fields.NotificationAction.INTERFACE_DETACH,
8955 phase=fields.NotificationPhase.START)
8957 try:
8958 self.driver.detach_interface(context, instance, condemned)
8959 except exception.NovaException as ex:
8960 # If the instance was deleted before the interface was detached,
8961 # just log it at debug.
8962 log_level = (logging.DEBUG
8963 if isinstance(ex, exception.InstanceNotFound)
8964 else logging.WARNING)
8965 LOG.log(log_level,
8966 "Detach interface failed, port_id=%(port_id)s, reason: "
8967 "%(msg)s", {'port_id': port_id, 'msg': ex},
8968 instance=instance)
8969 raise exception.InterfaceDetachFailed(instance_uuid=instance.uuid)
8970 else:
8971 self._deallocate_port_for_instance(
8972 context, instance, port_id, raise_on_failure=True,
8973 pci_device=pci_device)
8975 instance.save()
8977 compute_utils.notify_about_instance_action(
8978 context, instance, self.host,
8979 action=fields.NotificationAction.INTERFACE_DETACH,
8980 phase=fields.NotificationPhase.END)
8982 def _get_compute_info(self, context, host):
8983 return objects.ComputeNode.get_first_node_by_host_for_old_compat(
8984 context, host)
8986 @wrap_exception()
8987 def check_instance_shared_storage(self, ctxt, data):
8988 """Check if the instance files are shared
8990 :param ctxt: security context
8991 :param data: result of driver.check_instance_shared_storage_local
8993        Returns True if the instance disks are located on shared storage and
8994 False otherwise.
8995 """
8996 return self.driver.check_instance_shared_storage_remote(ctxt, data)
8998 def _dest_can_numa_live_migrate(self, dest_check_data, migration):
8999 # TODO(artom) If we have a libvirt driver we expect it to set
9000 # dst_supports_numa_live_migration, but we have to remove it if we
9001 # did not get a migration from the conductor, indicating that it
9002 # cannot send RPC 5.3. This check can be removed in RPC 6.0.
9003 if ('dst_supports_numa_live_migration' in dest_check_data and
9004 dest_check_data.dst_supports_numa_live_migration and
9005 not migration):
9006 delattr(dest_check_data, 'dst_supports_numa_live_migration')
9007 return dest_check_data
9009 @wrap_exception()
9010 @wrap_instance_event(prefix='compute')
9011 @wrap_instance_fault
9012 def check_can_live_migrate_destination(self, ctxt, instance,
9013 block_migration, disk_over_commit,
9014 migration, limits):
9015 """Check if it is possible to execute live migration.
9017 This runs checks on the destination host, and then calls
9018 back to the source host to check the results.
9020 :param context: security context
9021 :param instance: dict of instance data
9022 :param block_migration: if true, prepare for block migration
9023 if None, calculate it in driver
9024 :param disk_over_commit: if true, allow disk over commit
9025 if None, ignore disk usage checking
9026 :param migration: objects.Migration object for this live migration.
9027 :param limits: objects.SchedulerLimits object for this live migration.
9028 :returns: a LiveMigrateData object (hypervisor-dependent)
9029 """
9031 # Error out if this host cannot accept the new instance due
9032 # to anti-affinity. This check at this moment is not very accurate, as
9033 # multiple requests may be happening concurrently and miss the lock,
9034 # but when it works it provides a better user experience by failing
9035        # earlier. Also, it should be safe to explode here; the error becomes
9036        # NoValidHost and the instance status remains ACTIVE.
9037 try:
9038 self._validate_instance_group_policy(ctxt, instance)
9039 except exception.RescheduledException as e:
9040 msg = ("Failed to validate instance group policy "
9041 "due to: {}".format(e))
9042 raise exception.MigrationPreCheckError(reason=msg)
9044 src_compute_info = obj_base.obj_to_primitive(
9045 self._get_compute_info(ctxt, instance.host))
9046 dst_compute_info = obj_base.obj_to_primitive(
9047 self._get_compute_info(ctxt, self.host))
9048 dest_check_data = self.driver.check_can_live_migrate_destination(ctxt,
9049 instance, src_compute_info, dst_compute_info,
9050 block_migration, disk_over_commit)
9051 dest_check_data = self._dest_can_numa_live_migrate(dest_check_data,
9052 migration)
9053 LOG.debug('destination check data is %s', dest_check_data)
9054 try:
9055 allocs = self.reportclient.get_allocations_for_consumer(
9056 ctxt, instance.uuid)
9057 migrate_data = self.compute_rpcapi.check_can_live_migrate_source(
9058 ctxt, instance, dest_check_data)
9059 if ('src_supports_numa_live_migration' in migrate_data and
9060 migrate_data.src_supports_numa_live_migration):
9061 migrate_data = self._live_migration_claim(
9062 ctxt, instance, migrate_data, migration, limits, allocs)
9063            elif 'dst_supports_numa_live_migration' in dest_check_data:
9064 LOG.info('Destination was ready for NUMA live migration, '
9065 'but source is either too old, or is set to an '
9066 'older upgrade level.', instance=instance)
9068 # At this point, we know that this compute node (destination)
9069 # potentially has enough live-migratable PCI devices, based on the
9070 # fact that the scheduler selected this host as the destination.
9071 # The claim code below is the decisive step to move from a
9072 # potentially correct to a known to be correct destination.
9073 flavored_pci_reqs = [
9074 pci_req
9075 for pci_req in instance.pci_requests.requests
9076 if pci_req.source == objects.InstancePCIRequest.FLAVOR_ALIAS
9077 ]
9079 migrate_data.pci_dev_map_src_dst = self._flavor_based_pci_claim(
9080 ctxt, instance, flavored_pci_reqs
9081 )
9083 if self.network_api.has_port_binding_extension(ctxt):
9084 # Create migrate_data vifs if not provided by driver.
9085                if 'vifs' not in migrate_data:
9086 migrate_data.vifs = (
9087 migrate_data_obj.
9088 VIFMigrateData.create_skeleton_migrate_vifs(
9089 instance.get_network_info()))
9090 # Claim PCI devices for VIFs on destination (if needed)
9091 port_id_to_pci = self._claim_pci_for_instance_vifs(
9092 ctxt, instance)
9093 # Update migrate VIFs with the newly claimed PCI devices
9094 self._update_migrate_vifs_profile_with_pci(
9095 migrate_data.vifs, port_id_to_pci)
9096 # This should always be the last check as we persist some internal
9097 # dictionary value in the libvirt driver.
9098 migrate_data = self.driver.check_source_migrate_data_at_dest(
9099 ctxt, instance, migrate_data, migration, limits, allocs)
9100 finally:
9101 self.driver.cleanup_live_migration_destination_check(ctxt,
9102 dest_check_data)
9103 return migrate_data
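    # In summary, the destination-side checks above may extend migrate_data
    # with:
    #   * dst_numa_info (via _live_migration_claim) for NUMA-aware live
    #     migrations,
    #   * pci_dev_map_src_dst, mapping source PCI addresses to the devices
    #     claimed here for flavor-based PCI requests,
    #   * vifs (skeleton VIFMigrateData) plus per-VIF PCI claims when the
    #     port binding extension is available,
    # before the driver gets a final look in check_source_migrate_data_at_dest.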
9105 def _flavor_based_pci_claim(self, ctxt, instance, pci_requests):
9106 if not pci_requests:
9107 # Return an empty dict as migrate_data.pci_dev_map_src_dst
9108 # cannot be None
9109 return {}
9111        # Create an InstancePCIRequests object and claim against the PCI
9112        # resource tracker for our dest instance
9113 pci_requests = objects.InstancePCIRequests(
9114 requests=pci_requests,
9115 instance_uuid=instance.uuid)
9117 src_devs = objects.PciDeviceList.get_by_instance_uuid(
9118 ctxt, instance.uuid
9119 )
9121 claimed_dst_devs = self._claim_from_pci_reqs(
9122 ctxt, instance, pci_requests
9123 )
9125 pci_dev_map_src_dst = {}
9126 for req_id in (
9127 pci_req.request_id for pci_req in pci_requests.requests
9128 ):
9129 req_src_addr = [
9130 dev.address
9131 for dev in src_devs
9132 if dev.request_id == req_id
9133 ]
9134 req_claimed_dst__addr = [
9135 dev.address
9136 for dev in claimed_dst_devs
9137 if dev.request_id == req_id
9138 ]
9140 # This still depends on ordering, but only within the scope
9141 # of the same InstancePCIRequest. The only case where multiple
9142 # devices per compute node are allocated for a single request
9143 # is when req.count > 1. In that case, the allocated devices are
9144 # interchangeable since they match the same specification from
9145 # the same InstancePCIRequest.
9146 #
9147 # Therefore, this ordering dependency is acceptable.
9148 pci_dev_map_src_dst.update(dict(
9149 zip(req_src_addr, req_claimed_dst__addr)
9150 ))
9152 return pci_dev_map_src_dst
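    # A minimal sketch (not part of this module) of the mapping built above,
    # assuming one InstancePCIRequest with count=2 and illustrative addresses:
    #
    #     req_src_addr          = ['0000:81:00.1', '0000:81:00.2']  # source devs
    #     req_claimed_dst__addr = ['0000:02:00.3', '0000:02:00.4']  # dest claims
    #     pci_dev_map_src_dst.update(
    #         dict(zip(req_src_addr, req_claimed_dst__addr)))
    #     # -> {'0000:81:00.1': '0000:02:00.3', '0000:81:00.2': '0000:02:00.4'}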
9154 def _claim_from_pci_reqs(self, ctxt, instance, pci_requests):
9155        # If we are called during a live migration with NUMA topology
9156        # support, the PCI claim needs to consider the destination NUMA
9157        # topology, which is then stored in the migration_context.
9158 dest_topo = None
9159        if instance.migration_context:
9160 dest_topo = instance.migration_context.new_numa_topology
9162 claimed_pci_devices_objs = self.rt.claim_pci_devices(
9163 ctxt, pci_requests, dest_topo)
9165 for pci_dev in claimed_pci_devices_objs:
9166 LOG.debug("PCI device: %s Claimed on destination node",
9167 pci_dev.address)
9168 return claimed_pci_devices_objs
9170 def _live_migration_claim(self, ctxt, instance, migrate_data,
9171 migration, limits, allocs):
9172 """Runs on the destination and does a resources claim, if necessary.
9173 Currently, only NUMA live migrations require it.
9175 :param ctxt: Request context
9176 :param instance: The Instance being live migrated
9177 :param migrate_data: The MigrateData object for this live migration
9178 :param migration: The Migration object for this live migration
9179 :param limits: The SchedulerLimits object for this live migration
9180 :returns: migrate_data with dst_numa_info set if necessary
9181 """
9182 try:
9183 # NOTE(artom) We might have gotten here from _find_destination() in
9184 # the conductor live migrate task. At that point,
9185 # migration.dest_node is not set yet (nor should it be, we're still
9186 # looking for a destination, after all). Therefore, we cannot use
9187 # migration.dest_node here and must use self._get_nodename().
9188 claim = self.rt.live_migration_claim(
9189 ctxt, instance, self._get_nodename(instance), migration,
9190 limits, allocs)
9191 LOG.debug('Created live migration claim.', instance=instance)
9192 except exception.ComputeResourcesUnavailable as e:
9193 raise exception.MigrationPreCheckError(
9194 reason=e.format_message())
9195 return self.driver.post_claim_migrate_data(ctxt, instance,
9196 migrate_data, claim)
9198 def _source_can_numa_live_migrate(self, ctxt, dest_check_data,
9199 source_check_data):
9200 # TODO(artom) Our virt driver may have told us that it supports NUMA
9201 # live migration. However, the following other conditions must be met
9202 # for a NUMA live migration to happen:
9203 # 1. We got a True dst_supports_numa_live_migration in
9204 # dest_check_data, indicating that the dest virt driver supports
9205 # NUMA live migration and that the conductor can send RPC 5.3 and
9206 # that the destination compute manager can receive it.
9207 # 2. Ourselves, the source, can send RPC 5.3. There's no
9208 # sentinel/parameter for this, so we just ask our rpcapi directly.
9209 # If any of these are not met, we need to remove the
9210 # src_supports_numa_live_migration flag from source_check_data to avoid
9211 # incorrectly initiating a NUMA live migration.
9212 # All of this can be removed in RPC 6.0/objects 2.0.
9213 can_numa_live_migrate = (
9214 'dst_supports_numa_live_migration' in dest_check_data and
9215 dest_check_data.dst_supports_numa_live_migration and
9216 self.compute_rpcapi.supports_numa_live_migration(ctxt))
9217 if ('src_supports_numa_live_migration' in source_check_data and
9218 source_check_data.src_supports_numa_live_migration and
9219 not can_numa_live_migrate):
9220 delattr(source_check_data, 'src_supports_numa_live_migration')
9221 return source_check_data
9223 @wrap_exception()
9224 @wrap_instance_event(prefix='compute')
9225 @wrap_instance_fault
9226 def check_can_live_migrate_source(self, ctxt, instance, dest_check_data):
9227 """Check if it is possible to execute live migration.
9229 This checks if the live migration can succeed, based on the
9230 results from check_can_live_migrate_destination.
9232 :param ctxt: security context
9233 :param instance: dict of instance data
9234 :param dest_check_data: result of check_can_live_migrate_destination
9235 :returns: a LiveMigrateData object
9236 """
9237 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
9238 ctxt, instance.uuid)
9239 is_volume_backed = compute_utils.is_volume_backed_instance(
9240 ctxt, instance, bdms)
9241 dest_check_data.is_volume_backed = is_volume_backed
9242 block_device_info = self._get_instance_block_device_info(
9243 ctxt, instance, refresh_conn_info=False, bdms=bdms)
9244 result = self.driver.check_can_live_migrate_source(ctxt, instance,
9245 dest_check_data,
9246 block_device_info)
9247 result = self._source_can_numa_live_migrate(ctxt, dest_check_data,
9248 result)
9249 LOG.debug('source check data is %s', result)
9250 return result
9252 @wrap_exception()
9253 @wrap_instance_event(prefix='compute')
9254 @wrap_instance_fault
9255 def pre_live_migration(self, context, instance, disk, migrate_data):
9256 """Preparations for live migration at dest host.
9258 :param context: security context
9259 :param instance: dict of instance data
9260 :param disk: disk info of instance
9261 :param migrate_data: A dict or LiveMigrateData object holding data
9262 required for live migration without shared
9263 storage.
9264 :returns: migrate_data containing additional migration info
9265 """
9266 LOG.debug('pre_live_migration data is %s', migrate_data)
9268 # Error out if this host cannot accept the new instance due
9269 # to anti-affinity. At this point the migration is already in-progress,
9270 # so this is the definitive moment to abort due to the policy
9271 # violation. Also, it should be safe to explode here. The instance
9272 # status remains ACTIVE, migration status failed.
9273 self._validate_instance_group_policy(context, instance)
9275 migrate_data.old_vol_attachment_ids = {}
9276 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
9277 context, instance.uuid)
9278 network_info = self.network_api.get_instance_nw_info(context, instance)
9279 self._notify_about_instance_usage(
9280 context, instance, "live_migration.pre.start",
9281 network_info=network_info)
9282 compute_utils.notify_about_instance_action(
9283 context, instance, self.host,
9284 action=fields.NotificationAction.LIVE_MIGRATION_PRE,
9285 phase=fields.NotificationPhase.START, bdms=bdms)
9287 connector = None
9288 try:
9289 for bdm in bdms:
9290 if bdm.is_volume and bdm.attachment_id is not None:
9291 # This bdm uses the new cinder v3.44 API.
9292 # We will create a new attachment for this
9293 # volume on this migration destination host. The old
9294 # attachment will be deleted on the source host
9295 # when the migration succeeds. The old attachment_id
9296 # is stored in dict with the key being the bdm.volume_id
9297 # so it can be restored on rollback.
9298 #
9299 # Also note that attachment_update is not needed as we
9300 # are providing the connector in the create call.
9301 if connector is None:
9302 connector = self.driver.get_volume_connector(instance)
9303 attach_ref = self.volume_api.attachment_create(
9304 context, bdm.volume_id, bdm.instance_uuid,
9305 connector=connector, mountpoint=bdm.device_name)
9307 # save current attachment so we can detach it on success,
9308 # or restore it on a rollback.
9309 # NOTE(mdbooth): This data is no longer used by the source
9310 # host since change Ibe9215c0. We can't remove it until we
9311 # are sure the source host has been upgraded.
9312 migrate_data.old_vol_attachment_ids[bdm.volume_id] = \
9313 bdm.attachment_id
9315 # update the bdm with the new attachment_id.
9316 bdm.attachment_id = attach_ref['id']
9317 bdm.save()
9319 block_device_info = self._get_instance_block_device_info(
9320 context, instance, refresh_conn_info=True,
9321 bdms=bdms)
9323 # The driver pre_live_migration will plug vifs on the host
9324 migrate_data = self.driver.pre_live_migration(context,
9325 instance,
9326 block_device_info,
9327 network_info,
9328 disk,
9329 migrate_data)
9330 LOG.debug('driver pre_live_migration data is %s', migrate_data)
9331 # driver.pre_live_migration is what plugs vifs on the destination
9332 # host so now we can set the wait_for_vif_plugged flag in the
9333 # migrate_data object which the source compute will use to
9334 # determine if it should wait for a 'network-vif-plugged' event
9335 # from neutron before starting the actual guest transfer in the
9336 # hypervisor
9337 using_multiple_port_bindings = (
9338 'vifs' in migrate_data and migrate_data.vifs)
9339 migrate_data.wait_for_vif_plugged = (
9340 CONF.compute.live_migration_wait_for_vif_plug and
9341 using_multiple_port_bindings
9342 )
9344 # NOTE(tr3buchet): setup networks on destination host
9345 self.network_api.setup_networks_on_host(context, instance,
9346 self.host)
9348 # NOTE(lyarwood): The above call to driver.pre_live_migration
9349 # can result in the virt drivers attempting to stash additional
9350 # metadata into the connection_info of the underlying bdm.
9351 # Ensure this is saved to the database by calling .save() against
9352 # the driver BDMs we passed down via block_device_info.
9353 for driver_bdm in block_device_info['block_device_mapping']:
9354 driver_bdm.save()
9356 except Exception:
9357 # If we raise, migrate_data with the updated attachment ids
9358 # will not be returned to the source host for rollback.
9359 # So we need to rollback new attachments here.
9360 with excutils.save_and_reraise_exception():
9361 old_attachments = migrate_data.old_vol_attachment_ids
9362 for bdm in bdms:
9363 if (bdm.is_volume and bdm.attachment_id is not None and
9364 bdm.volume_id in old_attachments):
9365 self.volume_api.attachment_delete(context,
9366 bdm.attachment_id)
9367 bdm.attachment_id = old_attachments[bdm.volume_id]
9368 bdm.save()
9370 # Volume connections are complete, tell cinder that all the
9371 # attachments have completed.
9372 for bdm in bdms:
9373 if bdm.is_volume and bdm.attachment_id is not None:
9374 self.volume_api.attachment_complete(context,
9375 bdm.attachment_id)
9377 self._notify_about_instance_usage(
9378 context, instance, "live_migration.pre.end",
9379 network_info=network_info)
9380 compute_utils.notify_about_instance_action(
9381 context, instance, self.host,
9382 action=fields.NotificationAction.LIVE_MIGRATION_PRE,
9383 phase=fields.NotificationPhase.END, bdms=bdms)
9385 LOG.debug('pre_live_migration result data is %s', migrate_data)
9386 return migrate_data
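    # A minimal sketch (not part of this module) of the attachment
    # bookkeeping above for one cinder v3.44 style volume BDM, with
    # illustrative UUIDs:
    #
    #     # new attachment created on this (destination) host
    #     attach_ref = {'id': 'dest-attachment-uuid'}
    #     # old attachment remembered so the source can roll back if needed
    #     migrate_data.old_vol_attachment_ids = {
    #         'vol-uuid': 'src-attachment-uuid'}
    #     # the BDM now points at the destination attachment
    #     bdm.attachment_id = 'dest-attachment-uuid'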
9388 @staticmethod
9389 def _neutron_failed_live_migration_callback(event_name, instance):
9390 msg = ('Neutron reported failure during live migration '
9391 'with %(event)s for instance %(uuid)s')
9392 msg_args = {'event': event_name, 'uuid': instance.uuid}
9393 if CONF.vif_plugging_is_fatal:
9394 raise exception.VirtualInterfacePlugException(msg % msg_args)
9395 LOG.error(msg, msg_args)
9397 @staticmethod
9398 def _get_neutron_events_for_live_migration(instance):
9399 # We don't generate events if CONF.vif_plugging_timeout=0
9400 # meaning that the operator disabled using them.
9401 if CONF.vif_plugging_timeout:
9402 return (instance.get_network_info()
9403 .get_live_migration_plug_time_events())
9404 else:
9405 return []
9407 def _cleanup_pre_live_migration(self, context, dest, instance,
9408 migration, migrate_data, source_bdms):
9409 """Helper method for when pre_live_migration fails
9411 Sets the migration status to "error" and rolls back the live migration
9412 setup on the destination host.
9414 :param context: The user request context.
9415 :type context: nova.context.RequestContext
9416 :param dest: The live migration destination hostname.
9417 :type dest: str
9418 :param instance: The instance being live migrated.
9419 :type instance: nova.objects.Instance
9420 :param migration: The migration record tracking this live migration.
9421 :type migration: nova.objects.Migration
9422 :param migrate_data: Data about the live migration, populated from
9423 the destination host.
9424 :type migrate_data: Subclass of nova.objects.LiveMigrateData
9425 :param source_bdms: BDMs prior to modification by the destination
9426 compute host. Set by _do_live_migration and not
9427 part of the callback interface, so this is never
9428 None
9429 """
9430 self._set_migration_status(migration, 'error')
9431 # Make sure we set this for _rollback_live_migration()
9432 # so it can find it, as expected if it was called later
9433 migrate_data.migration = migration
9434 self._rollback_live_migration(context, instance, dest,
9435 migrate_data=migrate_data,
9436 source_bdms=source_bdms,
9437 pre_live_migration=True)
9439 def _do_pre_live_migration_from_source(self, context, dest, instance,
9440 block_migration, migration,
9441 migrate_data, source_bdms):
9442 """Prepares for pre-live-migration on the source host and calls dest
9444 Will setup a callback networking event handler (if configured) and
9445 then call the dest host's pre_live_migration method to prepare the
9446 dest host for live migration (plugs vifs, connect volumes, etc).
9448 _rollback_live_migration (on the source) will be called if
9449 pre_live_migration (on the dest) fails.
9451 :param context: nova auth request context for this operation
9452 :param dest: name of the destination compute service host
9453 :param instance: Instance object being live migrated
9454 :param block_migration: If true, prepare for block migration.
9455 :param migration: Migration object tracking this operation
9456 :param migrate_data: MigrateData object for this operation populated
9457 by the destination host compute driver as part of the
9458 check_can_live_migrate_destination call.
9459 :param source_bdms: BlockDeviceMappingList of BDMs currently attached
9460 to the instance from the source host.
9461 :returns: MigrateData object which is a modified version of the
9462 ``migrate_data`` argument from the compute driver on the dest
9463 host during the ``pre_live_migration`` call.
9464 :raises: MigrationError if waiting for the network-vif-plugged event
9465 timed out and is fatal.
9466 """
9467 class _BreakWaitForInstanceEvent(Exception):
9468 """Used as a signal to stop waiting for the network-vif-plugged
9469 event when we discover that
9470 [compute]/live_migration_wait_for_vif_plug is not set on the
9471 destination.
9472 """
9473 pass
9475 events = self._get_neutron_events_for_live_migration(instance)
9476 try:
9477 if ('block_migration' in migrate_data and
9478 migrate_data.block_migration):
9479 block_device_info = self._get_instance_block_device_info(
9480 context, instance, bdms=source_bdms)
9481 disk = self.driver.get_instance_disk_info(
9482 instance, block_device_info=block_device_info)
9483 else:
9484 disk = None
9486 deadline = CONF.vif_plugging_timeout
9487 error_cb = self._neutron_failed_live_migration_callback
9488 # In order to avoid a race with the vif plugging that the virt
9489 # driver does on the destination host, we register our events
9490 # to wait for before calling pre_live_migration. Then if the
9491 # dest host reports back that we shouldn't wait, we can break
9492 # out of the context manager using _BreakWaitForInstanceEvent.
9493 with self.virtapi.wait_for_instance_event(
9494 instance, events, deadline=deadline,
9495 error_callback=error_cb):
9496 with timeutils.StopWatch() as timer:
9497 # TODO(mriedem): The "block_migration" parameter passed
9498 # here is not actually used in pre_live_migration but it
9499 # is not optional in the RPC interface either.
9500 migrate_data = self.compute_rpcapi.pre_live_migration(
9501 context, instance,
9502 block_migration, disk, dest, migrate_data)
9503 LOG.info('Took %0.2f seconds for pre_live_migration on '
9504 'destination host %s.',
9505 timer.elapsed(), dest, instance=instance)
9506 wait_for_vif_plugged = (
9507 'wait_for_vif_plugged' in migrate_data and
9508 migrate_data.wait_for_vif_plugged)
9509 if events and not wait_for_vif_plugged:
9510 raise _BreakWaitForInstanceEvent
9511 except _BreakWaitForInstanceEvent:
9512            if events:
9513 LOG.debug('Not waiting for events after pre_live_migration: '
9514 '%s. ', events, instance=instance)
9515 except exception.VirtualInterfacePlugException:
9516 with excutils.save_and_reraise_exception():
9517 LOG.exception('Failed waiting for network virtual interfaces '
9518 'to be plugged on the destination host %s.',
9519 dest, instance=instance)
9520 self._cleanup_pre_live_migration(
9521 context, dest, instance, migration, migrate_data,
9522 source_bdms)
9523 except eventlet.timeout.Timeout:
9524 # We only get here if wait_for_vif_plugged is True which means
9525 # live_migration_wait_for_vif_plug=True on the destination host.
9526 msg = (
9527 'Timed out waiting for events: %(events)s. If these timeouts '
9528 'are a persistent issue it could mean the networking backend '
9529 'on host %(dest)s does not support sending these events '
9530 'unless there are port binding host changes which does not '
9531 'happen at this point in the live migration process. You may '
9532 'need to disable the live_migration_wait_for_vif_plug option '
9533 'on host %(dest)s.')
9534 subs = {'events': events, 'dest': dest}
9535 LOG.warning(msg, subs, instance=instance)
9536 if CONF.vif_plugging_is_fatal:
9537 self._cleanup_pre_live_migration(
9538 context, dest, instance, migration, migrate_data,
9539 source_bdms)
9540 raise exception.MigrationError(reason=msg % subs)
9541 except Exception:
9542 with excutils.save_and_reraise_exception():
9543 LOG.exception('Pre live migration failed at %s',
9544 dest, instance=instance)
9545 self._cleanup_pre_live_migration(
9546 context, dest, instance, migration, migrate_data,
9547 source_bdms)
9548 return migrate_data
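    # The _BreakWaitForInstanceEvent trick above is a generic Python pattern:
    # raising a private exception is one common way to leave a context
    # manager early from inside its body. A standalone sketch of the idea
    # (wait_for_events and do_preparation are illustrative names only):
    #
    #     class _Break(Exception):
    #         pass
    #
    #     try:
    #         with wait_for_events():          # any context manager
    #             result = do_preparation()
    #             if not result.should_wait:
    #                 raise _Break             # __exit__ still runs
    #     except _Break:
    #         pass                             # continue without waiting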
9550 def _do_live_migration(self, context, dest, instance, block_migration,
9551 migration, migrate_data):
9552 # NOTE(danms): We should enhance the RT to account for migrations
9553 # and use the status field to denote when the accounting has been
9554 # done on source/destination. For now, this is just here for status
9555 # reporting
9556 self._set_migration_status(migration, 'preparing')
9557 source_bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
9558 context, instance.uuid)
9560 migrate_data = self._do_pre_live_migration_from_source(
9561 context, dest, instance, block_migration, migration, migrate_data,
9562 source_bdms)
9564 # Set migrate_data.migration because that is how _post_live_migration
9565 # and _rollback_live_migration get the migration object for cleanup.
9566 # Yes this is gross but changing the _post_live_migration and
9567 # _rollback_live_migration interfaces would also mean changing how the
9568 # virt drivers call them from the driver.live_migration method, i.e.
9569 # we would have to pass the migration object through the driver (or
9570 # consider using a partial but some do not like that pattern).
9571 migrate_data.migration = migration
9573 # NOTE(Kevin_Zheng): Pop the migration from the waiting queue
9574        # if it exists in the queue, then we are good to move on; if
9575        # not, some other process must have aborted it and we should
9576        # roll back.
9577 try:
9578 self._waiting_live_migrations.pop(instance.uuid)
9579 except KeyError:
9580 LOG.debug('Migration %s aborted by another process, rollback.',
9581 migration.uuid, instance=instance)
9582 self._rollback_live_migration(context, instance, dest,
9583 migrate_data, 'cancelled',
9584 source_bdms=source_bdms)
9585 self._notify_live_migrate_abort_end(context, instance)
9586 return
9588 self._set_migration_status(migration, 'running')
9590 # NOTE(mdbooth): pre_live_migration will update connection_info and
9591 # attachment_id on all volume BDMS to reflect the new destination
9592 # host attachment. We fetch BDMs before that to retain connection_info
9593 # and attachment_id relating to the source host for post migration
9594 # cleanup.
9595 post_live_migration = functools.partial(
9596 self._post_live_migration_update_host, source_bdms=source_bdms
9597 )
9598 rollback_live_migration = functools.partial(
9599 self._rollback_live_migration, source_bdms=source_bdms)
9601 LOG.debug('live_migration data is %s', migrate_data)
9602 try:
9603 self.driver.live_migration(context, instance, dest,
9604 post_live_migration,
9605 rollback_live_migration,
9606 block_migration, migrate_data)
9607 except Exception:
9608 LOG.exception('Live migration failed.', instance=instance)
9609 with excutils.save_and_reraise_exception():
9610 # Put instance and migration into error state,
9611                # as it's almost certainly too late to roll back
9612 self._set_migration_status(migration, 'error')
9613 # first refresh instance as it may have got updated by
9614 # post_live_migration_at_destination
9615 instance.refresh()
9616 self._set_instance_obj_error_state(instance,
9617 clean_task_state=True)
9619 @wrap_exception()
9620 @wrap_instance_event(prefix='compute')
9621 @errors_out_migration
9622 @wrap_instance_fault
9623 def live_migration(self, context, dest, instance, block_migration,
9624 migration, migrate_data):
9625 """Executing live migration.
9627 :param context: security context
9628 :param dest: destination host
9629 :param instance: a nova.objects.instance.Instance object
9630 :param block_migration: if true, prepare for block migration
9631        :param migration: a nova.objects.Migration object
9632 :param migrate_data: implementation specific params
9634 """
9635 self._set_migration_status(migration, 'queued')
9636 # NOTE(Kevin_Zheng): Submit the live_migration job to the pool and
9637        # put the returned Future object into a dict keyed by instance.uuid
9638 # in order to be able to track and abort it in the future.
9639 self._waiting_live_migrations[instance.uuid] = (None, None)
9640 try:
9641 future = nova.utils.pass_context(
9642 self._live_migration_executor.submit,
9643 self._do_live_migration, context, dest, instance,
9644 block_migration, migration, migrate_data)
9645 self._waiting_live_migrations[instance.uuid] = (migration, future)
9646 except RuntimeError:
9647 # GreenThreadPoolExecutor.submit will raise RuntimeError if the
9648 # pool is shutdown, which happens in
9649 # _cleanup_live_migrations_in_pool.
9650 LOG.info('Migration %s failed to submit as the compute service '
9651 'is shutting down.', migration.uuid, instance=instance)
9652 raise exception.LiveMigrationNotSubmitted(
9653 migration_uuid=migration.uuid, instance_uuid=instance.uuid)
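    # The queued-migration tracking above follows a standard futures pattern
    # (nova uses a futurist green thread pool; the sketch below uses the
    # stdlib executor purely for illustration, with illustrative names):
    #
    #     from concurrent import futures
    #
    #     executor = futures.ThreadPoolExecutor(max_workers=1)
    #     waiting = {}
    #     future = executor.submit(do_live_migration, *args)
    #     waiting[instance_uuid] = (migration, future)
    #     ...
    #     # later, in the abort path:
    #     migration, future = waiting.pop(instance_uuid)
    #     if future.cancel():
    #         pass  # still queued: never ran, only control-plane cleanup needed
    #     else:
    #         pass  # already running: _do_live_migration handles the abort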
9655 @wrap_exception()
9656 @wrap_instance_event(prefix='compute')
9657 @wrap_instance_fault
9658 def live_migration_force_complete(self, context, instance):
9659 """Force live migration to complete.
9661 :param context: Security context
9662 :param instance: The instance that is being migrated
9663 """
9665 self._notify_about_instance_usage(
9666 context, instance, 'live.migration.force.complete.start')
9667 compute_utils.notify_about_instance_action(
9668 context, instance, self.host,
9669 action=fields.NotificationAction.LIVE_MIGRATION_FORCE_COMPLETE,
9670 phase=fields.NotificationPhase.START)
9671 self.driver.live_migration_force_complete(instance)
9672 self._notify_about_instance_usage(
9673 context, instance, 'live.migration.force.complete.end')
9674 compute_utils.notify_about_instance_action(
9675 context, instance, self.host,
9676 action=fields.NotificationAction.LIVE_MIGRATION_FORCE_COMPLETE,
9677 phase=fields.NotificationPhase.END)
9679 def _notify_live_migrate_abort_end(self, context, instance):
9680 self._notify_about_instance_usage(
9681 context, instance, 'live.migration.abort.end')
9682 compute_utils.notify_about_instance_action(
9683 context, instance, self.host,
9684 action=fields.NotificationAction.LIVE_MIGRATION_ABORT,
9685 phase=fields.NotificationPhase.END)
9687 @wrap_exception()
9688 @wrap_instance_event(prefix='compute')
9689 @wrap_instance_fault
9690 def live_migration_abort(self, context, instance, migration_id):
9691 """Abort an in-progress live migration.
9693 :param context: Security context
9694 :param instance: The instance that is being migrated
9695 :param migration_id: ID of in-progress live migration
9697 """
9698 self._notify_about_instance_usage(
9699 context, instance, 'live.migration.abort.start')
9700 compute_utils.notify_about_instance_action(
9701 context, instance, self.host,
9702 action=fields.NotificationAction.LIVE_MIGRATION_ABORT,
9703 phase=fields.NotificationPhase.START)
9704        # NOTE(Kevin_Zheng): Pop the migration out of the queue; this might
9705 # lead to 3 scenarios:
9706 # 1. The selected migration is still in queue, and the future.cancel()
9707 # succeed, then the abort action is succeed, mark the migration
9708 # status to 'cancelled'.
9709 # 2. The selected migration is still in queue, but the future.cancel()
9710        #    failed; that means _do_live_migration() has started executing and
9711        #    the migration status is 'preparing', so we just pop it from the
9712        #    queue, and the migration process will handle it later. And the
9713 # migration status couldn't be 'running' in this scenario because
9714 # if _do_live_migration has started executing and we've already
9715 # popped it from the queue and set the migration status to
9716 # 'running' at this point, popping it here will raise KeyError at
9717 # which point we check if it's running and if so, we abort the old
9718 # way.
9719 # 3. The selected migration is not in the queue, then the migration
9720 # status is 'running', let the driver handle it.
9721 try:
9722 migration, future = (
9723 self._waiting_live_migrations.pop(instance.uuid))
9724            if future and future.cancel():
9725 # If we got here, we've successfully dropped a queued
9726 # migration from the queue, so _do_live_migration won't run
9727 # and we only need to revert minor changes introduced by Nova
9728 # control plane (port bindings, resource allocations and
9729 # instance's PCI devices), restore VM's state, set the
9730 # migration's status to cancelled and send the notification.
9731 # If Future.cancel() fails, it means _do_live_migration is
9732 # running and the migration status is preparing, and
9733 # _do_live_migration() itself will attempt to pop the queued
9734 # migration, hit a KeyError, and rollback, set the migration
9735 # to cancelled and send the live.migration.abort.end
9736 # notification.
9737 self._revert_allocation(context, instance, migration)
9738 try:
9739 # This call will delete any inactive destination host
9740 # port bindings.
9741 self.network_api.setup_networks_on_host(
9742 context, instance, host=migration.dest_compute,
9743 teardown=True)
9744 except exception.PortBindingDeletionFailed as e:
9745 # Removing the inactive port bindings from the destination
9746 # host is not critical so just log an error but don't fail.
9747 LOG.error(
9748 'Network cleanup failed for destination host %s '
9749 'during live migration rollback. You may need to '
9750 'manually clean up resources in the network service. '
9751 'Error: %s', migration.dest_compute, str(e))
9752 except Exception:
9753 with excutils.save_and_reraise_exception():
9754 LOG.exception(
9755 'An error occurred while cleaning up networking '
9756 'during live migration rollback.',
9757 instance=instance)
9758 instance.task_state = None
9759 instance.save(expected_task_state=[task_states.MIGRATING])
9760 self._set_migration_status(migration, 'cancelled')
9761 except KeyError:
9762 migration = objects.Migration.get_by_id(context, migration_id)
9763 if migration.status != 'running':
9764 raise exception.InvalidMigrationState(
9765 migration_id=migration_id, instance_uuid=instance.uuid,
9766 state=migration.status, method='abort live migration')
9767 self.driver.live_migration_abort(instance)
9768 self._notify_live_migrate_abort_end(context, instance)
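# NOTE(editor): Illustrative sketch, not part of this module. The three
# abort scenarios described above hinge on Future.cancel() semantics:
# cancel() only succeeds while the submitted job is still queued; once a
# worker has started it (the 'preparing' case), cancel() returns False and
# the running job must notice the abort itself. A minimal stand-alone
# demonstration using the stdlib executor instead of nova's futurist pool;
# waiting_migrations is a hypothetical stand-in for
# self._waiting_live_migrations.
import time
from concurrent.futures import ThreadPoolExecutor

def _do_live_migration_stub(uuid):
    time.sleep(0.1)  # stand-in for the real migration work
    return uuid

executor = ThreadPoolExecutor(max_workers=1)
waiting_migrations = {
    # only one worker, so the second submission stays queued
    'uuid-1': executor.submit(_do_live_migration_stub, 'uuid-1'),
    'uuid-2': executor.submit(_do_live_migration_stub, 'uuid-2'),
}

future = waiting_migrations.pop('uuid-2')
if future.cancel():
    # scenario 1: still queued, the abort succeeds immediately
    print('uuid-2 cancelled while queued')
else:
    # scenario 2: already executing, fall back to aborting the running job
    print('uuid-2 already running, falling back to driver abort')
executor.shutdown(wait=True)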
9770 def _live_migration_cleanup_flags(self, migrate_data, migr_ctxt=None):
9771 """Determine whether disks, instance path or other resources
9772 need to be cleaned up after live migration (at source on success,
9773 at destination on rollback)
9775 Block migration needs an empty image at the destination host before the
9776 migration starts, so if any failure occurs those empty images have to be deleted.
9778 Volume-backed live migration without shared storage also needs to delete the
9779 newly created instance-xxx dir on the destination as part of its
9780 rollback process.
9782 There may be other resources which need cleanup; currently this is
9783 limited to vPMEM and mdev devices with the libvirt driver.
9785 :param migrate_data: implementation specific data
9786 :param migr_ctxt: specific resources stored in migration_context
9787 :returns: (bool, bool) -- do_cleanup, destroy_disks
9788 """
9789 # NOTE(pkoniszewski): block migration specific params are set inside
9790 # migrate_data objects for drivers that expose block live migration
9791 # information (i.e. Libvirt, HyperV). For other drivers cleanup is not
9792 # needed.
9793 do_cleanup = False
9794 destroy_disks = False
9795 if isinstance(migrate_data, migrate_data_obj.LibvirtLiveMigrateData):
9796 has_vpmem = False
9797 if migr_ctxt and migr_ctxt.old_resources:
9798 for resource in migr_ctxt.old_resources: 9798 ↛ 9804 (line 9798 didn't jump to line 9804 because the loop on line 9798 didn't complete)
9799 if ('metadata' in resource and 9799 ↛ 9798 (line 9799 didn't jump to line 9798 because the condition on line 9799 was always true)
9800 isinstance(resource.metadata,
9801 objects.LibvirtVPMEMDevice)):
9802 has_vpmem = True
9803 break
9804 has_mdevs = 'target_mdevs' in migrate_data
9805 power_management_possible = (
9806 'dst_numa_info' in migrate_data and
9807 migrate_data.dst_numa_info is not None)
9808 # No instance is booting at the source host, but the instance dir
9809 # must be deleted to prepare for the next block migration or the
9810 # next live migration without shared storage.
9811 # vpmem (and likewise mdevs and power-managed resources) must also
9812 # be cleaned up.
9813 do_cleanup = (not migrate_data.is_shared_instance_path or
9814 has_vpmem or has_mdevs or power_management_possible)
9815 destroy_disks = not (
9816 migrate_data.is_shared_block_storage or
9817 migrate_data.is_shared_instance_path)
9818 elif isinstance(migrate_data, migrate_data_obj.HyperVLiveMigrateData):
9819 # NOTE(claudiub): We need to cleanup any zombie Planned VM.
9820 do_cleanup = True
9821 destroy_disks = not migrate_data.is_shared_instance_path
9823 return (do_cleanup, destroy_disks)
9825 def _post_live_migration_remove_source_vol_connections(
9826 self, context, instance, source_bdms):
9827 """Disconnect volume connections from the source host during
9828 _post_live_migration.
9830 :param context: nova auth RequestContext
9831 :param instance: Instance object being live migrated
9832 :param source_bdms: BlockDeviceMappingList representing the attached
9833 volumes with connection_info set for the source host
9834 """
9835 # Detaching volumes.
9836 connector = None
9837 for bdm in source_bdms:
9838 if bdm.is_volume:
9839 # Detaching volumes is a call to an external API that can fail.
9840 # If it does, we need to handle it gracefully so that the call
9841 # to post_live_migration_at_destination - where we set instance
9842 # host and task state - still happens. We need to rethink the
9843 # current approach of setting instance host and task state
9844 # AFTER a whole bunch of things that could fail in unhandled
9845 # ways, but that is left as a TODO(artom).
9846 try:
9847 if bdm.attachment_id is None:
9848 # Prior to cinder v3.44:
9849 # We don't want to actually mark the volume detached,
9850 # or delete the bdm, just remove the connection from
9851 # this host.
9852 #
9853 # remove the volume connection without detaching from
9854 # hypervisor because the instance is not running
9855 # anymore on the current host
9856 if connector is None: 9856 ↛ 9859 (line 9856 didn't jump to line 9859 because the condition on line 9856 was always true)
9857 connector = self.driver.get_volume_connector(
9858 instance)
9859 self.volume_api.terminate_connection(context,
9860 bdm.volume_id,
9861 connector)
9862 else:
9863 # cinder v3.44 api flow - delete the old attachment
9864 # for the source host
9865 self.volume_api.attachment_delete(context,
9866 bdm.attachment_id)
9868 except Exception as e:
9869 if bdm.attachment_id is None:
9870 LOG.error('Connection for volume %s not terminated on '
9871 'source host %s during post_live_migration: '
9872 '%s', bdm.volume_id, self.host,
9873 str(e), instance=instance)
9874 else:
9875 LOG.error('Volume attachment %s not deleted on source '
9876 'host %s during post_live_migration: %s',
9877 bdm.attachment_id, self.host,
9878 str(e), instance=instance)
9880 # TODO(sean-k-mooney): add typing
9881 def _post_live_migration_update_host(
9882 self, ctxt, instance, dest, block_migration=False,
9883 migrate_data=None, source_bdms=None
9884 ):
9885 try:
9886 self._post_live_migration(
9887 ctxt, instance, dest, block_migration, migrate_data,
9888 source_bdms)
9889 except Exception:
9890 # Restore the instance object
9891 compute_node = None
9892 node_name = None
9893 try:
9894 # get node name of compute, where instance will be
9895 # running after migration, that is destination host
9896 compute_node = self._get_compute_info(ctxt, dest)
9897 node_name = compute_node.hypervisor_hostname
9898 except exception.ComputeHostNotFound:
9899 LOG.exception('Failed to get compute_info for %s', dest)
9901 # we can never rollback from post live migration and we can only
9902 # get here if the instance is running on the dest so we ensure
9903 # the instance.host is set correctly and reraise the original
9904 # exception unmodified.
9905 if instance.host != dest: 9905 ↛ 9916 (line 9905 didn't jump to line 9916 because the condition on line 9905 was always true)
9906 # apply saves the new fields while drop actually removes the
9907 # migration context from the instance, so migration persists.
9908 instance.apply_migration_context()
9909 instance.drop_migration_context()
9910 instance.host = dest
9911 instance.task_state = None
9912 instance.node = node_name
9913 instance.compute_id = compute_node and compute_node.id or None
9914 instance.progress = 0
9915 instance.save()
9916 raise
9918 @wrap_exception()
9919 @wrap_instance_fault
9920 def _post_live_migration(self, ctxt, instance, dest,
9921 block_migration=False, migrate_data=None,
9922 source_bdms=None):
9923 """Post operations for live migration.
9925 This method is called on the live migration source host and mainly
9926 updates the database record.
9928 :param ctxt: security context
9929 :param instance: instance object
9930 :param dest: destination host
9931 :param block_migration: if true, prepare for block migration
9932 :param migrate_data: if not None, it is a dict which has data
9933 required for live migration without shared storage
9934 :param source_bdms: BDMs prior to modification by the destination
9935 compute host. Set by _do_live_migration and not
9936 part of the callback interface, so this is never
9937 None
9939 """
9940 LOG.info('_post_live_migration() is started..',
9941 instance=instance)
9943 # NOTE(artom) The ordering and exception handling are important here.
9944 # We want to immediately activate the port bindings on the destination
9945 # host to minimize network downtime. This happens in
9946 # migrate_instance_start(). It's also crucial that we call
9947 # _post_live_migration_remove_source_vol_connections() to clean up
9948 # source volume connections and prevent potential data leaks. We
9949 # therefore activate the port bindings in a try block, and, regardless
9950 # of any exceptions during that process, clean up volume connections in
9951 # a finally block.
9952 try:
9953 # NOTE(artom) At this point in time we have not bound the ports to
9954 # the destination host yet (this happens in
9955 # migrate_instance_start() below). Therefore, the "old" source
9956 # network info that's still in the instance info cache is safe to
9957 # use here, since it'll be used below during
9958 # driver.post_live_migration_at_source() to unplug the VIFs on the
9959 # source.
9960 network_info = instance.get_network_info()
9962 self._notify_about_instance_usage(ctxt, instance,
9963 "live_migration._post.start",
9964 network_info=network_info,
9965 best_effort=True)
9966 compute_utils.notify_about_instance_action(
9967 ctxt, instance, self.host,
9968 action=fields.NotificationAction.LIVE_MIGRATION_POST,
9969 phase=fields.NotificationPhase.START, best_effort=True)
9971 migration = objects.Migration(
9972 source_compute=self.host, dest_compute=dest,
9973 )
9974 # For neutron, migrate_instance_start will activate the destination
9975 # host port bindings, if there are any created by conductor before
9976 # live migration started.
9977 self.network_api.migrate_instance_start(ctxt, instance, migration)
9978 finally:
9979 # Cleanup source host post live-migration
9980 block_device_info = self._get_instance_block_device_info(
9981 ctxt, instance, bdms=source_bdms)
9982 self.driver.post_live_migration(ctxt, instance, block_device_info,
9983 migrate_data)
9985 # Disconnect volumes from this (the source) host.
9986 self._post_live_migration_remove_source_vol_connections(
9987 ctxt, instance, source_bdms)
9989 destroy_vifs = False
9990 try:
9991 # It's possible that the vif type changed on the destination
9992 # host and is already bound and active, so we need to use the
9993 # stashed source vifs in migrate_data.vifs (if present) to unplug
9994 # on the source host.
9995 unplug_nw_info = network_info
9996 if migrate_data and 'vifs' in migrate_data:
9997 nw_info = []
9998 for migrate_vif in migrate_data.vifs:
9999 nw_info.append(migrate_vif.source_vif)
10000 unplug_nw_info = network_model.NetworkInfo.hydrate(nw_info)
10001 LOG.debug('Calling driver.post_live_migration_at_source '
10002 'with original source VIFs from migrate_data: %s',
10003 unplug_nw_info, instance=instance)
10004 self.driver.post_live_migration_at_source(ctxt, instance,
10005 unplug_nw_info)
10006 except NotImplementedError as ex:
10007 LOG.debug(ex, instance=instance)
10008 # For all hypervisors other than libvirt, there is a possibility
10009 # they are unplugging networks from source node in the cleanup
10010 # method
10011 destroy_vifs = True
10013 # Free instance allocations on source before claims are allocated on
10014 # destination node
10015 self.rt.free_pci_device_allocations_for_instance(ctxt, instance)
10016 # NOTE(danms): Save source node before calling post method on
10017 # destination, which will update it
10018 source_node = instance.node
10020 do_cleanup, destroy_disks = self._live_migration_cleanup_flags(
10021 migrate_data, migr_ctxt=instance.migration_context)
10023 if do_cleanup:
10024 LOG.debug('Calling driver.cleanup from _post_live_migration',
10025 instance=instance)
10026 self.driver.cleanup(ctxt, instance, unplug_nw_info,
10027 destroy_disks=destroy_disks,
10028 migrate_data=migrate_data,
10029 destroy_vifs=destroy_vifs)
10031 # Define domain at destination host, without doing it,
10032 # pause/suspend/terminate do not work.
10033 post_at_dest_success = True
10034 try:
10035 self.compute_rpcapi.post_live_migration_at_destination(ctxt,
10036 instance, block_migration, dest)
10037 except Exception as error:
10038 post_at_dest_success = False
10039 # If post_live_migration_at_destination() fails, we now have the
10040 # _post_live_migration_update_host() method that will handle
10041 # this case.
10042 LOG.exception("Post live migration at destination %s failed",
10043 dest, instance=instance, error=error)
10044 raise
10046 self.instance_events.clear_events_for_instance(instance)
10048 # NOTE(timello): make sure we update available resources on source
10049 # host even before next periodic task.
10050 self.update_available_resource(ctxt)
10052 self._update_scheduler_instance_info(ctxt, instance)
10053 self._notify_about_instance_usage(ctxt, instance,
10054 "live_migration._post.end",
10055 network_info=network_info)
10056 compute_utils.notify_about_instance_action(
10057 ctxt, instance, self.host,
10058 action=fields.NotificationAction.LIVE_MIGRATION_POST,
10059 phase=fields.NotificationPhase.END)
10060 if post_at_dest_success: 10060 ↛ 10064 (line 10060 didn't jump to line 10064 because the condition on line 10060 was always true)
10061 LOG.info('Migrating instance to %s finished successfully.',
10062 dest, instance=instance)
10064 self._clean_instance_console_tokens(ctxt, instance)
10065 if migrate_data and migrate_data.obj_attr_is_set('migration'):
10066 migrate_data.migration.status = 'completed'
10067 migrate_data.migration.save()
10068 self._delete_allocation_after_move(ctxt,
10069 instance,
10070 migrate_data.migration)
10071 else:
10072 # We didn't have data on a migration, which means we can't
10073 # look up to see if we had new-style migration-based
10074 # allocations. This should really only happen in cases of
10075 # a buggy virt driver. Log a warning so we know it happened.
10076 LOG.warning('Live migration ended with no migrate_data '
10077 'record. Unable to clean up migration-based '
10078 'allocations for node %s which is almost certainly '
10079 'not an expected situation.', source_node,
10080 instance=instance)
10082 def _consoles_enabled(self):
10083 """Return whether any console is enabled."""
10084 return (CONF.vnc.enabled or CONF.spice.enabled or
10085 CONF.serial_console.enabled or
10086 CONF.mks.enabled)
10088 def _clean_instance_console_tokens(self, ctxt, instance):
10089 """Clean console tokens stored for an instance."""
10090 # If the database backend isn't in use, don't bother trying to clean
10091 # tokens.
10092 if self._consoles_enabled():
10093 objects.ConsoleAuthToken.\
10094 clean_console_auths_for_instance(ctxt, instance.uuid)
10096 @wrap_exception()
10097 @wrap_instance_event(prefix='compute')
10098 @wrap_instance_fault
10099 def post_live_migration_at_destination(self, context, instance,
10100 block_migration):
10101 """Post operations for live migration.
10103 :param context: security context
10104 :param instance: Instance dict
10105 :param block_migration: if true, prepare for block migration
10107 """
10108 LOG.info('Post operation of migration started',
10109 instance=instance)
10111 # NOTE(tr3buchet): setup networks on destination host
10112 # this is called a second time because
10113 # multi_host does not create the bridge in
10114 # plug_vifs
10115 # NOTE(mriedem): This is a no-op for neutron.
10116 self.network_api.setup_networks_on_host(context, instance,
10117 self.host)
10118 migration = objects.Migration(
10119 source_compute=instance.host,
10120 dest_compute=self.host,
10121 migration_type=fields.MigrationType.LIVE_MIGRATION)
10122 self.network_api.migrate_instance_finish(
10123 context, instance, migration, provider_mappings=None)
10125 network_info = self.network_api.get_instance_nw_info(context, instance)
10126 self._notify_about_instance_usage(
10127 context, instance, "live_migration.post.dest.start",
10128 network_info=network_info)
10129 compute_utils.notify_about_instance_action(context, instance,
10130 self.host,
10131 action=fields.NotificationAction.LIVE_MIGRATION_POST_DEST,
10132 phase=fields.NotificationPhase.START)
10133 block_device_info = self._get_instance_block_device_info(context,
10134 instance)
10135 # Allocate the claimed PCI resources at destination.
10136 self.rt.allocate_pci_devices_for_instance(context, instance)
10138 try:
10139 self.driver.post_live_migration_at_destination(
10140 context, instance, network_info, block_migration,
10141 block_device_info)
10142 except Exception:
10143 with excutils.save_and_reraise_exception():
10144 instance.vm_state = vm_states.ERROR
10145 LOG.error('Unexpected error during post live migration at '
10146 'destination host.', instance=instance)
10147 finally:
10148 # Restore instance state and update host
10149 current_power_state = self._get_power_state(instance)
10150 compute_node = None
10151 node_name = None
10152 prev_host = instance.host
10153 try:
10154 compute_node = self._get_compute_info(context, self.host)
10155 node_name = compute_node.hypervisor_hostname
10156 except exception.ComputeHostNotFound:
10157 LOG.exception('Failed to get compute_info for %s', self.host)
10158 finally:
10159 # NOTE(artom) We need to apply the migration context here
10160 # regardless of whether the driver's
10161 # post_live_migration_at_destination succeeded or not: the
10162 # instance is on the destination, potentially with a new NUMA
10163 # topology and resource usage. We need to persist that.
10164 # NOTE(artom) Apply followed by drop looks weird, but apply
10165 # just saves the new fields while drop actually removes the
10166 # migration context from the instance.
10167 instance.apply_migration_context()
10168 instance.drop_migration_context()
10169 instance.host = self.host
10170 instance.power_state = current_power_state
10171 instance.task_state = None
10172 instance.node = node_name
10173 instance.compute_id = compute_node and compute_node.id or None
10174 instance.progress = 0
10175 instance.save(expected_task_state=task_states.MIGRATING)
10177 # NOTE(tr3buchet): tear down networks on source host (nova-net)
10178 # NOTE(mriedem): For neutron, this will delete any inactive source
10179 # host port bindings.
10180 try:
10181 self.network_api.setup_networks_on_host(context, instance,
10182 prev_host, teardown=True)
10183 except exception.PortBindingDeletionFailed as e:
10184 # Removing the inactive port bindings from the source host is not
10185 # critical so just log an error but don't fail.
10186 LOG.error('Network cleanup failed for source host %s during post '
10187 'live migration. You may need to manually clean up '
10188 'resources in the network service. Error: %s',
10189 prev_host, str(e))
10190 # NOTE(vish): this is necessary to update dhcp for nova-network
10191 # NOTE(mriedem): This is a no-op for neutron.
10192 self.network_api.setup_networks_on_host(context, instance, self.host)
10193 self._notify_about_instance_usage(
10194 context, instance, "live_migration.post.dest.end",
10195 network_info=network_info)
10196 compute_utils.notify_about_instance_action(context, instance,
10197 self.host,
10198 action=fields.NotificationAction.LIVE_MIGRATION_POST_DEST,
10199 phase=fields.NotificationPhase.END)
10201 def _remove_remote_volume_connections(self, context, dest, bdms, instance):
10202 """Rollback remote volume connections on the dest"""
10203 for bdm in bdms:
10204 try:
10205 # remove the connection on the destination host
10206 # NOTE(lyarwood): This actually calls the cinderv2
10207 # os-terminate_connection API if required.
10208 self.compute_rpcapi.remove_volume_connection(
10209 context, instance, bdm.volume_id, dest)
10210 except Exception:
10211 LOG.warning("Ignoring exception while attempting "
10212 "to rollback volume connections for "
10213 "volume %s on host %s.", bdm.volume_id,
10214 dest, instance=instance)
10216 def _rollback_volume_bdms(self, context, bdms, original_bdms, instance):
10217 """Rollback the connection_info and attachment_id for each bdm"""
10218 original_bdms_by_volid = {bdm.volume_id: bdm for bdm in original_bdms
10219 if bdm.is_volume}
10220 for bdm in bdms:
10221 try:
10222 original_bdm = original_bdms_by_volid[bdm.volume_id]
10223 # NOTE(lyarwood): Only delete the referenced attachment if it
10224 # is different to the original in order to avoid accidentally
10225 # removing the source host volume attachment after it has
10226 # already been rolled back by a failure in pre_live_migration.
10227 if (bdm.attachment_id and original_bdm.attachment_id and
10228 bdm.attachment_id != original_bdm.attachment_id):
10229 # NOTE(lyarwood): 3.44 cinder api flow. Delete the
10230 # attachment used by the bdm and reset it to that of
10231 # the original bdm.
10232 self.volume_api.attachment_delete(context,
10233 bdm.attachment_id)
10234 bdm.attachment_id = original_bdm.attachment_id
10235 # NOTE(lyarwood): Reset the connection_info to the original
10236 bdm.connection_info = original_bdm.connection_info
10237 bdm.save()
10238 except cinder_exception.ClientException:
10239 LOG.warning("Ignoring cinderclient exception when "
10240 "attempting to delete attachment %s for volume "
10241 "%s while rolling back volume bdms.",
10242 bdm.attachment_id, bdm.volume_id,
10243 instance=instance)
10244 except Exception:
10245 with excutils.save_and_reraise_exception():
10246 LOG.exception("Exception while attempting to rollback "
10247 "BDM for volume %s.", bdm.volume_id,
10248 instance=instance)
10250 @wrap_exception()
10251 @wrap_instance_fault
10252 def _rollback_live_migration(self, context, instance,
10253 dest, migrate_data=None,
10254 migration_status='failed',
10255 source_bdms=None,
10256 pre_live_migration=False):
10257 """Recovers Instance/volume state from migrating -> running.
10259 :param context: security context
10260 :param instance: nova.objects.instance.Instance object
10261 :param dest: destination host. This method is called on the live
10262 migration source host; this param specifies the host the instance
10263 was being migrated to.
10264 :param migrate_data:
10265 if not none, contains implementation specific data.
10266 :param migration_status:
10267 Contains the status we want to set for the migration object
10268 :param source_bdms: BDMs prior to modification by the destination
10269 compute host. Set by _do_live_migration and not
10270 part of the callback interface, so this is never
10271 None
10273 """
10274 # NOTE(gibi): We need to refresh pci_requests of the instance as it
10275 # might be changed by the conductor during scheduling based on the
10276 # selected destination host. If the instance has SRIOV ports with
10277 # resource request then the LiveMigrationTask._find_destination call
10278 # updated the instance.pci_requests.requests[].spec with the SRIOV PF
10279 # device name to be used on the destination host. As the migration is
10280 # rolling back to the source host now we don't want to persist the
10281 # destination host related changes in the DB.
10282 instance.pci_requests = \
10283 objects.InstancePCIRequests.get_by_instance_uuid(
10284 context, instance.uuid)
10286 if (isinstance(migrate_data, migrate_data_obj.LiveMigrateData) and
10287 migrate_data.obj_attr_is_set('migration')):
10288 migration = migrate_data.migration
10289 else:
10290 migration = None
10292 if migration:
10293 # Remove allocations created in Placement for the dest node.
10294 # If migration is None, the virt driver didn't pass it which is
10295 # a bug.
10296 self._revert_allocation(context, instance, migration)
10297 else:
10298 LOG.error('Unable to revert allocations during live migration '
10299 'rollback; compute driver did not provide migrate_data',
10300 instance=instance)
10302 # NOTE(tr3buchet): setup networks on source host (really it's re-setup
10303 # for nova-network)
10304 # NOTE(mriedem): This is a no-op for neutron.
10305 self.network_api.setup_networks_on_host(context, instance, self.host)
10307 # NOTE(erlon): We should make sure that rollback_live_migration_at_src
10308 # is not called in the pre_live_migration rollback as that will trigger
10309 # the src host to re-attach interfaces which were not detached
10310 # previously.
10311 if not pre_live_migration:
10312 self.driver.rollback_live_migration_at_source(context, instance,
10313 migrate_data)
10315 # NOTE(lyarwood): Fetch the current list of BDMs, disconnect any
10316 # connected volumes from the dest and delete any volume attachments
10317 # used by the destination host before rolling back to the original
10318 # still valid source host volume attachments.
10319 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
10320 context, instance.uuid)
10321 # TODO(lyarwood): Turn the following into a lookup method within
10322 # BlockDeviceMappingList.
10323 vol_bdms = [bdm for bdm in bdms if bdm.is_volume]
10324 self._remove_remote_volume_connections(context, dest, vol_bdms,
10325 instance)
10326 self._rollback_volume_bdms(context, vol_bdms, source_bdms, instance)
10328 self._notify_about_instance_usage(context, instance,
10329 "live_migration._rollback.start")
10330 compute_utils.notify_about_instance_action(context, instance,
10331 self.host,
10332 action=fields.NotificationAction.LIVE_MIGRATION_ROLLBACK,
10333 phase=fields.NotificationPhase.START,
10334 bdms=bdms)
10336 do_cleanup, destroy_disks = self._live_migration_cleanup_flags(
10337 migrate_data, migr_ctxt=instance.migration_context)
10339 if do_cleanup:
10340 self.compute_rpcapi.rollback_live_migration_at_destination(
10341 context, instance, dest, destroy_disks=destroy_disks,
10342 migrate_data=migrate_data)
10343 else:
10344 # The port binding profiles need to be cleaned up.
10345 with errors_out_migration_ctxt(migration):
10346 try:
10347 # This call will delete any inactive destination host
10348 # port bindings.
10349 self.network_api.setup_networks_on_host(
10350 context, instance, host=dest, teardown=True)
10351 except exception.PortBindingDeletionFailed as e:
10352 # Removing the inactive port bindings from the destination
10353 # host is not critical so just log an error but don't fail.
10354 LOG.error(
10355 'Network cleanup failed for destination host %s '
10356 'during live migration rollback. You may need to '
10357 'manually clean up resources in the network service. '
10358 'Error: %s', dest, str(e))
10359 except Exception:
10360 with excutils.save_and_reraise_exception():
10361 LOG.exception(
10362 'An error occurred while cleaning up networking '
10363 'during live migration rollback.',
10364 instance=instance)
10366 # NOTE(luyao): We drop move_claim and migration_context after cleanup
10367 # is complete, to ensure the specific resources claimed on destination
10368 # are released safely.
10369 # TODO(artom) drop_move_claim_at_destination() is new in RPC 5.3, only
10370 # call it if we performed a NUMA-aware live migration (which implies us
10371 # being able to send RPC 5.3). To check this, we can use the
10372 # src_supports_numa_live_migration flag, as it will be set if and only
10373 # if:
10374 # - dst_supports_numa_live_migration made its way to the source
10375 # (meaning both dest and source are new and conductor can speak
10376 # RPC 5.3)
10377 # - src_supports_numa_live_migration was set by the source driver and
10378 # passed the send-RPC-5.3 check.
10379 # This check can be removed in RPC 6.0.
10380 if ('src_supports_numa_live_migration' in migrate_data and
10381 migrate_data.src_supports_numa_live_migration):
10382 LOG.debug('Calling destination to drop move claim.',
10383 instance=instance)
10384 self.compute_rpcapi.drop_move_claim_at_destination(context,
10385 instance, dest)
10387 # NOTE(luyao): We only update instance info after rollback operations
10388 # are complete
10389 instance.task_state = None
10390 instance.progress = 0
10391 instance.drop_migration_context()
10392 instance.save(expected_task_state=[task_states.MIGRATING])
10394 self._notify_about_instance_usage(context, instance,
10395 "live_migration._rollback.end")
10396 compute_utils.notify_about_instance_action(context, instance,
10397 self.host,
10398 action=fields.NotificationAction.LIVE_MIGRATION_ROLLBACK,
10399 phase=fields.NotificationPhase.END,
10400 bdms=bdms)
10402 # NOTE(luyao): we have cleaned up everything and brought the instance
10403 # back to normal status; now set the migration status to 'failed'
10404 self._set_migration_status(migration, migration_status)
10406 @wrap_exception()
10407 @wrap_instance_fault
10408 def drop_move_claim_at_destination(self, context, instance):
10409 """Called by the source of a live migration during rollback to ask the
10410 destination to drop the MoveClaim object that was created for the live
10411 migration on the destination.
10412 """
10413 nodename = self._get_nodename(instance)
10414 LOG.debug('Dropping live migration resource claim on destination '
10415 'node %s', nodename, instance=instance)
10416 self.rt.drop_move_claim(
10417 context, instance, nodename, flavor=instance.flavor)
10419 @wrap_exception()
10420 @wrap_instance_event(prefix='compute')
10421 @wrap_instance_fault
10422 def rollback_live_migration_at_destination(self, context, instance,
10423 destroy_disks,
10424 migrate_data):
10425 """Clean up the image directory that was created by pre_live_migration.
10427 :param context: security context
10428 :param instance: a nova.objects.instance.Instance object sent over rpc
10429 :param destroy_disks: whether to destroy volumes or not
10430 :param migrate_data: contains migration info
10431 """
10432 network_info = self.network_api.get_instance_nw_info(context, instance)
10433 self._notify_about_instance_usage(
10434 context, instance, "live_migration.rollback.dest.start",
10435 network_info=network_info)
10436 compute_utils.notify_about_instance_action(
10437 context, instance, self.host,
10438 action=fields.NotificationAction.LIVE_MIGRATION_ROLLBACK_DEST,
10439 phase=fields.NotificationPhase.START)
10440 try:
10441 # NOTE(tr3buchet): tear down networks on dest host (nova-net)
10442 # NOTE(mriedem): For neutron, this call will delete any
10443 # destination host port bindings.
10444 # TODO(mriedem): We should eventually remove this call from
10445 # this method (rollback_live_migration_at_destination) since this
10446 # method is only called conditionally based on whether or not the
10447 # instance is running on shared storage. _rollback_live_migration
10448 # already calls this method for neutron if we are running on
10449 # shared storage.
10450 self.network_api.setup_networks_on_host(context, instance,
10451 self.host, teardown=True)
10452 except exception.PortBindingDeletionFailed as e:
10453 # Removing the inactive port bindings from the destination
10454 # host is not critical so just log an error but don't fail.
10455 LOG.error(
10456 'Network cleanup failed for destination host %s '
10457 'during live migration rollback. You may need to '
10458 'manually clean up resources in the network service. '
10459 'Error: %s', self.host, str(e))
10460 except Exception:
10461 with excutils.save_and_reraise_exception():
10462 # NOTE(tdurakov): even if teardown networks fails driver
10463 # should try to rollback live migration on destination.
10464 LOG.exception('An error occurred while deallocating network.',
10465 instance=instance)
10466 finally:
10467 # always run this even if setup_networks_on_host fails
10468 # NOTE(vish): The mapping is passed in so the driver can disconnect
10469 # from remote volumes if necessary
10470 block_device_info = self._get_instance_block_device_info(context,
10471 instance)
10472 # free any instance PCI claims done on destination during
10473 # check_can_live_migrate_destination()
10474 self.rt.free_pci_device_claims_for_instance(context, instance)
10476 # NOTE(luyao): Apply migration_context temporarily since it's
10477 # on destination host, we rely on instance object to cleanup
10478 # specific resources like vpmem
10479 with instance.mutated_migration_context():
10480 self.driver.rollback_live_migration_at_destination(
10481 context, instance, network_info, block_device_info,
10482 destroy_disks=destroy_disks, migrate_data=migrate_data)
10484 self._notify_about_instance_usage(
10485 context, instance, "live_migration.rollback.dest.end",
10486 network_info=network_info)
10487 compute_utils.notify_about_instance_action(
10488 context, instance, self.host,
10489 action=fields.NotificationAction.LIVE_MIGRATION_ROLLBACK_DEST,
10490 phase=fields.NotificationPhase.END)
10492 def _require_nw_info_update(self, context, instance):
10493 """Detect whether there is a mismatch in binding:host_id, or
10494 binding_failed or unbound binding:vif_type for any of the instance's
10495 ports.
10496 """
10497 # Only update port bindings if compute manager does manage port
10498 # bindings instead of the compute driver. For example IronicDriver
10499 # manages the port binding for baremetal instance ports, hence,
10500 # external intervention with the binding is not desired.
10501 if self.driver.manages_network_binding_host_id():
10502 return False
10504 search_opts = {'device_id': instance.uuid,
10505 'fields': ['binding:host_id', 'binding:vif_type']}
10506 ports = self.network_api.list_ports(context, **search_opts)
10507 for p in ports['ports']:
10508 if p.get('binding:host_id') != self.host:
10509 return True
10510 vif_type = p.get('binding:vif_type')
10511 if (vif_type == network_model.VIF_TYPE_UNBOUND or
10512 vif_type == network_model.VIF_TYPE_BINDING_FAILED):
10513 return True
10514 return False
10516 @periodic_task.periodic_task(
10517 spacing=CONF.heal_instance_info_cache_interval
10518 if CONF.heal_instance_info_cache_interval != 0
10519 else -1)
10520 def _heal_instance_info_cache(self, context):
10521 """Called periodically. On every call, try to update the
10522 info_cache's network information for another instance by
10523 calling out to the network API.
10525 This is implemented by keeping a cache of uuids of instances
10526 that live on this host. On each call, we pop one off of a
10527 list, pull the DB record, and try the call to the network API.
10528 If anything errors, don't fail, as it's possible the instance
10529 has been deleted, etc.
10530 """
10531 instance_uuids = getattr(self, '_instance_uuids_to_heal', [])
10532 instance = None
10534 LOG.debug('Starting heal instance info cache')
10536 if not instance_uuids:
10537 # The list of instances to heal is empty so rebuild it
10538 LOG.debug('Rebuilding the list of instances to heal')
10539 db_instances = objects.InstanceList.get_by_host(
10540 context, self.host, expected_attrs=[], use_slave=True)
10541 for inst in db_instances:
10542 # We don't want to refresh the cache for instances
10543 # which are building or deleting so don't put them
10544 # in the list. If they are building they will get
10545 # added to the list next time we build it.
10546 if (inst.vm_state == vm_states.BUILDING):
10547 LOG.debug('Skipping network cache update for instance '
10548 'because it is Building.', instance=inst)
10549 continue
10550 if (inst.task_state == task_states.DELETING):
10551 LOG.debug('Skipping network cache update for instance '
10552 'because it is being deleted.', instance=inst)
10553 continue
10555 if not instance:
10556 # Save the first one we find so we don't
10557 # have to get it again
10558 instance = inst
10559 else:
10560 instance_uuids.append(inst['uuid'])
10562 self._instance_uuids_to_heal = instance_uuids
10563 else:
10564 # Find the next valid instance on the list
10565 while instance_uuids: 10565 ↛ 10589 (line 10565 didn't jump to line 10589 because the condition on line 10565 was always true)
10566 try:
10567 inst = objects.Instance.get_by_uuid(
10568 context, instance_uuids.pop(0),
10569 expected_attrs=['system_metadata', 'info_cache',
10570 'flavor'],
10571 use_slave=True)
10572 except exception.InstanceNotFound:
10573 # Instance is gone. Try to grab another.
10574 continue
10576 # Check the instance hasn't been migrated
10577 if inst.host != self.host:
10578 LOG.debug('Skipping network cache update for instance '
10579 'because it has been migrated to another '
10580 'host.', instance=inst)
10581 # Check the instance isn't being deleted
10582 elif inst.task_state == task_states.DELETING:
10583 LOG.debug('Skipping network cache update for instance '
10584 'because it is being deleted.', instance=inst)
10585 else:
10586 instance = inst
10587 break
10589 if instance:
10590 # We have an instance now to refresh
10591 try:
10592 # Fix potential mismatch in port binding if evacuation failed
10593 # after reassigning the port binding to the dest host but
10594 # before the instance host is changed.
10595 # Do this only when instance has no pending task.
10596 if instance.task_state is None and \
10597 self._require_nw_info_update(context, instance):
10598 LOG.info("Updating ports in neutron", instance=instance)
10599 self.network_api.setup_instance_network_on_host(
10600 context, instance, self.host)
10601 # Call the network API to get instance info; this will
10602 # force an update to the instance's info_cache
10603 self.network_api.get_instance_nw_info(
10604 context, instance, force_refresh=True)
10605 LOG.debug('Updated the network info_cache for instance',
10606 instance=instance)
10607 except exception.InstanceNotFound:
10608 # Instance is gone.
10609 LOG.debug('Instance no longer exists. Unable to refresh',
10610 instance=instance)
10611 return
10612 except exception.InstanceInfoCacheNotFound:
10613 # InstanceInfoCache is gone.
10614 LOG.debug('InstanceInfoCache no longer exists. '
10615 'Unable to refresh', instance=instance)
10616 except Exception:
10617 LOG.error('An error occurred while refreshing the network '
10618 'cache.', instance=instance, exc_info=True)
10619 else:
10620 LOG.debug("Didn't find any instances for network info cache "
10621 "update.")
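# NOTE(editor): Illustrative sketch, not part of this module. The healing
# task above amortises its work: it keeps a list of instance UUIDs for
# this host, refreshes exactly one per periodic run, and rebuilds the list
# once it is exhausted, treating missing instances as non-fatal. A
# condensed stand-alone version of that round-robin pattern;
# list_instance_uuids() and refresh_info_cache() are hypothetical
# stand-ins for the DB and network API calls.
class InfoCacheHealer:
    def __init__(self, list_instance_uuids, refresh_info_cache):
        self._list_instance_uuids = list_instance_uuids
        self._refresh_info_cache = refresh_info_cache
        self._to_heal = []

    def run_once(self):
        """Refresh at most one instance per call."""
        if not self._to_heal:
            # the list is exhausted, rebuild it from the source of truth
            self._to_heal = list(self._list_instance_uuids())
        while self._to_heal:
            uuid = self._to_heal.pop(0)
            try:
                self._refresh_info_cache(uuid)
                return uuid
            except LookupError:
                # instance vanished in the meantime, try the next one
                continue
        return None

# usage sketch:
#   healer = InfoCacheHealer(db_list_uuids, refresh_one)
#   healer.run_once()   # called from the periodic task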
10623 @periodic_task.periodic_task
10624 def _poll_rebooting_instances(self, context):
10625 if CONF.reboot_timeout > 0:
10626 filters = {'task_state':
10627 [task_states.REBOOTING,
10628 task_states.REBOOT_STARTED,
10629 task_states.REBOOT_PENDING],
10630 'host': self.host}
10631 rebooting = objects.InstanceList.get_by_filters(
10632 context, filters, expected_attrs=[], use_slave=True)
10634 to_poll = []
10635 for instance in rebooting:
10636 if timeutils.is_older_than(instance.updated_at, 10636 ↛ 10635 (line 10636 didn't jump to line 10635 because the condition on line 10636 was always true)
10637 CONF.reboot_timeout):
10638 to_poll.append(instance)
10640 self.driver.poll_rebooting_instances(CONF.reboot_timeout, to_poll)
10642 @periodic_task.periodic_task
10643 def _poll_rescued_instances(self, context):
10644 if CONF.rescue_timeout > 0:
10645 filters = {'vm_state': vm_states.RESCUED,
10646 'host': self.host}
10647 rescued_instances = objects.InstanceList.get_by_filters(
10648 context, filters, expected_attrs=["system_metadata"],
10649 use_slave=True)
10651 to_unrescue = []
10652 for instance in rescued_instances:
10653 if timeutils.is_older_than(instance.launched_at,
10654 CONF.rescue_timeout):
10655 to_unrescue.append(instance)
10657 for instance in to_unrescue:
10658 self.compute_api.unrescue(context, instance)
10660 @periodic_task.periodic_task
10661 def _poll_unconfirmed_resizes(self, context):
10662 if CONF.resize_confirm_window == 0:
10663 return
10665 migrations = objects.MigrationList.get_unconfirmed_by_dest_compute(
10666 context, CONF.resize_confirm_window, self.host,
10667 use_slave=True)
10669 migrations_info = dict(migration_count=len(migrations),
10670 confirm_window=CONF.resize_confirm_window)
10672 if migrations_info["migration_count"] > 0: 10672 ↛ 10677 (line 10672 didn't jump to line 10677 because the condition on line 10672 was always true)
10673 LOG.info("Found %(migration_count)d unconfirmed migrations "
10674 "older than %(confirm_window)d seconds",
10675 migrations_info)
10677 def _set_migration_to_error(migration, reason, **kwargs):
10678 LOG.warning("Setting migration %(migration_id)s to error: "
10679 "%(reason)s",
10680 {'migration_id': migration.id, 'reason': reason},
10681 **kwargs)
10682 migration.status = 'error'
10683 migration.save()
10685 for migration in migrations:
10686 instance_uuid = migration.instance_uuid
10687 LOG.info("Automatically confirming migration "
10688 "%(migration_id)s for instance %(instance_uuid)s",
10689 {'migration_id': migration.id,
10690 'instance_uuid': instance_uuid})
10691 expected_attrs = ['metadata', 'system_metadata']
10692 try:
10693 instance = objects.Instance.get_by_uuid(context,
10694 instance_uuid, expected_attrs=expected_attrs,
10695 use_slave=True)
10696 except exception.InstanceNotFound:
10697 reason = (_("Instance %s not found") %
10698 instance_uuid)
10699 _set_migration_to_error(migration, reason)
10700 continue
10701 if instance.vm_state == vm_states.ERROR:
10702 reason = _("In ERROR state")
10703 _set_migration_to_error(migration, reason,
10704 instance=instance)
10705 continue
10706 # race condition: an instance in DELETING state should not have its
10707 # migration set to error, otherwise an instance that is being
10708 # deleted while still in RESIZED state
10709 # will not be able to confirm the resize
10710 if instance.task_state in [task_states.DELETING,
10711 task_states.SOFT_DELETING]:
10712 msg = ("Instance being deleted or soft deleted during resize "
10713 "confirmation. Skipping.")
10714 LOG.debug(msg, instance=instance)
10715 continue
10717 # race condition: This condition is hit when this method is
10718 # called between the save of the migration record with a status of
10719 # finished and the save of the instance object with a state of
10720 # RESIZED. The migration record should not be set to error.
10721 if instance.task_state == task_states.RESIZE_FINISH:
10722 msg = ("Instance still resizing during resize "
10723 "confirmation. Skipping.")
10724 LOG.debug(msg, instance=instance)
10725 continue
10727 vm_state = instance.vm_state
10728 task_state = instance.task_state
10729 if vm_state != vm_states.RESIZED or task_state is not None:
10730 reason = (_("In states %(vm_state)s/%(task_state)s, not "
10731 "RESIZED/None") %
10732 {'vm_state': vm_state,
10733 'task_state': task_state})
10734 _set_migration_to_error(migration, reason,
10735 instance=instance)
10736 continue
10737 try:
10738 self.compute_api.confirm_resize(context, instance,
10739 migration=migration)
10740 except Exception as e:
10741 LOG.info("Error auto-confirming resize: %s. "
10742 "Will retry later.", e, instance=instance)
10744 @periodic_task.periodic_task(spacing=CONF.shelved_poll_interval)
10745 def _poll_shelved_instances(self, context):
10747 if CONF.shelved_offload_time <= 0:
10748 return
10750 filters = {'vm_state': vm_states.SHELVED,
10751 'task_state': None,
10752 'host': self.host}
10753 shelved_instances = objects.InstanceList.get_by_filters(
10754 context, filters=filters, expected_attrs=['system_metadata'],
10755 use_slave=True)
10757 to_gc = []
10758 for instance in shelved_instances:
10759 sys_meta = instance.system_metadata
10760 shelved_at = timeutils.parse_strtime(sys_meta['shelved_at'])
10761 if timeutils.is_older_than(shelved_at, CONF.shelved_offload_time):
10762 to_gc.append(instance)
10764 cyclient = cyborg.get_client(context)
10765 for instance in to_gc:
10766 try:
10767 instance.task_state = task_states.SHELVING_OFFLOADING
10768 instance.save(expected_task_state=(None,))
10769 accel_uuids = []
10770 if instance.flavor.extra_specs.get('accel:device_profile'):
10771 # TODO(brinzhang): After cyborg support batch query ARQs
10772 # for more than one instances, we will improve efficiency
10773 # with this implementation.
10774 accel_uuids = cyclient.get_arq_uuids_for_instance(instance)
10775 self.shelve_offload_instance(
10776 context, instance, clean_shutdown=False,
10777 accel_uuids=accel_uuids)
10778 except Exception:
10779 LOG.exception('Periodic task failed to offload instance.',
10780 instance=instance)
10782 @periodic_task.periodic_task
10783 def _instance_usage_audit(self, context):
10784 if not CONF.instance_usage_audit:
10785 return
10787 begin, end = utils.last_completed_audit_period()
10788 if objects.TaskLog.get(context, 'instance_usage_audit', begin, end, 10788 ↛ 10790 (line 10788 didn't jump to line 10790 because the condition on line 10788 was never true)
10789 self.host):
10790 return
10792 instances = objects.InstanceList.get_active_by_window_joined(
10793 context, begin, end, host=self.host,
10794 expected_attrs=['system_metadata', 'info_cache', 'metadata',
10795 'flavor'],
10796 use_slave=True)
10797 num_instances = len(instances)
10798 errors = 0
10799 successes = 0
10800 LOG.info("Running instance usage audit for host %(host)s "
10801 "from %(begin_time)s to %(end_time)s. "
10802 "%(number_instances)s instances.",
10803 {'host': self.host,
10804 'begin_time': begin,
10805 'end_time': end,
10806 'number_instances': num_instances})
10807 start_time = time.time()
10808 task_log = objects.TaskLog(context)
10809 task_log.task_name = 'instance_usage_audit'
10810 task_log.period_beginning = begin
10811 task_log.period_ending = end
10812 task_log.host = self.host
10813 task_log.task_items = num_instances
10814 task_log.message = 'Instance usage audit started...'
10815 task_log.begin_task()
10816 for instance in instances:
10817 try:
10818 compute_utils.notify_usage_exists(
10819 self.notifier, context, instance, self.host,
10820 ignore_missing_network_data=False)
10821 successes += 1
10822 except Exception:
10823 LOG.exception('Failed to generate usage '
10824 'audit for instance '
10825 'on host %s', self.host,
10826 instance=instance)
10827 errors += 1
10828 task_log.errors = errors
10829 task_log.message = (
10830 'Instance usage audit ran for host %s, %s instances in %s seconds.'
10831 % (self.host, num_instances, time.time() - start_time))
10832 task_log.end_task()
10834 def _get_host_volume_bdms(self, context, use_slave=False):
10835 """Return all block device mappings on a compute host."""
10836 compute_host_bdms = []
10837 instances = objects.InstanceList.get_by_host(context, self.host,
10838 use_slave=use_slave)
10839 for instance in instances:
10840 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
10841 context, instance.uuid, use_slave=use_slave)
10842 instance_bdms = [bdm for bdm in bdms if bdm.is_volume]
10843 compute_host_bdms.append(dict(instance=instance,
10844 instance_bdms=instance_bdms))
10846 return compute_host_bdms
10848 def _update_volume_usage_cache(self, context, vol_usages):
10849 """Updates the volume usage cache table with a list of stats."""
10850 for usage in vol_usages:
10851 # Allow switching of greenthreads between queries.
10852 greenthread.sleep(0)
10853 vol_usage = objects.VolumeUsage(context)
10854 vol_usage.volume_id = usage['volume']
10855 vol_usage.instance_uuid = usage['instance'].uuid
10856 vol_usage.project_id = usage['instance'].project_id
10857 vol_usage.user_id = usage['instance'].user_id
10858 vol_usage.availability_zone = usage['instance'].availability_zone
10859 vol_usage.curr_reads = usage['rd_req']
10860 vol_usage.curr_read_bytes = usage['rd_bytes']
10861 vol_usage.curr_writes = usage['wr_req']
10862 vol_usage.curr_write_bytes = usage['wr_bytes']
10863 vol_usage.save()
10864 self.notifier.info(context, 'volume.usage', vol_usage.to_dict())
10865 compute_utils.notify_about_volume_usage(context, vol_usage,
10866 self.host)
10868 @periodic_task.periodic_task(spacing=CONF.volume_usage_poll_interval)
10869 def _poll_volume_usage(self, context):
10870 if CONF.volume_usage_poll_interval == 0:
10871 return
10873 compute_host_bdms = self._get_host_volume_bdms(context,
10874 use_slave=True)
10875 if not compute_host_bdms:
10876 return
10878 LOG.debug("Updating volume usage cache")
10879 try:
10880 vol_usages = self.driver.get_all_volume_usage(context,
10881 compute_host_bdms)
10882 except NotImplementedError:
10883 return
10885 self._update_volume_usage_cache(context, vol_usages)
10887 @periodic_task.periodic_task(spacing=CONF.sync_power_state_interval,
10888 run_immediately=True)
10889 def _sync_power_states(self, context):
10890 """Align power states between the database and the hypervisor.
10892 To sync power state data we make a driver call to get the number of
10893 virtual machines known by the hypervisor and compare it with the
10894 number of virtual machines known by the database; we then proceed in a
10895 lazy loop, one database record at a time, checking if the hypervisor
10896 has the same power state as is in the database.
10897 """
10898 db_instances = objects.InstanceList.get_by_host(context, self.host,
10899 expected_attrs=[],
10900 use_slave=True)
10902 try:
10903 num_vm_instances = self.driver.get_num_instances()
10904 except exception.VirtDriverNotReady as e:
10905 # If the virt driver is not ready, like ironic-api not being up
10906 # yet in the case of ironic, just log it and exit.
10907 LOG.info('Skipping _sync_power_states periodic task due to: %s', e)
10908 return
10910 num_db_instances = len(db_instances)
10912 if num_vm_instances != num_db_instances:
10913 LOG.warning("While synchronizing instance power states, found "
10914 "%(num_db_instances)s instances in the database "
10915 "and %(num_vm_instances)s instances on the "
10916 "hypervisor.",
10917 {'num_db_instances': num_db_instances,
10918 'num_vm_instances': num_vm_instances})
10920 def _sync(db_instance):
10921 # NOTE(melwitt): This must be synchronized as we query state from
10922 # two separate sources, the driver and the database.
10923 # They are set (in stop_instance) and read, in sync.
10924 @utils.synchronized(db_instance.uuid)
10925 def query_driver_power_state_and_sync():
10926 self._query_driver_power_state_and_sync(context, db_instance)
10928 try:
10929 query_driver_power_state_and_sync()
10930 except Exception:
10931 LOG.exception("Periodic sync_power_state task had an "
10932 "error while processing an instance.",
10933 instance=db_instance)
10935 self._syncs_in_progress.pop(db_instance.uuid)
10937 for db_instance in db_instances:
10938 # process syncs asynchronously - don't want instance locking to
10939 # block entire periodic task thread
10940 uuid = db_instance.uuid
10941 if uuid in self._syncs_in_progress: 10941 ↛ 10942 (line 10941 didn't jump to line 10942 because the condition on line 10941 was never true)
10942 LOG.debug('Sync already in progress for %s', uuid)
10943 else:
10944 LOG.debug('Triggering sync for uuid %s', uuid)
10945 self._syncs_in_progress[uuid] = True
10946 nova.utils.pass_context(self._sync_power_pool.spawn_n,
10947 _sync,
10948 db_instance)
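# NOTE(editor): Illustrative sketch, not part of this module.
# _sync_power_states fans the per-instance work out to a pool and guards
# each instance twice: a "sync in progress" marker prevents the same
# instance from being queued twice, and a per-UUID lock
# (utils.synchronized in nova) keeps the sync from racing with concurrent
# state changes such as stop_instance. A condensed stand-alone version of
# that pattern using only the stdlib; sync_one() is a hypothetical
# stand-in for _query_driver_power_state_and_sync().
import threading
from collections import defaultdict
from concurrent.futures import ThreadPoolExecutor

locks = defaultdict(threading.Lock)      # one lock per instance UUID
syncs_in_progress = {}
pool = ThreadPoolExecutor(max_workers=4)

def sync_one(uuid):
    pass  # stand-in for querying the driver and reconciling the DB record

def _sync(uuid):
    try:
        with locks[uuid]:                # serialise with other state changes
            sync_one(uuid)
    finally:
        syncs_in_progress.pop(uuid, None)

def trigger_sync(uuid):
    if uuid in syncs_in_progress:        # already queued or running, skip
        return
    syncs_in_progress[uuid] = True
    pool.submit(_sync, uuid)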
10950 def _query_driver_power_state_and_sync(self, context, db_instance):
10951 if db_instance.task_state is not None:
10952 LOG.info("During sync_power_state the instance has a "
10953 "pending task (%(task)s). Skip.",
10954 {'task': db_instance.task_state}, instance=db_instance)
10955 return
10956 # No pending tasks. Now try to figure out the real vm_power_state.
10957 try:
10958 vm_instance = self.driver.get_info(db_instance)
10959 vm_power_state = vm_instance.state
10960 except exception.InstanceNotFound:
10961 vm_power_state = power_state.NOSTATE
10962 # Note(maoy): the above get_info call might take a long time,
10963 # for example, because of a broken libvirt driver.
10964 try:
10965 self._sync_instance_power_state(context,
10966 db_instance,
10967 vm_power_state,
10968 use_slave=True)
10969 except exception.InstanceNotFound:
10970 # NOTE(hanlind): If the instance gets deleted during sync,
10971 # silently ignore.
10972 pass
10974 def _stop_unexpected_shutdown_instance(self, context, vm_state,
10975 db_instance, orig_db_power_state):
10976 # this is an exceptional case; make sure our data is up
10977 # to date before slamming through a power off
10978 vm_instance = self.driver.get_info(db_instance,
10979 use_cache=False)
10980 vm_power_state = vm_instance.state
10982 # if it still looks off, go ahead and call stop()
10983 if vm_power_state in (power_state.SHUTDOWN, 10983 ↛ exit (line 10983 didn't return from function '_stop_unexpected_shutdown_instance' because the condition on line 10983 was always true)
10984 power_state.CRASHED):
10986 LOG.warning("Instance shutdown by itself. Calling the "
10987 "stop API. Current vm_state: %(vm_state)s, "
10988 "current task_state: %(task_state)s, "
10989 "original DB power_state: %(db_power_state)s, "
10990 "current VM power_state: %(vm_power_state)s",
10991 {'vm_state': vm_state,
10992 'task_state': db_instance.task_state,
10993 'db_power_state': orig_db_power_state,
10994 'vm_power_state': vm_power_state},
10995 instance=db_instance)
10996 try:
10997 # Note(maoy): here we call the API instead of
10998 # brutally updating the vm_state in the database
10999 # to allow all the hooks and checks to be performed.
11000 if db_instance.shutdown_terminate:
11001 self.compute_api.delete(context, db_instance)
11002 else:
11003 self.compute_api.stop(context, db_instance)
11004 except Exception:
11005 # Note(maoy): there is no need to propagate the error
11006 # because the same power_state will be retrieved next
11007 # time and retried.
11008 # For example, there might be another task scheduled.
11009 LOG.exception("error during stop() in sync_power_state.",
11010 instance=db_instance)
11012 def _sync_instance_power_state(self, context, db_instance, vm_power_state,
11013 use_slave=False):
11014 """Align instance power state between the database and hypervisor.
11016 If the instance is not found on the hypervisor, but is in the database,
11017 then a stop() API will be called on the instance.
11018 """
11020 # We re-query the DB to get the latest instance info to minimize
11021 # (not eliminate) race condition.
11022 db_instance.refresh(use_slave=use_slave)
11023 db_power_state = db_instance.power_state
11024 vm_state = db_instance.vm_state
11026 if self.host != db_instance.host: 11026 ↛ 11034 (line 11026 didn't jump to line 11034 because the condition on line 11026 was never true)
11027 # on the sending end of nova-compute _sync_power_state
11028 # may have yielded to the greenthread performing a live
11029 # migration; this in turn has changed the resident-host
11030 # for the VM; However, the instance is still active, it
11031 # is just in the process of migrating to another host.
11032 # This implies that the compute source must relinquish
11033 # control to the compute destination.
11034 LOG.info("During the sync_power process the "
11035 "instance has moved from "
11036 "host %(src)s to host %(dst)s",
11037 {'src': db_instance.host,
11038 'dst': self.host},
11039 instance=db_instance)
11040 return
11041 elif db_instance.task_state is not None: 11041 ↛ 11047 (line 11041 didn't jump to line 11047 because the condition on line 11041 was never true)
11042 # on the receiving end of nova-compute, it could happen
11043 # that the DB instance already report the new resident
11044 # but the actual VM has not showed up on the hypervisor
11045 # yet. In this case, let's allow the loop to continue
11046 # and run the state sync in a later round
11047 LOG.info("During sync_power_state the instance has a "
11048 "pending task (%(task)s). Skip.",
11049 {'task': db_instance.task_state},
11050 instance=db_instance)
11051 return
11053 orig_db_power_state = db_power_state
11054 if vm_power_state != db_power_state:
11055 LOG.info('During _sync_instance_power_state the DB '
11056 'power_state (%(db_power_state)s) does not match '
11057 'the vm_power_state from the hypervisor '
11058 '(%(vm_power_state)s). Updating power_state in the '
11059 'DB to match the hypervisor.',
11060 {'db_power_state': db_power_state,
11061 'vm_power_state': vm_power_state},
11062 instance=db_instance)
11063 # power_state is always updated from hypervisor to db
11064 db_instance.power_state = vm_power_state
11065 db_instance.save()
11066 db_power_state = vm_power_state
11068 # Note(maoy): Now resolve the discrepancy between vm_state and
11069 # vm_power_state. We go through all possible vm_states.
11070 if vm_state in (vm_states.BUILDING, 11070 ↛ 11076 (line 11070 didn't jump to line 11076 because the condition on line 11070 was never true)
11071 vm_states.RESCUED,
11072 vm_states.RESIZED,
11073 vm_states.SUSPENDED,
11074 vm_states.ERROR):
11075 # TODO(maoy): we ignore these vm_state for now.
11076 pass
11077 elif vm_state == vm_states.ACTIVE:
11078 # The only rational power state should be RUNNING
11079 if vm_power_state in (power_state.SHUTDOWN,
11080 power_state.CRASHED):
11081 self._stop_unexpected_shutdown_instance(
11082 context, vm_state, db_instance, orig_db_power_state)
11083 elif vm_power_state == power_state.SUSPENDED:
11084 LOG.warning("Instance is suspended unexpectedly. Calling "
11085 "the stop API.", instance=db_instance)
11086 try:
11087 self.compute_api.stop(context, db_instance)
11088 except Exception:
11089 LOG.exception("error during stop() in sync_power_state.",
11090 instance=db_instance)
11091 elif vm_power_state == power_state.PAUSED:
11092 # Note(maoy): a VM may get into the paused state not only
11093 # because of user requests via API calls, but also
11094 # due to (temporary) external instrumentation.
11095 # Before the virt layer can reliably report the reason,
11096 # we simply ignore the state discrepancy. In many cases,
11097 # the VM state will go back to running after the external
11098 # instrumentation is done. See bug 1097806 for details.
11099 LOG.warning("Instance is paused unexpectedly. Ignore.",
11100 instance=db_instance)
11101 elif vm_power_state == power_state.NOSTATE:
11102 # Occasionally, depending on the status of the hypervisor,
11103 # which could be restarting for example, an instance may
11104 # not be found. Therefore just log the condition.
11105 LOG.warning("Instance is unexpectedly not found. Ignore.",
11106 instance=db_instance)
11107 elif vm_state == vm_states.STOPPED:
11108 if vm_power_state not in (power_state.NOSTATE,
11109 power_state.SHUTDOWN,
11110 power_state.CRASHED):
11111 LOG.warning("Instance is not stopped. Calling "
11112 "the stop API. Current vm_state: %(vm_state)s,"
11113 " current task_state: %(task_state)s, "
11114 "original DB power_state: %(db_power_state)s, "
11115 "current VM power_state: %(vm_power_state)s",
11116 {'vm_state': vm_state,
11117 'task_state': db_instance.task_state,
11118 'db_power_state': orig_db_power_state,
11119 'vm_power_state': vm_power_state},
11120 instance=db_instance)
11121 try:
11122 # NOTE(russellb) Force the stop, because normally the
11123 # compute API would not allow an attempt to stop a stopped
11124 # instance.
11125 self.compute_api.force_stop(context, db_instance)
11126 except Exception:
11127 LOG.exception("error during stop() in sync_power_state.",
11128 instance=db_instance)
11129 elif vm_state == vm_states.PAUSED:
11130 if vm_power_state in (power_state.SHUTDOWN,
11131 power_state.CRASHED):
11132 LOG.warning("Paused instance shutdown by itself. Calling "
11133 "the stop API.", instance=db_instance)
11134 try:
11135 self.compute_api.force_stop(context, db_instance)
11136 except Exception:
11137 LOG.exception("error during stop() in sync_power_state.",
11138 instance=db_instance)
11139 elif vm_state in (vm_states.SOFT_DELETED,
11140 vm_states.DELETED):
11141 if vm_power_state not in (power_state.NOSTATE,
11142 power_state.SHUTDOWN):
11143 # Note(maoy): this should be taken care of periodically in
11144 # _cleanup_running_deleted_instances().
11145 LOG.warning("Instance is not (soft-)deleted.",
11146 instance=db_instance)
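# --- Editor's sketch (illustrative only; not part of manager.py) ---
# The branching above reconciles the DB vm_state with the power state the
# hypervisor reports. A minimal standalone version of the same decision
# table, using plain strings in place of nova's vm_states/power_state
# constants (an assumption made for brevity):
def decide_power_sync_action(vm_state, vm_power_state):
    """Return 'stop', 'force_stop', 'warn' or 'ignore' for a state pair."""
    if vm_state == 'active':
        if vm_power_state in ('shutdown', 'crashed', 'suspended'):
            return 'stop'        # guest died or was suspended unexpectedly
        if vm_power_state in ('paused', 'nostate'):
            return 'warn'        # likely transient; just log it
    elif vm_state == 'stopped':
        if vm_power_state not in ('nostate', 'shutdown', 'crashed'):
            return 'force_stop'  # guest is running but should be stopped
    elif vm_state == 'paused':
        if vm_power_state in ('shutdown', 'crashed'):
            return 'force_stop'
    elif vm_state in ('soft-delete', 'deleted'):
        if vm_power_state not in ('nostate', 'shutdown'):
            return 'warn'        # the periodic cleanup task will reap it
    return 'ignore'

assert decide_power_sync_action('active', 'shutdown') == 'stop'
assert decide_power_sync_action('stopped', 'running') == 'force_stop'
# --- end sketch ---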
11148 @periodic_task.periodic_task
11149 def _reclaim_queued_deletes(self, context):
11150 """Reclaim instances that are queued for deletion."""
11151 interval = CONF.reclaim_instance_interval
11152 if interval <= 0:
11153 LOG.debug("CONF.reclaim_instance_interval <= 0, skipping...")
11154 return
11156 filters = {'vm_state': vm_states.SOFT_DELETED,
11157 'task_state': None,
11158 'host': self.host}
11159 instances = objects.InstanceList.get_by_filters(
11160 context, filters,
11161 expected_attrs=objects.instance.INSTANCE_DEFAULT_FIELDS,
11162 use_slave=True)
11163 for instance in instances:
11164 if self._deleted_old_enough(instance, interval):
11165 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
11166 context, instance.uuid)
11167 LOG.info('Reclaiming deleted instance', instance=instance)
11168 try:
11169 self._delete_instance(context, instance, bdms)
11170 except Exception as e:
11171 LOG.warning("Periodic reclaim failed to delete "
11172 "instance: %s",
11173 e, instance=instance)
11175 def _get_nodename(self, instance, refresh=False):
11176 """Helper method to get the name of the first available node
11177 on this host. This method should not be used with any operations
11178 on ironic instances since it does not handle multiple nodes.
11179 """
11180 node = self.driver.get_available_nodes(refresh=refresh)[0]
11181 LOG.debug("No node specified, defaulting to %s", node,
11182 instance=instance)
11183 return node
11185 def _update_available_resource_for_node(self, context, nodename,
11186 startup=False):
11188 try:
11189 self.rt.update_available_resource(context, nodename,
11190 startup=startup)
11191 except exception.ComputeHostNotFound:
11192 LOG.warning("Compute node '%s' not found in "
11193 "update_available_resource.", nodename)
11194 except exception.ReshapeFailed:
11195 # We're only supposed to get here on startup, if a reshape was
11196 # needed, was attempted, and failed. We want to kill the service.
11197 with excutils.save_and_reraise_exception():
11198 LOG.critical("Resource provider data migration failed "
11199 "fatally during startup for node %s.", nodename)
11200 except exception.ReshapeNeeded:
11201 # This exception should only find its way here if the virt driver's
11202 # update_provider_tree raised it incorrectly: either
11203 # a) After the resource tracker already caught it once and
11204 # reinvoked update_provider_tree with allocations. At this point
11205 # the driver is just supposed to *do* the reshape, so if it raises
11206 # ReshapeNeeded, it's a bug, and we want to kill the compute
11207 # service.
11208 # b) On periodic rather than startup (we only allow reshapes to
11209 # happen on startup). In this case we'll just make the logs red and
11210 # go again at the next periodic interval, where the same thing may
11211 # or may not happen again. Depending on the previous and intended
11212 # shape of the providers/inventories, this may not actually cause
11213 # any immediately visible symptoms (in terms of scheduling, etc.)
11214 # If this becomes a problem, we may wish to make it pop immediately
11215 # (e.g. disable the service).
11216 with excutils.save_and_reraise_exception():
11217 LOG.exception("ReshapeNeeded exception is unexpected here!")
11218 except exception.PlacementPciException:
11219 # If we are at startup and the Placement PCI inventory handling
11220 # failed then probably there is a configuration error. Propagate
11221 # the error up to kill the service.
11222 if startup:
11223 raise
11224 # If we are not at startup then we can assume that the
11225 # configuration was correct at startup so the error is probably
11226 # transient. Anyhow we cannot kill the service any more so just
11227 # log the error and continue.
11228 LOG.exception(
11229 "Error updating PCI resources for node %(node)s.",
11230 {'node': nodename})
11231 except exception.InvalidConfiguration as e:
11232 if startup:
11233 # If this happens during startup, we need to let it raise to
11234 # abort our service startup.
11235 raise
11236 else:
11237 LOG.error("Error updating resources for node %s: %s",
11238 nodename, e)
11239 except Exception:
11240 LOG.exception("Error updating resources for node %(node)s.",
11241 {'node': nodename})
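# --- Editor's sketch (illustrative only; not part of manager.py) ---
# The except chain above follows a single rule: errors that point at a bad
# configuration or a failed data migration are fatal at startup but are only
# logged on the periodic path. A generic version of that pattern; the
# exception class and function names here are hypothetical:
import logging

sketch_log = logging.getLogger(__name__)

class FatalConfigError(Exception):
    """Stands in for configuration-type failures that should abort startup."""

def update_node(update_fn, nodename, startup=False):
    try:
        update_fn(nodename)
    except FatalConfigError:
        if startup:
            # abort service startup so the operator notices immediately
            raise
        # after startup, assume the config was once valid; log and carry on
        sketch_log.exception("Error updating resources for node %s", nodename)
    except Exception:
        # anything else is logged and retried on the next periodic run
        sketch_log.exception("Error updating resources for node %s", nodename)
# --- end sketch ---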
11243 @periodic_task.periodic_task(spacing=CONF.update_resources_interval)
11244 def update_available_resource(self, context, startup=False):
11245 """See driver.get_available_resource()
11247 Periodic process that keeps the compute host's understanding of
11248 resource availability and usage in sync with the underlying hypervisor.
11250 :param context: security context
11251 :param startup: True if this is being called when the nova-compute
11252 service is starting, False otherwise.
11253 """
11254 try:
11255 nodenames = set(self.driver.get_available_nodes())
11256 except exception.VirtDriverNotReady:
11257 LOG.warning("Virt driver is not ready.")
11258 return
11260 compute_nodes_in_db = self._get_compute_nodes_in_db(context,
11261 nodenames,
11262 use_slave=True,
11263 startup=startup)
11265 self.rt.clean_compute_node_cache(compute_nodes_in_db)
11267 # Delete orphan compute nodes not reported by the driver but still in the db
11268 for cn in compute_nodes_in_db:
11269 if cn.hypervisor_hostname not in nodenames:
11270 # if the node could be migrated, we don't delete
11271 # the compute node database records
11272 if not self.driver.is_node_deleted(cn.hypervisor_hostname):
11273 LOG.warning(
11274 "Found orphan compute node %(id)s "
11275 "hypervisor host is %(hh)s, "
11276 "nodes are %(nodes)s. "
11277 "We are not deleting this as the driver "
11278 "says this node has not been deleted.",
11279 {'id': cn.id, 'hh': cn.hypervisor_hostname,
11280 'nodes': nodenames})
11281 continue
11283 LOG.info("Deleting orphan compute node %(id)s "
11284 "hypervisor host is %(hh)s, "
11285 "nodes are %(nodes)s",
11286 {'id': cn.id, 'hh': cn.hypervisor_hostname,
11287 'nodes': nodenames})
11288 try:
11289 cn.destroy()
11290 except exception.ObjectActionError:
11291 # NOTE(mgoddard): it's possible that another compute
11292 # service took ownership of this compute node since we
11293 # queried it due to a rebalance, and this will cause the
11294 # deletion to fail. Ignore the error in that case.
11295 LOG.info("Ignoring failure to delete orphan compute node "
11296 "%(id)s on hypervisor host %(hh)s due to "
11297 "possible node rebalance",
11298 {'id': cn.id, 'hh': cn.hypervisor_hostname})
11299 self.rt.remove_node(cn.hypervisor_hostname)
11300 self.reportclient.invalidate_resource_provider(cn.uuid)
11301 else:
11302 self.rt.remove_node(cn.hypervisor_hostname)
11303 # Delete the corresponding resource provider in placement,
11304 # along with any associated allocations.
11305 try:
11306 self.reportclient.delete_resource_provider(
11307 context, cn, cascade=True)
11308 except keystone_exception.ClientException as e:
11309 LOG.error(
11310 "Failed to delete compute node resource provider "
11311 "for compute node %s: %s", cn.uuid, str(e))
11313 for nodename in nodenames:
11314 self._update_available_resource_for_node(context, nodename,
11315 startup=startup)
11317 def _get_compute_nodes_in_db(self, context, nodenames, use_slave=False,
11318 startup=False):
11319 try:
11320 return objects.ComputeNodeList.get_all_by_host(context, self.host,
11321 use_slave=use_slave)
11322 except exception.NotFound:
11323 # If the driver is not reporting any nodenames we should not
11324 # expect there to be compute nodes so we just return in that case.
11325 # For example, this could be an ironic compute and it is not
11326 # managing any nodes yet.
11327 if nodenames:
11328 if startup:
11329 LOG.warning(
11330 "No compute node record found for host %s. If this is "
11331 "the first time this service is starting on this "
11332 "host, then you can ignore this warning.", self.host)
11333 else:
11334 LOG.error("No compute node record for host %s", self.host)
11335 return []
11337 @periodic_task.periodic_task(
11338 spacing=CONF.running_deleted_instance_poll_interval,
11339 run_immediately=True)
11340 def _cleanup_running_deleted_instances(self, context):
11341 """Cleanup any instances which are erroneously still running after
11342 having been deleted.
11344 Valid actions to take are:
11346 1. noop - do nothing
11347 2. log - log which instances are erroneously running
11348 3. reap - shutdown and cleanup any erroneously running instances
11349 4. shutdown - power off *and disable* any erroneously running
11350 instances
11352 The use-case for this cleanup task is: for various reasons, it may be
11353 possible for the database to show an instance as deleted but for that
11354 instance to still be running on a host machine (see bug
11355 https://bugs.launchpad.net/nova/+bug/911366).
11357 This cleanup task is a cross-hypervisor utility for finding these
11358 zombied instances and either logging the discrepancy (likely what you
11359 should do in production), or automatically reaping the instances (more
11360 appropriate for dev environments).
11361 """
11362 action = CONF.running_deleted_instance_action
11364 if action == "noop":
11365 return
11367 # NOTE(sirp): admin contexts don't ordinarily return deleted records
11368 with utils.temporary_mutation(context, read_deleted="yes"):
11370 try:
11371 instances = self._running_deleted_instances(context)
11372 except exception.VirtDriverNotReady:
11373 # Since this task runs immediately on startup, if the
11374 # hypervisor is not yet ready handle it gracefully.
11375 LOG.debug('Unable to check for running deleted instances '
11376 'at this time since the hypervisor is not ready.')
11377 return
11379 for instance in instances:
11380 if action == "log":
11381 LOG.warning("Detected instance with name label "
11382 "'%s' which is marked as "
11383 "DELETED but still present on host.",
11384 instance.name, instance=instance)
11386 elif action == 'shutdown':
11387 LOG.info("Powering off instance with name label "
11388 "'%s' which is marked as "
11389 "DELETED but still present on host.",
11390 instance.name, instance=instance)
11391 try:
11392 self.driver.power_off(context, instance)
11393 except Exception:
11394 LOG.warning("Failed to power off instance",
11395 instance=instance, exc_info=True)
11397 elif action == 'reap':
11398 LOG.info("Destroying instance with name label "
11399 "'%s' which is marked as "
11400 "DELETED but still present on host.",
11401 instance.name, instance=instance)
11402 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
11403 context, instance.uuid, use_slave=True)
11404 self.instance_events.clear_events_for_instance(instance)
11405 try:
11406 self._shutdown_instance(context, instance, bdms,
11407 notify=False)
11408 self._cleanup_volumes(context, instance, bdms,
11409 detach=False)
11410 except Exception as e:
11411 LOG.warning("Periodic cleanup failed to delete "
11412 "instance: %s",
11413 e, instance=instance)
11414 else:
11415 raise Exception(_("Unrecognized value '%s'"
11416 " for CONF.running_deleted_"
11417 "instance_action") % action)
11419 def _running_deleted_instances(self, context):
11420 """Returns a list of instances nova thinks is deleted,
11421 but the hypervisor thinks is still running.
11422 """
11423 timeout = CONF.running_deleted_instance_timeout
11424 filters = {'deleted': True,
11425 'soft_deleted': False}
11426 instances = self._get_instances_on_driver(context, filters)
11427 return [i for i in instances if self._deleted_old_enough(i, timeout)]
11429 def _deleted_old_enough(self, instance, timeout):
11430 deleted_at = instance.deleted_at
11431 if deleted_at:
11432 deleted_at = deleted_at.replace(tzinfo=None)
11433 return (not deleted_at or timeutils.is_older_than(deleted_at, timeout))
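# --- Editor's sketch (illustrative only; not part of manager.py) ---
# _deleted_old_enough treats a missing deleted_at as "old enough" and
# otherwise asks timeutils.is_older_than whether the timestamp is more than
# `timeout` seconds in the past. A plain-datetime equivalent:
import datetime

def deleted_old_enough(deleted_at, timeout):
    if not deleted_at:
        return True
    age = datetime.datetime.utcnow() - deleted_at.replace(tzinfo=None)
    return age > datetime.timedelta(seconds=timeout)

an_hour_ago = datetime.datetime.utcnow() - datetime.timedelta(hours=1)
assert deleted_old_enough(an_hour_ago, timeout=600)
assert deleted_old_enough(None, timeout=600)
# --- end sketch ---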
11435 @contextlib.contextmanager
11436 def _error_out_instance_on_exception(self, context, instance,
11437 instance_state=vm_states.ACTIVE):
11438 """Context manager to set instance.vm_state after some operation raises
11440 Used to handle NotImplementedError and InstanceFaultRollback errors
11441 and reset the instance vm_state and task_state. The vm_state is set
11442 to the $instance_state parameter and task_state is set to None.
11443 For all other types of exceptions, the vm_state is set to ERROR and
11444 the task_state is left unchanged (although most callers will have the
11445 @reverts_task_state decorator which will set the task_state to None).
11447 Re-raises the original exception *except* in the case of
11448 InstanceFaultRollback in which case the wrapped `inner_exception` is
11449 re-raised.
11451 :param context: The nova auth request context for the operation.
11452 :param instance: The instance to update. The vm_state will be set by
11453 this context manager when an exception is raised.
11454 :param instance_state: For NotImplementedError and
11455 InstanceFaultRollback this is the vm_state to set the instance to
11456 when handling one of those types of exceptions. By default the
11457 instance will be set to ACTIVE, but the caller should control this
11458 in case there have been no changes to the running state of the
11459 instance. For example, resizing a stopped server where prep_resize
11460 fails early and does not change the power state of the guest should
11461 not set the instance status to ACTIVE but remain STOPPED.
11462 This parameter is ignored for all other types of exceptions and the
11463 instance vm_state is set to ERROR.
11464 """
11465 # NOTE(mriedem): Why doesn't this method just save off the
11466 # original instance.vm_state here rather than use a parameter? Or use
11467 # instance_state=None as an override but default to the current
11468 # vm_state when rolling back.
11469 instance_uuid = instance.uuid
11470 try:
11471 yield
11472 except (NotImplementedError, exception.InstanceFaultRollback) as error:
11473 # Use reraise=False to determine if we want to raise the original
11474 # exception or something else.
11475 with excutils.save_and_reraise_exception(reraise=False) as ctxt:
11476 LOG.info("Setting instance back to %(state)s after: %(error)s",
11477 {'state': instance_state, 'error': error},
11478 instance_uuid=instance_uuid)
11479 self._instance_update(context, instance,
11480 vm_state=instance_state,
11481 task_state=None)
11482 if isinstance(error, exception.InstanceFaultRollback):
11483 # Raise the wrapped exception.
11484 raise error.inner_exception
11485 # Else re-raise the NotImplementedError.
11486 ctxt.reraise = True
11487 except Exception:
11488 LOG.exception('Setting instance vm_state to ERROR',
11489 instance_uuid=instance_uuid)
11490 with excutils.save_and_reraise_exception():
11491 # NOTE(mriedem): Why don't we pass clean_task_state=True here?
11492 self._set_instance_obj_error_state(instance)
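# --- Editor's sketch (illustrative only; not part of manager.py) ---
# The context manager above maps the exception type to the vm_state it
# restores: known, recoverable failures roll back to the caller-chosen state,
# everything else marks the instance as errored. A cut-down generic version
# with the nova instance object replaced by a plain dict (an assumption):
import contextlib

class FaultRollback(Exception):
    def __init__(self, inner):
        super().__init__(str(inner))
        self.inner_exception = inner

@contextlib.contextmanager
def error_out_on_exception(instance, rollback_state='active'):
    try:
        yield
    except (NotImplementedError, FaultRollback) as error:
        # recoverable: restore the requested state and clear the task
        instance['vm_state'] = rollback_state
        instance['task_state'] = None
        if isinstance(error, FaultRollback):
            raise error.inner_exception
        raise
    except Exception:
        # unexpected: flag the instance as errored and re-raise
        instance['vm_state'] = 'error'
        raise

inst = {'vm_state': 'stopped', 'task_state': 'resize_prep'}
try:
    with error_out_on_exception(inst, rollback_state='stopped'):
        raise NotImplementedError('driver does not support this')
except NotImplementedError:
    assert inst == {'vm_state': 'stopped', 'task_state': None}
# --- end sketch ---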
11494 def _process_instance_event(self, instance, event):
11495 _event = self.instance_events.pop_instance_event(instance, event)
11496 if _event:
11497 LOG.debug('Processing event %(event)s',
11498 {'event': event.key}, instance=instance)
11499 _event.send(event)
11500 else:
11501 # If it's a network-vif-unplugged event and the instance is being
11502 # deleted or live migrated then we don't need to make this a
11503 # warning as it's expected. There are other expected things which
11504 # could trigger this event like detaching an interface, but we
11505 # don't have a task state for that.
11506 # TODO(mriedem): We have other move operations and things like
11507 # hard reboot (probably rebuild as well) which trigger this event
11508 # but nothing listens for network-vif-unplugged. We should either
11509 # handle those other known cases or consider just not logging a
11510 # warning if we get this event and the instance is undergoing some
11511 # task state transition.
11512 if (event.name == 'network-vif-unplugged' and
11513 instance.task_state in (
11514 task_states.DELETING, task_states.MIGRATING)):
11515 LOG.debug('Received event %s for instance with task_state %s.',
11516 event.key, instance.task_state, instance=instance)
11517 else:
11518 LOG.warning('Received unexpected event %(event)s for '
11519 'instance with vm_state %(vm_state)s and '
11520 'task_state %(task_state)s.',
11521 {'event': event.key,
11522 'vm_state': instance.vm_state,
11523 'task_state': instance.task_state},
11524 instance=instance)
11526 def _process_instance_vif_deleted_event(self, context, instance,
11527 deleted_vif_id):
11528 # If an attached port is deleted by neutron, it needs to
11529 # be detached from the instance, and the info cache needs
11530 # to be updated.
11531 network_info = instance.info_cache.network_info
11532 for index, vif in enumerate(network_info):
11533 if vif['id'] == deleted_vif_id:
11534 LOG.info('Neutron deleted interface %(intf)s; '
11535 'detaching it from the instance and '
11536 'deleting it from the info cache',
11537 {'intf': vif['id']},
11538 instance=instance)
11539 profile = vif.get('profile', {}) or {} # profile can be None
11540 rps = profile.get('allocation')
11541 if rps:
11542 if isinstance(rps, dict):
11543 # if extended resource request extension is enabled
11544 # then we have a dict of providers, flatten it for the
11545 # log.
11546 rps = ','.join(rps.values())
11547 LOG.error(
11548 'The bound port %(port_id)s was deleted in Neutron but '
11549 'the resource allocations on the resource providers '
11550 '%(rp_uuid)s are leaked until the server '
11551 '%(server_uuid)s is deleted.',
11552 {'port_id': vif['id'],
11553 'rp_uuid': rps,
11554 'server_uuid': instance.uuid})
11556 del network_info[index]
11557 neutron.update_instance_cache_with_nw_info(
11558 self.network_api, context, instance, nw_info=network_info)
11559 try:
11560 self.driver.detach_interface(context, instance, vif)
11561 except NotImplementedError:
11562 # Not all virt drivers support attach/detach of interfaces
11563 # yet (like Ironic), so just ignore this.
11564 pass
11565 except exception.NovaException as ex:
11566 # If the instance was deleted before the interface was
11567 # detached, just log it at debug.
11568 log_level = (logging.DEBUG
11569 if isinstance(ex, exception.InstanceNotFound)
11570 else logging.WARNING)
11571 LOG.log(log_level,
11572 "Detach interface failed, "
11573 "port_id=%(port_id)s, reason: %(msg)s",
11574 {'port_id': deleted_vif_id, 'msg': ex},
11575 instance=instance)
11576 break
11578 @wrap_instance_event(prefix='compute')
11579 @wrap_instance_fault
11580 def extend_volume(self, context, instance, extended_volume_id):
11582 # If an attached volume is extended by cinder, it needs to
11583 # be extended by the virt driver so the host can detect its
11584 # new size, and the bdm needs to be updated.
11585 LOG.debug('Handling volume-extended event for volume %(vol)s',
11586 {'vol': extended_volume_id}, instance=instance)
11588 try:
11589 bdm = objects.BlockDeviceMapping.get_by_volume_and_instance(
11590 context, extended_volume_id, instance.uuid)
11591 except exception.NotFound:
11592 LOG.warning('Extend volume failed, '
11593 'volume %(vol)s is not attached to instance.',
11594 {'vol': extended_volume_id},
11595 instance=instance)
11596 return
11598 LOG.info('Cinder extended volume %(vol)s; '
11599 'extending it to detect new size',
11600 {'vol': extended_volume_id},
11601 instance=instance)
11602 volume = self.volume_api.get(context, bdm.volume_id)
11604 if bdm.connection_info is None:
11605 LOG.warning('Extend volume failed, '
11606 'attached volume %(vol)s has no connection_info',
11607 {'vol': extended_volume_id},
11608 instance=instance)
11609 return
11611 connection_info = jsonutils.loads(bdm.connection_info)
11612 bdm.volume_size = volume['size']
11613 bdm.save()
11615 if not self.driver.capabilities.get('supports_extend_volume', False):
11616 raise exception.ExtendVolumeNotSupported()
11618 try:
11619 self.driver.extend_volume(context, connection_info, instance,
11620 bdm.volume_size * units.Gi)
11621 except Exception as ex:
11622 LOG.warning('Extend volume failed, '
11623 'volume_id=%(volume_id)s, reason: %(msg)s',
11624 {'volume_id': extended_volume_id, 'msg': ex},
11625 instance=instance)
11626 raise
11628 @staticmethod
11629 def _is_state_valid_for_power_update_event(instance, target_power_state):
11630 """Check if the current state of the instance allows it to be
11631 a candidate for the power-update event.
11633 :param instance: The nova instance object.
11634 :param target_power_state: The desired target power state; this should
11635 either be "POWER_ON" or "POWER_OFF".
11636 :returns Boolean: True if the instance can be subjected to the
11637 power-update event.
11638 """
11639 if ((target_power_state == external_event_obj.POWER_ON and
11640 instance.task_state is None and
11641 instance.vm_state == vm_states.STOPPED and
11642 instance.power_state == power_state.SHUTDOWN) or
11643 (target_power_state == external_event_obj.POWER_OFF and
11644 instance.task_state is None and
11645 instance.vm_state == vm_states.ACTIVE and
11646 instance.power_state == power_state.RUNNING)):
11647 return True
11648 return False
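# --- Editor's sketch (illustrative only; not part of manager.py) ---
# The check above accepts exactly two combinations, and only when no task is
# in flight: POWER_ON for a stopped/shutdown instance and POWER_OFF for an
# active/running one. An equivalent lookup using plain strings:
_VALID_POWER_UPDATE = {
    'POWER_ON': ('stopped', 'shutdown'),
    'POWER_OFF': ('active', 'running'),
}

def is_valid_power_update(target, task_state, vm_state, power_state_name):
    expected = _VALID_POWER_UPDATE.get(target)
    return (expected is not None and
            task_state is None and
            (vm_state, power_state_name) == expected)

assert is_valid_power_update('POWER_ON', None, 'stopped', 'shutdown')
assert not is_valid_power_update('POWER_ON', 'powering-on', 'stopped', 'shutdown')
# --- end sketch ---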
11650 @wrap_exception()
11651 @reverts_task_state
11652 @wrap_instance_event(prefix='compute')
11653 @wrap_instance_fault
11654 def power_update(self, context, instance, target_power_state):
11655 """Power update of an instance prompted by an external event.
11656 :param context: The API request context.
11657 :param instance: The nova instance object.
11658 :param target_power_state: The desired target power state;
11659 this should either be "POWER_ON" or
11660 "POWER_OFF".
11661 """
11663 @utils.synchronized(instance.uuid)
11664 def do_power_update():
11665 LOG.debug('Handling power-update event with target_power_state %s '
11666 'for instance', target_power_state, instance=instance)
11667 if not self._is_state_valid_for_power_update_event(
11668 instance, target_power_state):
11669 pow_state = fields.InstancePowerState.from_index(
11670 instance.power_state)
11671 LOG.info('The power-update %(tag)s event for instance '
11672 '%(uuid)s is a no-op since the instance is in '
11673 'vm_state %(vm_state)s, task_state '
11674 '%(task_state)s and power_state '
11675 '%(power_state)s.',
11676 {'tag': target_power_state, 'uuid': instance.uuid,
11677 'vm_state': instance.vm_state,
11678 'task_state': instance.task_state,
11679 'power_state': pow_state})
11680 return
11681 LOG.debug("Trying to %s instance",
11682 target_power_state, instance=instance)
11683 if target_power_state == external_event_obj.POWER_ON:
11684 action = fields.NotificationAction.POWER_ON
11685 notification_name = "power_on."
11686 instance.task_state = task_states.POWERING_ON
11687 else:
11688 # It's POWER_OFF
11689 action = fields.NotificationAction.POWER_OFF
11690 notification_name = "power_off."
11691 instance.task_state = task_states.POWERING_OFF
11692 instance.progress = 0
11694 try:
11695 # Note that the task_state is set here rather than the API
11696 # because this is a best effort operation and deferring
11697 # updating the task_state until we get to the compute service
11698 # avoids error handling in the API and needing to account for
11699 # older compute services during rolling upgrades from Stein.
11700 # If we lose a race, UnexpectedTaskStateError is handled
11701 # below.
11702 instance.save(expected_task_state=[None])
11703 self._notify_about_instance_usage(context, instance,
11704 notification_name + "start")
11705 compute_utils.notify_about_instance_action(context, instance,
11706 self.host, action=action,
11707 phase=fields.NotificationPhase.START)
11708 # UnexpectedTaskStateError raised from the driver will be
11709 # handled below and not result in a fault, error notification
11710 # or failure of the instance action. Other driver errors like
11711 # NotImplementedError will record a fault, send an error
11712 # notification and mark the instance action as failed.
11713 self.driver.power_update_event(instance, target_power_state)
11714 self._notify_about_instance_usage(context, instance,
11715 notification_name + "end")
11716 compute_utils.notify_about_instance_action(context, instance,
11717 self.host, action=action,
11718 phase=fields.NotificationPhase.END)
11719 except exception.UnexpectedTaskStateError as e:
11720 # Handling the power-update event is best effort and if we lost
11721 # a race with some other action happening to the instance we
11722 # just log it and return rather than fail the action.
11723 LOG.info("The power-update event was possibly preempted: %s ",
11724 e.format_message(), instance=instance)
11725 return
11726 do_power_update()
11728 @wrap_exception()
11729 def external_instance_event(self, context, instances, events):
11730 # NOTE(danms): Some event types are handled by the manager, such
11731 # as when we're asked to update the instance's info_cache. If it's
11732 # not one of those, look for some thread(s) waiting for the event and
11733 # unblock them if so.
11734 for event in events:
11735 instance = [inst for inst in instances
11736 if inst.uuid == event.instance_uuid][0]
11737 LOG.debug('Received event %(event)s',
11738 {'event': event.key},
11739 instance=instance)
11740 if event.name == 'network-changed':
11741 try:
11742 LOG.debug('Refreshing instance network info cache due to '
11743 'event %s.', event.key, instance=instance)
11744 self.network_api.get_instance_nw_info(
11745 context, instance, refresh_vif_id=event.tag)
11746 except exception.NotFound as e:
11747 LOG.info('Failed to process external instance event '
11748 '%(event)s due to: %(error)s',
11749 {'event': event.key, 'error': str(e)},
11750 instance=instance)
11751 elif event.name == 'network-vif-deleted':
11752 try:
11753 self._process_instance_vif_deleted_event(context,
11754 instance,
11755 event.tag)
11756 except exception.NotFound as e:
11757 LOG.info('Failed to process external instance event '
11758 '%(event)s due to: %(error)s',
11759 {'event': event.key, 'error': str(e)},
11760 instance=instance)
11761 elif event.name == 'volume-extended':
11762 self.extend_volume(context, instance, event.tag)
11763 elif event.name == 'power-update':
11764 self.power_update(context, instance, event.tag)
11765 else:
11766 self._process_instance_event(instance, event)
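# --- Editor's sketch (illustrative only; not part of manager.py) ---
# The per-event branching above routes each external event by name, falling
# back to waking up any thread waiting on the event. The same idea as a
# dispatch table; the handlers here are hypothetical stand-ins:
def dispatch_external_event(event, handlers, default_handler):
    handler = handlers.get(event['name'], default_handler)
    return handler(event)

handled = []
handlers = {
    'network-changed': lambda e: handled.append(('refresh-cache', e['tag'])),
    'volume-extended': lambda e: handled.append(('extend-volume', e['tag'])),
}
dispatch_external_event({'name': 'volume-extended', 'tag': 'vol-1'},
                        handlers, lambda e: handled.append(('wait', None)))
assert handled == [('extend-volume', 'vol-1')]
# --- end sketch ---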
11768 @periodic_task.periodic_task(spacing=CONF.image_cache.manager_interval,
11769 external_process_ok=True)
11770 def _run_image_cache_manager_pass(self, context):
11771 """Run a single pass of the image cache manager."""
11773 if not self.driver.capabilities.get("has_imagecache", False):
11774 return
11776 # Determine what other nodes use this storage
11777 storage_users.register_storage_use(CONF.instances_path, CONF.host)
11778 nodes = storage_users.get_storage_users(CONF.instances_path)
11780 # Filter all_instances to only include those nodes which share this
11781 # storage path.
11782 # TODO(mikal): this should be further refactored so that the cache
11783 # cleanup code doesn't know what those instances are, just a remote
11784 # count, and then this logic should be pushed up the stack.
11785 filters = {'deleted': False,
11786 'soft_deleted': True,
11787 'host': nodes}
11788 filtered_instances = objects.InstanceList.get_by_filters(context,
11789 filters, expected_attrs=[], use_slave=True)
11791 self.driver.manage_image_cache(context, filtered_instances)
11793 def cache_images(self, context, image_ids):
11794 """Ask the virt driver to pre-cache a set of base images.
11796 :param context: The RequestContext
11797 :param image_ids: The image IDs to be cached
11798 :return: A dict, keyed by image-id where the values are one of:
11799 'cached' if the image was downloaded,
11800 'existing' if the image was already in the cache,
11801 'unsupported' if the virt driver does not support caching,
11802 'error' if the virt driver raised an exception.
11803 """
11805 results = {}
11807 LOG.info('Caching %i image(s) by request', len(image_ids))
11808 for image_id in image_ids:
11809 try:
11810 cached = self.driver.cache_image(context, image_id)
11811 if cached:
11812 results[image_id] = 'cached'
11813 else:
11814 results[image_id] = 'existing'
11815 except NotImplementedError:
11816 LOG.warning('Virt driver does not support image pre-caching;'
11817 ' ignoring request')
11818 # NOTE(danms): Yes, technically we could short-circuit here to
11819 # avoid trying the rest of the images, but it's very cheap to
11820 # just keep hitting the NotImplementedError to keep the logic
11821 # clean.
11822 results[image_id] = 'unsupported'
11823 except Exception as e:
11824 results[image_id] = 'error'
11825 LOG.error('Failed to cache image %(image_id)s: %(err)s',
11826 {'image_id': image_id,
11827 'err': e})
11829 return results
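# --- Editor's sketch (illustrative only; not part of manager.py) ---
# cache_images returns one status string per requested image. A caller-side
# example of summarizing a hypothetical result dict of that shape:
import collections

results = {'img-1': 'cached', 'img-2': 'existing', 'img-3': 'error'}
summary = collections.Counter(results.values())
assert summary == {'cached': 1, 'existing': 1, 'error': 1}
failed = sorted(i for i, status in results.items() if status == 'error')
assert failed == ['img-3']
# --- end sketch ---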
11831 @periodic_task.periodic_task(spacing=CONF.instance_delete_interval)
11832 def _run_pending_deletes(self, context):
11833 """Retry any pending instance file deletes."""
11834 LOG.debug('Cleaning up deleted instances')
11835 filters = {'deleted': True,
11836 'soft_deleted': False,
11837 'host': CONF.host,
11838 'cleaned': False}
11839 attrs = ['system_metadata']
11840 with utils.temporary_mutation(context, read_deleted='yes'):
11841 instances = objects.InstanceList.get_by_filters(
11842 context, filters, expected_attrs=attrs, use_slave=True)
11843 LOG.debug('There are %d instances to clean', len(instances))
11845 for instance in instances:
11846 attempts = int(instance.system_metadata.get('clean_attempts', '0'))
11847 LOG.debug('Instance has had %(attempts)s of %(max)s '
11848 'cleanup attempts',
11849 {'attempts': attempts,
11850 'max': CONF.maximum_instance_delete_attempts},
11851 instance=instance)
11852 if attempts < CONF.maximum_instance_delete_attempts:
11853 success = self.driver.delete_instance_files(instance)
11855 instance.system_metadata['clean_attempts'] = str(attempts + 1)
11856 if success:
11857 instance.cleaned = True
11858 with utils.temporary_mutation(context, read_deleted='yes'):
11859 instance.save()
11861 @periodic_task.periodic_task(spacing=CONF.instance_delete_interval)
11862 def _cleanup_incomplete_migrations(self, context):
11863 """Cleanup on failed resize/revert-resize operation and
11864 failed rollback live migration operation.
11866 During a resize/revert-resize operation, or after a failed rollback
11867 live migration operation, if the instance gets deleted then instance
11868 files might remain on either the source or destination compute node and
11869 other specific resources might not be cleaned up because of a race
11870 condition.
11871 """
11872 LOG.debug('Cleaning up deleted instances with incomplete migration ')
11873 migration_filters = {'host': CONF.host,
11874 'status': 'error'}
11875 migrations = objects.MigrationList.get_by_filters(context,
11876 migration_filters)
11878 if not migrations:
11879 return
11881 inst_uuid_from_migrations = set([migration.instance_uuid for migration
11882 in migrations])
11884 inst_filters = {'deleted': True, 'soft_deleted': False,
11885 'uuid': inst_uuid_from_migrations}
11886 attrs = ['info_cache', 'security_groups', 'system_metadata']
11887 with utils.temporary_mutation(context, read_deleted='yes'):
11888 instances = objects.InstanceList.get_by_filters(
11889 context, inst_filters, expected_attrs=attrs, use_slave=True)
11891 for instance in instances:
11892 if instance.host == CONF.host:
11893 continue
11894 for migration in migrations:
11895 if instance.uuid != migration.instance_uuid:
11896 continue
11897 self.driver.delete_instance_files(instance)
11898 # we are not sure whether the migration_context was applied
11899 # during the incomplete migration, so we need to apply/revert
11900 # the migration_context to get instance object content matching
11901 # the current host.
11902 revert = (True if migration.source_compute == CONF.host
11903 else False)
11904 with instance.mutated_migration_context(revert=revert):
11905 self.driver.cleanup_lingering_instance_resources(instance)
11907 try:
11908 migration.status = 'failed'
11909 migration.save()
11910 except exception.MigrationNotFound:
11911 LOG.warning("Migration %s is not found.",
11912 migration.id,
11913 instance=instance)
11914 break
11916 @messaging.expected_exceptions(exception.InstanceQuiesceNotSupported,
11917 exception.QemuGuestAgentNotEnabled,
11918 exception.NovaException,
11919 NotImplementedError)
11920 @wrap_exception()
11921 def quiesce_instance(self, context, instance):
11922 """Quiesce an instance on this host."""
11923 context = context.elevated()
11924 image_meta = objects.ImageMeta.from_instance(instance)
11925 self.driver.quiesce(context, instance, image_meta)
11927 def _wait_for_snapshots_completion(self, context, mapping):
11928 for mapping_dict in mapping:
11929 if mapping_dict.get('source_type') == 'snapshot':
11931 def _wait_snapshot():
11932 snapshot = self.volume_api.get_snapshot(
11933 context, mapping_dict['snapshot_id'])
11934 if snapshot.get('status') != 'creating':
11935 raise loopingcall.LoopingCallDone()
11937 timer = loopingcall.FixedIntervalLoopingCall(_wait_snapshot)
11938 timer.start(interval=0.5).wait()
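# --- Editor's sketch (illustrative only; not part of manager.py) ---
# The helper above polls each snapshot every 0.5s with a
# FixedIntervalLoopingCall until it leaves the 'creating' state. A plain-loop
# equivalent with a simulated status source (names are hypothetical):
import time

def wait_for_snapshot(get_status, interval=0.5, max_tries=120):
    for _ in range(max_tries):
        if get_status() != 'creating':
            return
        time.sleep(interval)
    raise TimeoutError('snapshot did not finish creating in time')

statuses = iter(['creating', 'creating', 'available'])
wait_for_snapshot(lambda: next(statuses), interval=0)
# --- end sketch ---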
11940 @messaging.expected_exceptions(exception.InstanceQuiesceNotSupported,
11941 exception.QemuGuestAgentNotEnabled,
11942 exception.NovaException,
11943 NotImplementedError)
11944 @wrap_exception()
11945 def unquiesce_instance(self, context, instance, mapping=None):
11946 """Unquiesce an instance on this host.
11948 If snapshots' image mapping is provided, it waits until snapshots are
11949 completed before unquiescing.
11950 """
11951 context = context.elevated()
11952 if mapping:
11953 try:
11954 self._wait_for_snapshots_completion(context, mapping)
11955 except Exception as error:
11956 LOG.exception("Exception while waiting completion of "
11957 "volume snapshots: %s",
11958 error, instance=instance)
11959 image_meta = objects.ImageMeta.from_instance(instance)
11960 self.driver.unquiesce(context, instance, image_meta)
11962 @periodic_task.periodic_task(spacing=CONF.instance_delete_interval)
11963 def _cleanup_expired_console_auth_tokens(self, context):
11964 """Remove all expired console auth tokens.
11966 Console authorization tokens and their connection data are stored
11967 in the database when a user asks for a console connection to an
11968 instance. After a time they expire. We periodically remove any expired
11969 tokens from the database.
11970 """
11971 objects.ConsoleAuthToken.clean_expired_console_auths(context)
11973 def _claim_pci_for_instance_vifs(self, ctxt, instance):
11974 """Claim PCI devices for the instance's VIFs on the compute node
11976 :param ctxt: Context
11977 :param instance: Instance object
11978 :return: <port ID: PciDevice> mapping for the VIFs that yielded a
11979 PCI claim on the compute node
11980 """
11981 pci_req_id_to_port_id = {}
11982 pci_reqs = []
11983 port_id_to_pci_dev = {}
11985 for vif in instance.get_network_info():
11986 pci_req = pci_req_module.get_instance_pci_request_from_vif(
11987 ctxt,
11988 instance,
11989 vif)
11990 if pci_req:
11991 pci_req_id_to_port_id[pci_req.request_id] = vif['id']
11992 pci_reqs.append(pci_req)
11994 if pci_reqs:
11995 # Create PCI requests and claim against PCI resource tracker
11996 # NOTE(adrianc): We claim against the same requests as on the
11997 # source node.
11998 vif_pci_requests = objects.InstancePCIRequests(
11999 requests=pci_reqs,
12000 instance_uuid=instance.uuid)
12002 # if we are called during live migration with NUMA topology
12003 # support, the PCI claim needs to consider the destination NUMA
12004 # topology, which is then stored in the migration_context
12005 dest_topo = None
12006 if instance.migration_context:
12007 dest_topo = instance.migration_context.new_numa_topology
12009 claimed_pci_devices_objs = self.rt.claim_pci_devices(
12010 ctxt, vif_pci_requests, dest_topo)
12012 # Update VIFMigrateData profile with the newly claimed PCI
12013 # device
12014 for pci_dev in claimed_pci_devices_objs:
12015 LOG.debug("PCI device: %s Claimed on destination node",
12016 pci_dev.address)
12017 port_id = pci_req_id_to_port_id[pci_dev.request_id]
12018 port_id_to_pci_dev[port_id] = pci_dev
12020 return port_id_to_pci_dev
12022 def _update_migrate_vifs_profile_with_pci(self,
12023 migrate_vifs,
12024 port_id_to_pci_dev):
12025 """Update migrate vifs profile with the claimed PCI devices
12027 :param migrate_vifs: list of VIFMigrateData objects
12028 :param port_id_to_pci_dev: a <port_id: PciDevice> mapping
12029 :return: None.
12030 """
12031 for mig_vif in migrate_vifs:
12032 port_id = mig_vif.port_id
12033 if port_id not in port_id_to_pci_dev:
12034 continue
12036 pci_dev = port_id_to_pci_dev[port_id]
12037 profile = copy.deepcopy(mig_vif.source_vif['profile'])
12038 profile['pci_slot'] = pci_dev.address
12039 profile['pci_vendor_info'] = ':'.join([pci_dev.vendor_id,
12040 pci_dev.product_id])
12041 if profile.get('card_serial_number'):
12042 # Assume it is there since Nova makes sure that PCI devices
12043 # tagged as remote-managed have a serial in PCI VPD.
12044 profile['card_serial_number'] = pci_dev.card_serial_number
12045 if profile.get('pf_mac_address'):
12046 profile['pf_mac_address'] = pci_dev.sriov_cap['pf_mac_address']
12047 if profile.get('vf_num'):
12048 profile['vf_num'] = pci_dev.sriov_cap['vf_num']
12050 if pci_dev.mac_address:
12051 profile['device_mac_address'] = pci_dev.mac_address
12053 mig_vif.profile = profile
12054 LOG.debug("Updating migrate VIF profile for port %(port_id)s:"
12055 "%(profile)s", {'port_id': port_id,
12056 'profile': profile})
12059# TODO(sbauza): Remove this proxy class in the X release once we drop the 5.x
12060# support.
12061# NOTE(sbauza): This proxy class will support the existing <=5.13 RPC calls
12062# from any RPC client but will also make sure that the new 6.0 RPC calls will
12063# be supported.
12064class _ComputeV5Proxy(object):
12066 target = messaging.Target(version='5.13')
12068 def __init__(self, manager):
12069 self.manager = manager
12071 def __getattr__(self, name):
12072 # NOTE(sbauza): Proxying all the other methods but the V5 ones.
12073 return getattr(self.manager, name)
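# --- Editor's sketch (illustrative only; not part of manager.py) ---
# _ComputeV5Proxy overrides only the methods whose 5.x signatures need
# adapting and forwards everything else to the wrapped manager through
# __getattr__. The delegation pattern in miniature (class names hypothetical):
class Proxy(object):
    def __init__(self, target):
        self._target = target

    def __getattr__(self, name):
        # only invoked when the attribute is not found on the proxy itself
        return getattr(self._target, name)

    def ping(self, *args, **kwargs):
        # adapted signature: accept and drop a legacy positional argument
        return self._target.ping()

class Real(object):
    def ping(self):
        return 'pong'

    def status(self):
        return 'ok'

proxy = Proxy(Real())
assert proxy.ping('legacy-arg') == 'pong'  # overridden, old signature accepted
assert proxy.status() == 'ok'              # transparently forwarded
# --- end sketch ---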
12075 # 5.0 support for block_migration argument
12076 def pre_live_migration(self, context, instance, block_migration, disk,
12077 migrate_data):
12078 return self.manager.pre_live_migration(context, instance, disk,
12079 migrate_data)
12081 # 5.1 support for legacy request_spec argument
12082 def prep_resize(self, context, image, instance, instance_type,
12083 request_spec, filter_properties, node,
12084 clean_shutdown, migration, host_list):
12085 if not isinstance(request_spec, objects.RequestSpec):
12086 # Prior to compute RPC API 5.1 conductor would pass a legacy dict
12087 # version of the request spec to compute and since Stein compute
12088 # could be sending that back to conductor on reschedule, so if we
12089 # got a dict convert it to an object.
12090 # TODO(mriedem): We can drop this compat code when we only support
12091 # compute RPC API >=6.0.
12092 request_spec = objects.RequestSpec.from_primitives(
12093 context, request_spec, filter_properties)
12094 # We don't have to set the new flavor on the request spec because
12095 # if we got here it was due to a reschedule from the compute and
12096 # the request spec would already have the new flavor in it from the
12097 # else block below.
12098 self.manager.prep_resize(context, image, instance, instance_type,
12099 request_spec, filter_properties, node,
12100 clean_shutdown, migration, host_list)
12102 # 5.2 support for optional request_spec argument
12103 def resize_instance(self, context, instance, image,
12104 migration, instance_type, clean_shutdown,
12105 request_spec=None):
12106 self.manager.resize_instance(context, instance, image,
12107 migration, instance_type, clean_shutdown,
12108 request_spec)
12110 # 5.2 support for optional request_spec argument
12111 def finish_resize(self, context, disk_info, image, instance,
12112 migration, request_spec=None):
12113 self.manager.finish_resize(context, disk_info, image, instance,
12114 migration, request_spec)
12116 # 5.2 support for optional request_spec argument
12117 def revert_resize(self, context, instance, migration, request_spec=None):
12118 self.manager.revert_resize(context, instance, migration, request_spec)
12120 # 5.2 support for optional request_spec argument
12121 def finish_revert_resize(
12122 self, context, instance, migration, request_spec=None):
12123 self.manager.finish_revert_resize(context, instance, migration,
12124 request_spec)
12126 # 5.2 support for optional request_spec argument
12127 # 5.13 support for optional accel_uuids argument
12128 def unshelve_instance(self, context, instance, image, filter_properties,
12129 node, request_spec=None, accel_uuids=None):
12130 self.manager.unshelve_instance(context, instance, image,
12131 filter_properties, node, request_spec,
12132 accel_uuids or [])
12134 # 5.3 support for optional migration and limits arguments
12135 def check_can_live_migrate_destination(self, ctxt, instance,
12136 block_migration, disk_over_commit,
12137 migration=None, limits=None):
12138 return self.manager.check_can_live_migrate_destination(
12139 ctxt, instance, block_migration, disk_over_commit,
12140 migration, limits)
12142 # 5.11 support for optional accel_uuids argument
12143 def build_and_run_instance(self, context, instance, image, request_spec,
12144 filter_properties, admin_password=None,
12145 injected_files=None, requested_networks=None,
12146 security_groups=None, block_device_mapping=None,
12147 node=None, limits=None, host_list=None, accel_uuids=None):
12148 self.manager.build_and_run_instance(
12149 context, instance, image, request_spec,
12150 filter_properties, accel_uuids, admin_password,
12151 injected_files, requested_networks,
12152 security_groups, block_device_mapping,
12153 node, limits, host_list)
12155 # 5.12 support for optional accel_uuids argument
12156 def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
12157 injected_files, new_pass, orig_sys_metadata,
12158 bdms, recreate, on_shared_storage,
12159 preserve_ephemeral, migration,
12160 scheduled_node, limits, request_spec,
12161 accel_uuids=None):
12162 self.manager.rebuild_instance(
12163 context, instance, orig_image_ref, image_ref,
12164 injected_files, new_pass, orig_sys_metadata,
12165 bdms, recreate, on_shared_storage,
12166 preserve_ephemeral, migration,
12167 scheduled_node, limits, request_spec,
12168 accel_uuids, False, None)
12170 # 5.13 support for optional accel_uuids argument
12171 def shelve_instance(self, context, instance, image_id,
12172 clean_shutdown, accel_uuids=None):
12173 self.manager.shelve_instance(context, instance, image_id,
12174 clean_shutdown, accel_uuids)
12176 # 5.13 support for optional accel_uuids argument
12177 def shelve_offload_instance(self, context, instance, clean_shutdown,
12178 accel_uuids=None):
12179 self.manager.shelve_offload_instance(
12180 context, instance, clean_shutdown, accel_uuids)
12182 # 6.0 drop unused request_spec argument
12183 def prep_snapshot_based_resize_at_dest(
12184 self, ctxt, instance, flavor, nodename, migration, limits,
12185 request_spec):
12186 return self.manager.prep_snapshot_based_resize_at_dest(
12187 ctxt, instance, flavor, nodename, migration, limits)
12189 # 6.0 drop unused request_spec argument
12190 def finish_snapshot_based_resize_at_dest(
12191 self, ctxt, instance, migration, snapshot_id, request_spec):
12192 self.manager.finish_snapshot_based_resize_at_dest(
12193 ctxt, instance, migration, snapshot_id)
12195 # 6.0 drop unused instance argument
12196 def check_instance_shared_storage(self, ctxt, instance, data):
12197 return self.manager.check_instance_shared_storage(ctxt, data)