Coverage for nova/compute/manager.py: 93%

4979 statements  

coverage.py v7.6.12, created at 2025-04-17 15:08 +0000

1# Copyright 2010 United States Government as represented by the 

2# Administrator of the National Aeronautics and Space Administration. 

3# Copyright 2011 Justin Santa Barbara 

4# All Rights Reserved. 

5# 

6# Licensed under the Apache License, Version 2.0 (the "License"); you may 

7# not use this file except in compliance with the License. You may obtain 

8# a copy of the License at 

9# 

10# http://www.apache.org/licenses/LICENSE-2.0 

11# 

12# Unless required by applicable law or agreed to in writing, software 

13# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 

14# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 

15# License for the specific language governing permissions and limitations 

16# under the License. 

17 

18"""Handles all processes relating to instances (guest vms). 

19 

20The :py:class:`ComputeManager` class is a :py:class:`nova.manager.Manager` that 

21handles RPC calls relating to creating instances. It is responsible for 

22building a disk image, launching it via the underlying virtualization driver, 

23responding to calls to check its state, attaching persistent storage, and 

24terminating it. 

25 

26""" 

27 

28import base64 

29import binascii 

30import contextlib 

31import copy 

32import functools 

33import inspect 

34import math 

35import sys 

36import time 

37import traceback 

38import typing as ty 

39 

40from cinderclient import exceptions as cinder_exception 

41from cursive import exception as cursive_exception 

42import eventlet.event 

43from eventlet import greenthread 

44import eventlet.semaphore 

45import eventlet.timeout 

46import futurist 

47from keystoneauth1 import exceptions as keystone_exception 

48from openstack import exceptions as sdk_exc 

49import os_traits 

50from oslo_log import log as logging 

51import oslo_messaging as messaging 

52from oslo_serialization import jsonutils 

53from oslo_service import loopingcall 

54from oslo_service import periodic_task 

55from oslo_utils import excutils 

56from oslo_utils import strutils 

57from oslo_utils import timeutils 

58from oslo_utils import units 

59 

60from nova.accelerator import cyborg 

61from nova import block_device 

62from nova.compute import api as compute 

63from nova.compute import build_results 

64from nova.compute import claims 

65from nova.compute import power_state 

66from nova.compute import resource_tracker 

67from nova.compute import rpcapi as compute_rpcapi 

68from nova.compute import task_states 

69from nova.compute import utils as compute_utils 

70from nova.compute.utils import wrap_instance_event 

71from nova.compute import vm_states 

72from nova import conductor 

73import nova.conf 

74import nova.context 

75from nova import exception 

76from nova import exception_wrapper 

77from nova.i18n import _ 

78from nova.image import glance 

79from nova import manager 

80from nova.network import model as network_model 

81from nova.network import neutron 

82from nova import objects 

83from nova.objects import base as obj_base 

84from nova.objects import external_event as external_event_obj 

85from nova.objects import fields 

86from nova.objects import instance as obj_instance 

87from nova.objects import migrate_data as migrate_data_obj 

88from nova.objects import service as service_obj 

89from nova.pci import request as pci_req_module 

90from nova.pci import whitelist 

91from nova import safe_utils 

92from nova.scheduler.client import query 

93from nova.scheduler.client import report 

94from nova.scheduler import utils as scheduler_utils 

95from nova.share import manila 

96from nova import utils 

97from nova.virt import block_device as driver_block_device 

98from nova.virt import configdrive 

99from nova.virt import driver 

100from nova.virt import event as virtevent 

101from nova.virt import hardware 

102import nova.virt.node 

103from nova.virt import storage_users 

104from nova.virt import virtapi 

105from nova.volume import cinder 

106 

107CONF = nova.conf.CONF 

108 

109LOG = logging.getLogger(__name__) 

110 

111wrap_exception = functools.partial( 

112 exception_wrapper.wrap_exception, service='compute', binary='nova-compute') 

113 

114 

115@contextlib.contextmanager 

116def errors_out_migration_ctxt(migration): 

117 """Context manager to error out migration on failure.""" 

118 

119 try: 

120 yield 

121 except Exception: 

122 with excutils.save_and_reraise_exception(): 

123 if migration: 

124 # We may have been passed None for our migration if we're 

125 # receiving from an older client. The migration will be 

126 # errored via the legacy path. 

127 migration.status = 'error' 

128 try: 

129 migration.save() 

130 except Exception: 

131 LOG.debug( 

132 'Error setting migration status for instance %s.', 

133 migration.instance_uuid, exc_info=True) 

134 

135 

136@utils.expects_func_args('migration') 

137def errors_out_migration(function): 

138 """Decorator to error out migration on failure.""" 

139 

140 @functools.wraps(function) 

141 def decorated_function(self, context, *args, **kwargs): 

142 wrapped_func = safe_utils.get_wrapped_function(function) 

143 keyed_args = inspect.getcallargs(wrapped_func, self, context, 

144 *args, **kwargs) 

145 migration = keyed_args['migration'] 

146 with errors_out_migration_ctxt(migration): 

147 return function(self, context, *args, **kwargs) 

148 

149 return decorated_function 

150 

151 
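A minimal sketch of the behaviour the two helpers above provide, using a hypothetical stand-in for objects.Migration (illustration only, not part of manager.py):

class FakeMigration:
    # Hypothetical stand-in for objects.Migration, just enough for the sketch.
    instance_uuid = 'fake-instance-uuid'
    status = 'migrating'

    def save(self):
        pass  # a real Migration object would persist the status change here

migration = FakeMigration()
try:
    with errors_out_migration_ctxt(migration):
        raise RuntimeError('simulated failure during a migration step')
except RuntimeError:
    # The context manager re-raises the original error, but only after
    # flipping the migration status to 'error' and saving it.
    assert migration.status == 'error'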

152@utils.expects_func_args('instance') 

153def reverts_task_state(function): 

154 """Decorator to revert task_state on failure.""" 

155 

156 @functools.wraps(function) 

157 def decorated_function(self, context, *args, **kwargs): 

158 try: 

159 return function(self, context, *args, **kwargs) 

160 except exception.UnexpectedTaskStateError as e: 

161 # Note(maoy): unexpected task state means the current 

162 # task is preempted. Do not clear task state in this 

163 # case. 

164 with excutils.save_and_reraise_exception(): 

165 LOG.info("Task possibly preempted: %s", 

166 e.format_message()) 

167 except Exception: 

168 with excutils.save_and_reraise_exception(): 

169 wrapped_func = safe_utils.get_wrapped_function(function) 

170 keyed_args = inspect.getcallargs(wrapped_func, self, context, 

171 *args, **kwargs) 

172 # NOTE(mriedem): 'instance' must be in keyed_args because we 

173 # have utils.expects_func_args('instance') decorating this 

174 # method. 

175 instance = keyed_args['instance'] 

176 original_task_state = instance.task_state 

177 try: 

178 self._instance_update(context, instance, task_state=None) 

179 LOG.info("Successfully reverted task state from %s on " 

180 "failure for instance.", 

181 original_task_state, instance=instance) 

182 except exception.InstanceNotFound: 

183 # We might delete an instance that failed to build shortly 

184 # after it errored out; this is an expected case and we 

185 # should not trace on it. 

186 pass 

187 except Exception as e: 

188 LOG.warning("Failed to revert task state for instance. " 

189 "Error: %s", e, instance=instance) 

190 

191 return decorated_function 

192 

193 

194@utils.expects_func_args('instance') 

195def wrap_instance_fault(function): 

196 """Wraps a method to catch exceptions related to instances. 

197 

198 This decorator wraps a method to catch any exceptions having to do with 

199 an instance that may get thrown. It then logs an instance fault in the db. 

200 """ 

201 

202 @functools.wraps(function) 

203 def decorated_function(self, context, *args, **kwargs): 

204 try: 

205 return function(self, context, *args, **kwargs) 

206 except exception.InstanceNotFound: 

207 raise 

208 except Exception as e: 

209 # NOTE(gtt): If argument 'instance' is in args rather than kwargs, 

210 # we will get a KeyError exception which will cover up the real 

211 # exception. So, we update kwargs with the values from args first. 

212 # Then, we can get 'instance' from kwargs easily. 

213 kwargs.update(dict(zip(function.__code__.co_varnames[2:], args))) 

214 

215 with excutils.save_and_reraise_exception(): 

216 compute_utils.add_instance_fault_from_exc(context, 

217 kwargs['instance'], e, sys.exc_info()) 

218 

219 return decorated_function 

220 

221 

222@utils.expects_func_args('image_id', 'instance') 

223def delete_image_on_error(function): 

224 """Used for snapshot related method to ensure the image created in 

225 compute.api is deleted when an error occurs. 

226 """ 

227 

228 @functools.wraps(function) 

229 def decorated_function(self, context, image_id, instance, 

230 *args, **kwargs): 

231 try: 

232 return function(self, context, image_id, instance, 

233 *args, **kwargs) 

234 except Exception: 

235 with excutils.save_and_reraise_exception(): 

236 compute_utils.delete_image( 

237 context, instance, self.image_api, image_id, 

238 log_exc_info=True) 

239 

240 return decorated_function 

241 

242 

243# Each collection of events is a dict of eventlet Events keyed by a tuple of 

244# event name and associated tag 

245_InstanceEvents = ty.Dict[ty.Tuple[str, str], eventlet.event.Event] 

246 

247 

248class InstanceEvents(object): 

249 def __init__(self): 

250 self._events: ty.Optional[ty.Dict[str, _InstanceEvents]] = {} 

251 

252 @staticmethod 

253 def _lock_name(instance) -> str: 

254 return '%s-%s' % (instance.uuid, 'events') 

255 

256 def prepare_for_instance_event( 

257 self, 

258 instance: 'objects.Instance', 

259 name: str, 

260 tag: str, 

261 ) -> eventlet.event.Event: 

262 """Prepare to receive an event for an instance. 

263 

264 This will register an event for the given instance that we will 

265 wait on later. This should be called before initiating whatever 

266 action will trigger the event. The resulting eventlet.event.Event 

267 object should be wait()'d on to ensure completion. 

268 

269 :param instance: the instance for which the event will be generated 

270 :param name: the name of the event we're expecting 

271 :param tag: the tag associated with the event we're expecting 

272 :returns: an event object that should be wait()'d on 

273 """ 

274 @utils.synchronized(self._lock_name(instance)) 

275 def _create_or_get_event(): 

276 if self._events is None: 

277 # NOTE(danms): We really should have a more specific error 

278 # here, but this is what we use for our default error case 

279 raise exception.NovaException( 

280 'In shutdown, no new events can be scheduled') 

281 

282 instance_events = self._events.setdefault(instance.uuid, {}) 

283 return instance_events.setdefault((name, tag), 

284 eventlet.event.Event()) 

285 LOG.debug('Preparing to wait for external event %(name)s-%(tag)s', 

286 {'name': name, 'tag': tag}, instance=instance) 

287 return _create_or_get_event() 

288 

289 def pop_instance_event(self, instance, event): 

290 """Remove a pending event from the wait list. 

291 

292 This will remove a pending event from the wait list so that it 

293 can be used to signal the waiters to wake up. 

294 

295 :param instance: the instance for which the event was generated 

296 :param event: the nova.objects.external_event.InstanceExternalEvent 

297 that describes the event 

298 :returns: the eventlet.event.Event object on which the waiters 

299 are blocked 

300 """ 

301 no_events_sentinel = object() 

302 no_matching_event_sentinel = object() 

303 

304 @utils.synchronized(self._lock_name(instance)) 

305 def _pop_event(): 

306 if self._events is None: 

307 LOG.debug('Unexpected attempt to pop events during shutdown', 

308 instance=instance) 

309 return no_events_sentinel 

310 events = self._events.get(instance.uuid) 

311 if not events:    [311 ↛ 312: line 311 didn't jump to line 312 because the condition on line 311 was never true]

312 return no_events_sentinel 

313 _event = events.pop((event.name, event.tag), None) 

314 if not events:    [314 ↛ 316: line 314 didn't jump to line 316 because the condition on line 314 was always true]

315 del self._events[instance.uuid] 

316 if _event is None:    [316 ↛ 317: line 316 didn't jump to line 317 because the condition on line 316 was never true]

317 return no_matching_event_sentinel 

318 return _event 

319 

320 result = _pop_event() 

321 if result is no_events_sentinel: 

322 LOG.debug('No waiting events found dispatching %(event)s', 

323 {'event': event.key}, 

324 instance=instance) 

325 return None 

326 elif result is no_matching_event_sentinel:    [326 ↛ 327: line 326 didn't jump to line 327 because the condition on line 326 was never true]

327 LOG.debug( 

328 'No event matching %(event)s in %(events)s', 

329 { 

330 'event': event.key, 

331 # mypy can't identify the none check in _pop_event 

332 'events': self._events.get( # type: ignore 

333 instance.uuid, {}).keys(), 

334 }, 

335 instance=instance, 

336 ) 

337 return None 

338 else: 

339 return result 

340 

341 def clear_events_for_instance(self, instance): 

342 """Remove all pending events for an instance. 

343 

344 This will remove all events currently pending for an instance 

345 and return them (indexed by event name). 

346 

347 :param instance: the instance for which events should be purged 

348 :returns: a dictionary of {event_name: eventlet.event.Event} 

349 """ 

350 @utils.synchronized(self._lock_name(instance)) 

351 def _clear_events(): 

352 if self._events is None: 

353 LOG.debug('Unexpected attempt to clear events during shutdown', 

354 instance=instance) 

355 return dict() 

356 # NOTE(danms): We have historically returned the raw internal 

357 # format here, which is {event.key: [events, ...]} so just 

358 # trivially convert it here. 

359 return {'%s-%s' % k: e 

360 for k, e in self._events.pop(instance.uuid, {}).items()} 

361 return _clear_events() 

362 

363 def cancel_all_events(self): 

364 if self._events is None: 

365 LOG.debug('Unexpected attempt to cancel events during shutdown.') 

366 return 

367 our_events = self._events 

368 # NOTE(danms): Block new events 

369 self._events = None 

370 

371 for instance_uuid, events in our_events.items(): 

372 for (name, tag), eventlet_event in events.items(): 

373 LOG.debug('Canceling in-flight event %(name)s-%(tag)s for ' 

374 'instance %(instance_uuid)s', 

375 {'name': name, 

376 'tag': tag, 

377 'instance_uuid': instance_uuid}) 

378 event = objects.InstanceExternalEvent( 

379 instance_uuid=instance_uuid, 

380 name=name, status='failed', 

381 tag=tag, data={}) 

382 eventlet_event.send(event) 

383 
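A minimal sketch of the prepare/pop hand-off implemented by InstanceEvents; the Stub classes below are assumptions for illustration and are not part of manager.py:

class StubInstance:
    uuid = 'fake-instance-uuid'

class StubExternalEvent:
    # Mimics objects.InstanceExternalEvent just enough for this sketch.
    name = 'network-vif-plugged'
    tag = 'port-1'
    key = 'network-vif-plugged-port-1'
    status = 'completed'

instance_events = InstanceEvents()
instance = StubInstance()

# 1. A waiter registers interest before triggering the external action.
waiter = instance_events.prepare_for_instance_event(
    instance, 'network-vif-plugged', 'port-1')

# 2. When the external event arrives, the handler pops the pending event
#    and signals the waiter blocked on it.
pending = instance_events.pop_instance_event(instance, StubExternalEvent())
pending.send(StubExternalEvent())

# 3. The waiter unblocks with the delivered event object.
assert waiter.wait().status == 'completed'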

384 

385class ComputeVirtAPI(virtapi.VirtAPI): 

386 def __init__(self, compute): 

387 super(ComputeVirtAPI, self).__init__() 

388 self._compute = compute 

389 self.reportclient = compute.reportclient 

390 

391 class ExitEarly(Exception): 

392 def __init__(self, events): 

393 super(Exception, self).__init__() 

394 self.events = events 

395 

396 self._exit_early_exc = ExitEarly 

397 

398 def exit_wait_early(self, events): 

399 """Exit a wait_for_instance_event() immediately and avoid 

400 waiting for some events. 

401 

402 :param events: A list of (name, tag) tuples for events that we should 

403 skip waiting for during a wait_for_instance_event(). 

404 """ 

405 raise self._exit_early_exc(events=events) 

406 

407 def _default_error_callback(self, event_name, instance): 

408 raise exception.NovaException(_('Instance event failed')) 

409 

410 class _InstanceEvent: 

411 EXPECTED = "expected" 

412 WAITING = "waiting" 

413 RECEIVED = "received" 

414 RECEIVED_EARLY = "received early" 

415 TIMED_OUT = "timed out" 

416 RECEIVED_NOT_PROCESSED = "received but not processed" 

417 

418 def __init__(self, name: str, event: eventlet.event.Event) -> None: 

419 self.name = name 

420 self.event = event 

421 self.status = self.EXPECTED 

422 self.wait_time = None 

423 

424 def mark_as_received_early(self) -> None: 

425 self.status = self.RECEIVED_EARLY 

426 

427 def is_received_early(self) -> bool: 

428 return self.status == self.RECEIVED_EARLY 

429 

430 def _update_status_no_wait(self): 

431 if self.status == self.EXPECTED and self.event.ready(): 

432 self.status = self.RECEIVED_NOT_PROCESSED 

433 

434 def wait(self) -> 'objects.InstanceExternalEvent': 

435 self.status = self.WAITING 

436 try: 

437 with timeutils.StopWatch() as sw: 

438 instance_event = self.event.wait() 

439 except eventlet.timeout.Timeout: 

440 self.status = self.TIMED_OUT 

441 self.wait_time = sw.elapsed() 

442 

443 raise 

444 

445 self.status = self.RECEIVED 

446 self.wait_time = sw.elapsed() 

447 return instance_event 

448 

449 def __str__(self) -> str: 

450 self._update_status_no_wait() 

451 if self.status == self.EXPECTED: 

452 return f"{self.name}: expected but not received" 

453 if self.status == self.RECEIVED: 

454 return ( 

455 f"{self.name}: received after waiting " 

456 f"{self.wait_time:.2f} seconds") 

457 if self.status == self.TIMED_OUT: 

458 return ( 

459 f"{self.name}: timed out after " 

460 f"{self.wait_time:.2f} seconds") 

461 return f"{self.name}: {self.status}" 

462 

463 @staticmethod 

464 def _wait_for_instance_events( 

465 instance: 'objects.Instance', 

466 events: dict, 

467 error_callback: ty.Callable, 

468 ) -> None: 

469 for event_name, event in events.items(): 

470 if event.is_received_early(): 

471 continue 

472 else: 

473 actual_event = event.wait() 

474 if actual_event.status == 'completed': 

475 continue 

476 # If we get here, we have an event that was not completed, 

477 # nor skipped via exit_wait_early(). Decide whether to 

478 # keep waiting by calling the error_callback() hook. 

479 decision = error_callback(event_name, instance) 

480 if decision is False:    [480 ↛ 481: line 480 didn't jump to line 481 because the condition on line 480 was never true]

481 break 

482 

483 @contextlib.contextmanager 

484 def wait_for_instance_event(self, instance, event_names, deadline=300, 

485 error_callback=None): 

486 """Plan to wait for some events, run some code, then wait. 

487 

488 This context manager will first create plans to wait for the 

489 provided event_names, yield, and then wait for all the scheduled 

490 events to complete. 

491 

492 Note that this uses an eventlet.timeout.Timeout to bound the 

493 operation, so callers should be prepared to catch that 

494 failure and handle that situation appropriately. 

495 

496 If the event is not received by the specified timeout deadline, 

497 eventlet.timeout.Timeout is raised. 

498 

499 If the event is received but did not have a 'completed' 

500 status, a NovaException is raised. If an error_callback is 

501 provided, instead of raising an exception as detailed above 

502 for the failure case, the callback will be called with the 

503 event_name and instance, and can return True to continue 

504 waiting for the rest of the events, False to stop processing, 

505 or raise an exception which will bubble up to the waiter. 

506 

507 If the inner code wishes to abort waiting for one or more 

508 events because it knows some state to be finished or condition 

509 to be satisfied, it can use VirtAPI.exit_wait_early() with a 

510 list of event (name,tag) items to avoid waiting for those 

511 events upon context exit. Note that exit_wait_early() exits 

512 the context immediately and should be used to signal that all 

513 work has been completed and provide the unified list of events 

514 that need not be waited for. Waiting for the remaining events 

515 will begin immediately upon early exit as if the context was 

516 exited normally. 

517 

518 :param instance: The instance for which an event is expected 

519 :param event_names: A list of event names. Each element is a 

520 tuple of strings to indicate (name, tag), 

521 where name is required, but tag may be None. 

522 :param deadline: Maximum number of seconds we should wait for all 

523 of the specified events to arrive. 

524 :param error_callback: A function to be called if an event arrives with a status other than 'completed'; it is passed the event_name and instance (see above) 

525 

526 """ 

527 

528 if error_callback is None: 

529 error_callback = self._default_error_callback 

530 events = {} 

531 for event_name in event_names: 

532 name, tag = event_name 

533 event_name = objects.InstanceExternalEvent.make_key(name, tag) 

534 try: 

535 event = ( 

536 self._compute.instance_events.prepare_for_instance_event( 

537 instance, name, tag)) 

538 events[event_name] = self._InstanceEvent(event_name, event) 

539 except exception.NovaException: 

540 error_callback(event_name, instance) 

541 # NOTE(danms): Don't wait for any of the events. They 

542 # should all be canceled and fired immediately below, 

543 # but don't stick around if not. 

544 deadline = 0 

545 try: 

546 yield 

547 except self._exit_early_exc as e: 

548 early_events = set([objects.InstanceExternalEvent.make_key(n, t) 

549 for n, t in e.events]) 

550 

551 # If there are expected events that were received early, mark them, 

552 # so they won't be waited for later 

553 for early_event_name in early_events: 

554 if early_event_name in events:    [554 ↛ 553: line 554 didn't jump to line 553 because the condition on line 554 was always true]

555 events[early_event_name].mark_as_received_early() 

556 

557 sw = timeutils.StopWatch() 

558 sw.start() 

559 try: 

560 with eventlet.timeout.Timeout(deadline): 

561 self._wait_for_instance_events( 

562 instance, events, error_callback) 

563 except eventlet.timeout.Timeout: 

564 LOG.warning( 

565 'Timeout waiting for %(events)s for instance with ' 

566 'vm_state %(vm_state)s and task_state %(task_state)s. ' 

567 'Event states are: %(event_states)s', 

568 { 

569 'events': list(events.keys()), 

570 'vm_state': instance.vm_state, 

571 'task_state': instance.task_state, 

572 'event_states': 

573 ', '.join([str(event) for event in events.values()]), 

574 }, 

575 instance=instance) 

576 

577 raise 

578 

579 LOG.debug('Instance event wait completed in %i seconds for %s', 

580 sw.elapsed(), 

581 ','.join(x[0] for x in event_names), 

582 instance=instance) 

583 
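A usage sketch of the pattern this context manager enables, similar to how virt drivers wait for Neutron's network-vif-plugged events; virtapi, instance, plug_ports and _vif_plug_error below are placeholders, not real Nova names:

def _vif_plug_error(event_name, instance):
    # Hypothetical error callback: log the failed event and keep waiting
    # for the remaining ones.
    LOG.warning('Received failed event %s', event_name, instance=instance)
    return True

events = [('network-vif-plugged', 'port-1'),
          ('network-vif-plugged', 'port-2')]
try:
    with virtapi.wait_for_instance_event(instance, events, deadline=300,
                                         error_callback=_vif_plug_error):
        # Trigger whatever makes Neutron emit the events, e.g. plugging
        # the ports (hypothetical helper).
        plug_ports(instance)
except eventlet.timeout.Timeout:
    LOG.warning('Timed out waiting for network-vif-plugged events',
                instance=instance)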

584 def update_compute_provider_status(self, context, rp_uuid, enabled): 

585 """Used to add/remove the COMPUTE_STATUS_DISABLED trait on the provider 

586 

587 :param context: nova auth RequestContext 

588 :param rp_uuid: UUID of a compute node resource provider in Placement 

589 :param enabled: True if the node is enabled in which case the trait 

590 would be removed, False if the node is disabled in which case 

591 the trait would be added. 

592 :raises: ResourceProviderTraitRetrievalFailed 

593 :raises: ResourceProviderUpdateConflict 

594 :raises: ResourceProviderUpdateFailed 

595 :raises: TraitRetrievalFailed 

596 :raises: keystoneauth1.exceptions.ClientException 

597 """ 

598 trait_name = os_traits.COMPUTE_STATUS_DISABLED 

599 # Get the current traits (and generation) for the provider. 

600 # TODO(mriedem): Leverage the ProviderTree cache in get_provider_traits 

601 trait_info = self.reportclient.get_provider_traits(context, rp_uuid) 

602 # If the host is enabled, remove the trait (if set), else add 

603 # the trait if it doesn't already exist. 

604 original_traits = trait_info.traits 

605 new_traits = None 

606 if enabled and trait_name in original_traits: 

607 new_traits = original_traits - {trait_name} 

608 LOG.debug('Removing trait %s from compute node resource ' 

609 'provider %s in placement.', trait_name, rp_uuid) 

610 elif not enabled and trait_name not in original_traits: 

611 new_traits = original_traits | {trait_name} 

612 LOG.debug('Adding trait %s to compute node resource ' 

613 'provider %s in placement.', trait_name, rp_uuid) 

614 

615 if new_traits is not None: 

616 self.reportclient.set_traits_for_provider( 

617 context, rp_uuid, new_traits, generation=trait_info.generation) 

618 
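A brief sketch of how this helper is driven when a compute service is disabled or re-enabled; context, rp_uuid and virtapi below are placeholders:

# Disabling the service adds the COMPUTE_STATUS_DISABLED trait so the
# scheduler stops placing new instances on this node.
virtapi.update_compute_provider_status(context, rp_uuid, enabled=False)
# Re-enabling removes the trait again if it is present.
virtapi.update_compute_provider_status(context, rp_uuid, enabled=True)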

619 

620class ComputeManager(manager.Manager): 

621 """Manages the running instances from creation to destruction.""" 

622 

623 target = messaging.Target(version='6.4') 

624 

625 def __init__(self, compute_driver=None, *args, **kwargs): 

626 """Load configuration options and connect to the hypervisor.""" 

627 # We want the ComputeManager, ResourceTracker and ComputeVirtAPI all 

628 # using the same instance of SchedulerReportClient which has the 

629 # ProviderTree cache for this compute service. 

630 # NOTE(danms): We do not use the global placement client 

631 # singleton here, because the above-mentioned stack of objects 

632 # maintain local state in the client. Thus, keeping our own 

633 # private object for that stack avoids any potential conflict 

634 # with other users in our process outside of the above. 

635 self.reportclient = report.SchedulerReportClient() 

636 self.virtapi = ComputeVirtAPI(self) 

637 self.network_api = neutron.API() 

638 self.volume_api = cinder.API() 

639 self.manila_api = manila.API() 

640 self.image_api = glance.API() 

641 self._last_bw_usage_poll = 0.0 

642 self.compute_api = compute.API() 

643 self.compute_rpcapi = compute_rpcapi.ComputeAPI() 

644 self.compute_task_api = conductor.ComputeTaskAPI() 

645 self.query_client = query.SchedulerQueryClient() 

646 self.instance_events = InstanceEvents() 

647 self._sync_power_pool = eventlet.GreenPool( 

648 size=CONF.sync_power_state_pool_size) 

649 self._syncs_in_progress = {} 

650 self.send_instance_updates = ( 

651 CONF.filter_scheduler.track_instance_changes) 

652 if CONF.max_concurrent_builds != 0: 

653 self._build_semaphore = eventlet.semaphore.Semaphore( 

654 CONF.max_concurrent_builds) 

655 else: 

656 self._build_semaphore = compute_utils.UnlimitedSemaphore() 

657 if CONF.max_concurrent_snapshots > 0: 

658 self._snapshot_semaphore = eventlet.semaphore.Semaphore( 

659 CONF.max_concurrent_snapshots) 

660 else: 

661 self._snapshot_semaphore = compute_utils.UnlimitedSemaphore() 

662 if CONF.max_concurrent_live_migrations > 0: 

663 self._live_migration_executor = futurist.GreenThreadPoolExecutor( 

664 max_workers=CONF.max_concurrent_live_migrations) 

665 else: 

666 # CONF.max_concurrent_live_migrations is 0 (unlimited) 

667 self._live_migration_executor = futurist.GreenThreadPoolExecutor() 

668 # This is a dict, keyed by instance uuid, to a two-item tuple of 

669 # migration object and Future for the queued live migration. 

670 self._waiting_live_migrations = {} 

671 

672 super(ComputeManager, self).__init__(service_name="compute", 

673 *args, **kwargs) 

674 

675 # TODO(sbauza): Remove this call once we delete the V5Proxy class 

676 self.additional_endpoints.append(_ComputeV5Proxy(self)) 

677 

678 # NOTE(russellb) Load the driver last. It may call back into the 

679 # compute manager via the virtapi, so we want it to be fully 

680 # initialized before that happens. 

681 self.service_ref = None 

682 self.driver = driver.load_compute_driver(self.virtapi, compute_driver) 

683 self.rt = resource_tracker.ResourceTracker( 

684 self.host, self.driver, reportclient=self.reportclient) 

685 

686 def reset(self): 

687 LOG.info('Reloading compute RPC API') 

688 compute_rpcapi.reset_globals() 

689 self.compute_rpcapi = compute_rpcapi.ComputeAPI() 

690 self.reportclient.clear_provider_cache() 

691 

692 def _update_resource_tracker(self, context, instance): 

693 """Let the resource tracker know that an instance has changed state.""" 

694 

695 if instance.host == self.host: 

696 self.rt.update_usage(context, instance, instance.node) 

697 

698 def _instance_update(self, context, instance, **kwargs): 

699 """Update an instance in the database using kwargs as value.""" 

700 

701 for k, v in kwargs.items(): 

702 setattr(instance, k, v) 

703 instance.save() 

704 self._update_resource_tracker(context, instance) 

705 

706 def _nil_out_instance_obj_host_and_node(self, instance): 

707 # NOTE(jwcroppe): We don't do instance.save() here for performance 

708 # reasons; a call to this is expected to be immediately followed by 

709 # another call that does instance.save(), thus avoiding two writes 

710 # to the database layer. 

711 instance.host = None 

712 instance.node = None 

713 instance.compute_id = None 

714 # ResourceTracker._set_instance_host_and_node also sets launched_on 

715 # to the same value as host and is really only ever used by legacy 

716 # nova-network code, but we should also null it out to avoid confusion 

717 # if there is an instance in the database with no host set but 

718 # launched_on is set. Note that we do not care about using launched_on 

719 # as some kind of debug helper if diagnosing a build failure, that is 

720 # what instance action events are for. 

721 instance.launched_on = None 

722 # If the instance is not on a host, it's not in an aggregate and 

723 # therefore is not in an availability zone. 

724 instance.availability_zone = None 

725 

726 def _set_instance_obj_error_state(self, instance, clean_task_state=False): 

727 try: 

728 instance.vm_state = vm_states.ERROR 

729 if clean_task_state: 

730 instance.task_state = None 

731 instance.save() 

732 except exception.InstanceNotFound: 

733 LOG.debug('Instance has been destroyed from under us while ' 

734 'trying to set it to ERROR', instance=instance) 

735 

736 def _get_instances_on_driver(self, context, filters=None): 

737 """Return a list of instance records for the instances found 

738 on the hypervisor which satisfy the specified filters. If filters=None 

739 return a list of instance records for all the instances found on the 

740 hypervisor. 

741 """ 

742 if not filters: 

743 filters = {} 

744 try: 

745 driver_uuids = self.driver.list_instance_uuids() 

746 if len(driver_uuids) == 0: 

747 # Short circuit, don't waste a DB call 

748 return objects.InstanceList() 

749 filters['uuid'] = driver_uuids 

750 local_instances = objects.InstanceList.get_by_filters( 

751 context, filters, use_slave=True) 

752 return local_instances 

753 except NotImplementedError: 

754 pass 

755 

756 # The driver doesn't support uuids listing, so we'll have 

757 # to brute force. 

758 driver_instances = self.driver.list_instances() 

759 # NOTE(mjozefcz): In this case we need to apply host filter. 

760 # Without this all instance data would be fetched from db. 

761 filters['host'] = self.host 

762 instances = objects.InstanceList.get_by_filters(context, filters, 

763 use_slave=True) 

764 name_map = {instance.name: instance for instance in instances} 

765 local_instances = [] 

766 for driver_instance in driver_instances: 

767 instance = name_map.get(driver_instance) 

768 if not instance:    [768 ↛ 769: line 768 didn't jump to line 769 because the condition on line 768 was never true]

769 continue 

770 local_instances.append(instance) 

771 return local_instances 

772 

773 def _destroy_evacuated_instances(self, context, node_cache): 

774 """Destroys evacuated instances. 

775 

776 While nova-compute was down, the instances running on it could be 

777 evacuated to another host. This method looks for evacuation migration 

778 records where this is the source host and which were either started 

779 (accepted), in-progress (pre-migrating) or migrated (done). From those 

780 migration records, local instances reported by the hypervisor are 

781 compared to the instances for the migration records and those local 

782 guests are destroyed, along with instance allocation records in 

783 Placement for this node. 

784 Then allocations are removed from Placement for every instance that is 

785 evacuated from this host regardless if the instance is reported by the 

786 hypervisor or not. 

787 

788 :param context: The request context 

789 :param node_cache: A dict of ComputeNode objects keyed by the UUID of 

790 the compute node 

791 :return: A dict keyed by instance uuid mapped to Migration objects 

792 for instances that were migrated away from this host 

793 """ 

794 filters = { 

795 'source_compute': self.host, 

796 # NOTE(mriedem): Migration records that have been accepted are 

797 # included in case the source node comes back up while instances 

798 # are being evacuated to another host. We don't want the same 

799 # instance being reported from multiple hosts. 

800 # NOTE(lyarwood): pre-migrating is also included here as the 

801 # source compute can come back online shortly after the RT 

803 # claims on the destination that in turn moves the migration to 

803 # pre-migrating. If the evacuate fails on the destination host, 

804 # the user can rebuild the instance (in ERROR state) on the source 

805 # host. 

806 'status': ['accepted', 'pre-migrating', 'done'], 

807 'migration_type': fields.MigrationType.EVACUATION, 

808 } 

809 with utils.temporary_mutation(context, read_deleted='yes'): 

810 evacuations = objects.MigrationList.get_by_filters(context, 

811 filters) 

812 if not evacuations: 

813 return {} 

814 evacuations = {mig.instance_uuid: mig for mig in evacuations} 

815 

816 # TODO(mriedem): We could optimize by pre-loading the joined fields 

817 # we know we'll use, like info_cache and flavor. 

818 local_instances = self._get_instances_on_driver(context) 

819 evacuated_local_instances = {inst.uuid: inst 

820 for inst in local_instances 

821 if inst.uuid in evacuations} 

822 

823 for instance in evacuated_local_instances.values(): 

824 LOG.info('Destroying instance as it has been evacuated from ' 

825 'this host but still exists in the hypervisor', 

826 instance=instance) 

827 try: 

828 network_info = self.network_api.get_instance_nw_info( 

829 context, instance) 

830 bdi = self._get_instance_block_device_info(context, 

831 instance) 

832 evac = evacuations[instance.uuid] 

833 destroy_disks = not (self._is_instance_storage_shared( 

834 context, instance, host=evac.dest_compute)) 

835 except exception.InstanceNotFound: 

836 network_info = network_model.NetworkInfo() 

837 bdi = {} 

838 LOG.info('Instance has been marked deleted already, ' 

839 'removing it from the hypervisor.', 

840 instance=instance) 

841 # always destroy disks if the instance was deleted 

842 destroy_disks = True 

843 self.driver.destroy(context, instance, 

844 network_info, 

845 bdi, destroy_disks) 

846 

847 hostname_to_cn_uuid = { 

848 cn.hypervisor_hostname: cn.uuid 

849 for cn in node_cache.values()} 

850 

851 for instance_uuid, migration in evacuations.items(): 

852 try: 

853 if instance_uuid in evacuated_local_instances: 

854 # Avoid the db call if we already have the instance loaded 

855 # above 

856 instance = evacuated_local_instances[instance_uuid] 

857 else: 

858 instance = objects.Instance.get_by_uuid( 

859 context, instance_uuid) 

860 except exception.InstanceNotFound: 

861 # The instance is already deleted so we expect that every 

862 # allocation of that instance has already been cleaned up 

863 continue 

864 

865 LOG.info('Cleaning up allocations of the instance as it has been ' 

866 'evacuated from this host', 

867 instance=instance) 

868 if migration.source_node not in hostname_to_cn_uuid: 

869 LOG.error("Failed to clean allocation of evacuated " 

870 "instance as the source node %s is not found", 

871 migration.source_node, instance=instance) 

872 continue 

873 cn_uuid = hostname_to_cn_uuid[migration.source_node] 

874 

875 # If the instance was deleted in the interim, assume its 

876 # allocations were properly cleaned up (either by its hosting 

877 # compute service or the API). 

878 if (not instance.deleted and    [878 ↛ 882: line 878 didn't jump to line 882 because the condition on line 878 was never true]

879 not self.reportclient. 

880 remove_provider_tree_from_instance_allocation( 

881 context, instance.uuid, cn_uuid)): 

882 LOG.error("Failed to clean allocation of evacuated instance " 

883 "on the source node %s", 

884 cn_uuid, instance=instance) 

885 

886 migration.status = 'completed' 

887 migration.save() 

888 return evacuations 

889 

890 def _is_instance_storage_shared(self, context, instance, host=None): 

891 shared_storage = True 

892 data = None 

893 try: 

894 data = self.driver.check_instance_shared_storage_local(context, 

895 instance) 

896 if data:    [896 ↛ 910: line 896 didn't jump to line 910 because the condition on line 896 was always true]

897 shared_storage = (self.compute_rpcapi. 

898 check_instance_shared_storage(context, 

899 data, instance=instance, host=host)) 

900 except NotImplementedError: 

901 LOG.debug('Hypervisor driver does not support ' 

902 'instance shared storage check, ' 

903 'assuming it\'s not on shared storage', 

904 instance=instance) 

905 shared_storage = False 

906 except Exception: 

907 LOG.exception('Failed to check if instance shared', 

908 instance=instance) 

909 finally: 

910 if data: 

911 self.driver.check_instance_shared_storage_cleanup(context, 

912 data) 

913 return shared_storage 

914 

915 def _complete_partial_deletion(self, context, instance): 

916 """Complete deletion for instances in DELETED status but not marked as 

917 deleted in the DB 

918 """ 

919 instance.destroy() 

920 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( 

921 context, instance.uuid) 

922 self._complete_deletion(context, 

923 instance) 

924 self._notify_about_instance_usage(context, instance, "delete.end") 

925 compute_utils.notify_about_instance_action(context, instance, 

926 self.host, action=fields.NotificationAction.DELETE, 

927 phase=fields.NotificationPhase.END, bdms=bdms) 

928 

929 def _complete_deletion(self, context, instance): 

930 self._update_resource_tracker(context, instance) 

931 

932 # If we're configured to do deferred deletes, don't force deletion of 

933 # allocations if there's a conflict. 

934 force = False if CONF.reclaim_instance_interval > 0 else True 

935 

936 self.reportclient.delete_allocation_for_instance(context, 

937 instance.uuid, 

938 force=force) 

939 

940 self._clean_instance_console_tokens(context, instance) 

941 self._delete_scheduler_instance_info(context, instance.uuid) 

942 

943 def _validate_pinning_configuration(self, instances): 

944 if not self.driver.capabilities.get('supports_pcpus', False): 

945 return 

946 

947 for instance in instances: 

948 # ignore deleted instances 

949 if instance.deleted: 

950 continue 

951 

952 # if this is an unpinned instance and the host only has 

953 # 'cpu_dedicated_set' configured, we need to tell the operator to 

954 # correct their configuration 

955 if not instance.numa_topology or ( 

956 instance.numa_topology.cpu_policy in ( 

957 None, fields.CPUAllocationPolicy.SHARED 

958 ) 

959 ): 

960 # we don't need to check 'vcpu_pin_set' since it can't coexist 

961 # alongside 'cpu_dedicated_set' 

962 if (CONF.compute.cpu_dedicated_set and 

963 not CONF.compute.cpu_shared_set): 

964 msg = _("This host has unpinned instances but has no CPUs " 

965 "set aside for this purpose; configure '[compute] " 

966 "cpu_shared_set' instead of, or in addition to, " 

967 "'[compute] cpu_dedicated_set'") 

968 raise exception.InvalidConfiguration(msg) 

969 

970 continue 

971 

972 # ditto for pinned instances if only 'cpu_shared_set' is configured 

973 if (CONF.compute.cpu_shared_set and 

974 not CONF.compute.cpu_dedicated_set and 

975 not CONF.vcpu_pin_set): 

976 msg = _("This host has pinned instances but has no CPUs " 

977 "set aside for this purpose; configure '[compute] " 

978 "cpu_dedicated_set' instead of, or in addition to, " 

979 "'[compute] cpu_shared_set'.") 

980 raise exception.InvalidConfiguration(msg) 

981 

982 # if this is a mixed instance with both pinned and unpinned CPUs, 

983 # the host must have both 'cpu_dedicated_set' and 'cpu_shared_set' 

984 # configured. check if 'cpu_shared_set' is set. 

985 if (instance.numa_topology.cpu_policy == 

986 fields.CPUAllocationPolicy.MIXED and 

987 not CONF.compute.cpu_shared_set): 

988 msg = _("This host has mixed instance requesting both pinned " 

989 "and unpinned CPUs but hasn't set aside unpinned CPUs " 

990 "for this purpose; Configure " 

991 "'[compute] cpu_shared_set'.") 

992 raise exception.InvalidConfiguration(msg) 

993 

994 # for mixed instance check if 'cpu_dedicated_set' is set. 

995 if (instance.numa_topology.cpu_policy ==    [995 ↛ 998: line 995 didn't jump to line 998 because the condition on line 995 was never true]

996 fields.CPUAllocationPolicy.MIXED and 

997 not CONF.compute.cpu_dedicated_set): 

998 msg = _("This host has mixed instance requesting both pinned " 

999 "and unpinned CPUs but hasn't set aside pinned CPUs " 

1000 "for this purpose; Configure " 

1001 "'[compute] cpu_dedicated_set'") 

1002 raise exception.InvalidConfiguration(msg) 

1003 

1004 # also check to make sure the operator hasn't accidentally 

1005 # dropped some cores that instances are currently using 

1006 available_dedicated_cpus = (hardware.get_vcpu_pin_set() or 

1007 hardware.get_cpu_dedicated_set()) 

1008 pinned_cpus = instance.numa_topology.cpu_pinning 

1009 if available_dedicated_cpus and ( 

1010 pinned_cpus - available_dedicated_cpus): 

1011 # we can't raise an exception because of bug #1289064, 

1012 # which meant we didn't recalculate CPU pinning information 

1013 # when we live migrated a pinned instance 

1014 LOG.warning( 

1015 "Instance is pinned to host CPUs %(cpus)s " 

1016 "but one or more of these CPUs are not included in " 

1017 "either '[compute] cpu_dedicated_set' or " 

1018 "'vcpu_pin_set'; you should update these " 

1019 "configuration options to include the missing CPUs " 

1020 "or rebuild or cold migrate this instance.", 

1021 {'cpus': list(pinned_cpus)}, 

1022 instance=instance) 

1023 
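For reference, the host-side CPU partitioning this check validates is configured in nova.conf; a minimal sketch with placeholder CPU ranges:

[compute]
# CPUs usable by unpinned (shared) instances
cpu_shared_set = 0-3
# CPUs reserved for pinned (dedicated) instances
cpu_dedicated_set = 4-15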

1024 def _validate_vtpm_configuration(self, instances): 

1025 if self.driver.capabilities.get('supports_vtpm', False): 

1026 return 

1027 

1028 for instance in instances: 

1029 if instance.deleted:    [1029 ↛ 1030: line 1029 didn't jump to line 1030 because the condition on line 1029 was never true]

1030 continue 

1031 

1032 # NOTE(stephenfin): We don't have an attribute on the instance to 

1033 # check for this, so we need to inspect the flavor/image metadata 

1034 if hardware.get_vtpm_constraint( 

1035 instance.flavor, instance.image_meta, 

1036 ): 

1037 msg = _( 

1038 'This host has instances with the vTPM feature enabled, ' 

1039 'but the host is not correctly configured; enable ' 

1040 'vTPM support.' 

1041 ) 

1042 raise exception.InvalidConfiguration(msg) 

1043 

1044 def _reset_live_migration(self, context, instance): 

1045 migration = None 

1046 try: 

1047 migration = objects.Migration.get_by_instance_and_status( 

1048 context, instance.uuid, 'running') 

1049 if migration:    [1049 ↛ 1055: line 1049 didn't jump to line 1055 because the condition on line 1049 was always true]

1050 self.live_migration_abort(context, instance, migration.id) 

1051 except Exception: 

1052 LOG.exception('Failed to abort live-migration', 

1053 instance=instance) 

1054 finally: 

1055 if migration:    [1055 ↛ 1057: line 1055 didn't jump to line 1057 because the condition on line 1055 was always true]

1056 self._set_migration_status(migration, 'error') 

1057 LOG.info('Instance found in migrating state during ' 

1058 'startup. Resetting task_state', 

1059 instance=instance) 

1060 instance.task_state = None 

1061 instance.save(expected_task_state=[task_states.MIGRATING]) 

1062 

1063 def _init_instance(self, context, instance): 

1064 """Initialize this instance during service init.""" 

1065 

1066 # NOTE(danms): If the instance appears to not be owned by this 

1067 # host, it may have been evacuated away, but skipped by the 

1068 # evacuation cleanup code due to configuration. Thus, if that 

1069 # is a possibility, don't touch the instance in any way, but 

1070 # log the concern. This will help avoid potential issues on 

1071 # startup due to misconfiguration. 

1072 if instance.host != self.host: 

1073 LOG.warning('Instance %(uuid)s appears to not be owned ' 

1074 'by this host, but by %(host)s. Startup ' 

1075 'processing is being skipped.', 

1076 {'uuid': instance.uuid, 

1077 'host': instance.host}) 

1078 return 

1079 

1080 # Instances that are shut down, or in an error state can not be 

1081 # initialized and are not attempted to be recovered. The exception 

1082 # to this are instances that are in RESIZE_MIGRATING or DELETING, 

1083 # which are dealt with further down. 

1084 if (instance.vm_state == vm_states.SOFT_DELETED or    [1084 ↛ 1088: line 1084 didn't jump to line 1088 because the condition on line 1084 was never true]

1085 (instance.vm_state == vm_states.ERROR and 

1086 instance.task_state not in 

1087 (task_states.RESIZE_MIGRATING, task_states.DELETING))): 

1088 LOG.debug("Instance is in %s state.", 

1089 instance.vm_state, instance=instance) 

1090 return 

1091 

1092 if instance.vm_state == vm_states.DELETED: 

1093 try: 

1094 self._complete_partial_deletion(context, instance) 

1095 except Exception: 

1096 # we don't want an exception to block the init_host 

1097 LOG.exception('Failed to complete a deletion', 

1098 instance=instance) 

1099 return 

1100 

1101 if (instance.vm_state == vm_states.BUILDING or 

1102 instance.task_state in [task_states.SCHEDULING, 

1103 task_states.BLOCK_DEVICE_MAPPING, 

1104 task_states.NETWORKING, 

1105 task_states.SPAWNING]): 

1106 # NOTE(dave-mcnally) compute stopped before instance was fully 

1107 # spawned so set to ERROR state. This is safe to do as the state 

1108 # may be set by the api but the host is not so if we get here the 

1109 # instance has already been scheduled to this particular host. 

1110 LOG.debug("Instance failed to spawn correctly, " 

1111 "setting to ERROR state", instance=instance) 

1112 self._set_instance_obj_error_state(instance, clean_task_state=True) 

1113 return 

1114 

1115 if (instance.vm_state in [vm_states.ACTIVE, vm_states.STOPPED] and 

1116 instance.task_state in [task_states.REBUILDING, 

1117 task_states.REBUILD_BLOCK_DEVICE_MAPPING, 

1118 task_states.REBUILD_SPAWNING]): 

1119 # NOTE(jichenjc) compute stopped before instance was fully 

1120 # spawned so set to ERROR state. This is consistent with BUILD 

1121 LOG.debug("Instance failed to rebuild correctly, " 

1122 "setting to ERROR state", instance=instance) 

1123 self._set_instance_obj_error_state(instance, clean_task_state=True) 

1124 return 

1125 

1126 if (instance.vm_state != vm_states.ERROR and 

1127 instance.task_state in [task_states.IMAGE_SNAPSHOT_PENDING, 

1128 task_states.IMAGE_PENDING_UPLOAD, 

1129 task_states.IMAGE_UPLOADING, 

1130 task_states.IMAGE_SNAPSHOT]): 

1131 LOG.debug("Instance in transitional state %s at start-up " 

1132 "clearing task state", 

1133 instance.task_state, instance=instance) 

1134 instance.task_state = None 

1135 instance.save() 

1136 

1137 if (instance.vm_state != vm_states.ERROR and 

1138 instance.task_state in [task_states.RESIZE_PREP]): 

1139 LOG.debug("Instance in transitional state %s at start-up " 

1140 "clearing task state", 

1141 instance['task_state'], instance=instance) 

1142 instance.task_state = None 

1143 instance.save() 

1144 

1145 if instance.task_state == task_states.DELETING: 

1146 try: 

1147 LOG.info('Service started deleting the instance during ' 

1148 'the previous run, but did not finish. Restarting' 

1149 ' the deletion now.', instance=instance) 

1150 instance.obj_load_attr('metadata') 

1151 instance.obj_load_attr('system_metadata') 

1152 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( 

1153 context, instance.uuid) 

1154 self._delete_instance(context, instance, bdms) 

1155 except Exception: 

1156 # we don't want an exception to block the init_host 

1157 LOG.exception('Failed to complete a deletion', 

1158 instance=instance) 

1159 self._set_instance_obj_error_state(instance) 

1160 return 

1161 

1162 current_power_state = self._get_power_state(instance) 

1163 try_reboot, reboot_type = self._retry_reboot( 

1164 instance, current_power_state) 

1165 

1166 # NOTE(amorin) 

1167 # If the instance is in power_state_SHUTDOWN, we will try_reboot 

1168 if try_reboot: 

1169 LOG.debug("Instance in transitional state (%(task_state)s) at " 

1170 "start-up and power state is (%(power_state)s), " 

1171 "triggering reboot", 

1172 {'task_state': instance.task_state, 

1173 'power_state': current_power_state}, 

1174 instance=instance) 

1175 

1176 # NOTE(mikal): if the instance was doing a soft reboot that got as 

1177 # far as shutting down the instance but not as far as starting it 

1178 # again, then we've just become a hard reboot. That means the 

1179 # task state for the instance needs to change so that we're in one 

1180 # of the expected task states for a hard reboot. 

1181 if (instance.task_state in task_states.soft_reboot_states and 

1182 reboot_type == 'HARD'): 

1183 instance.task_state = task_states.REBOOT_PENDING_HARD 

1184 instance.save() 

1185 

1186 self.reboot_instance(context, instance, block_device_info=None, 

1187 reboot_type=reboot_type) 

1188 return 

1189 

1190 # NOTE(plestang): an instance might be in power_state.RUNNING with a 

1191 # transient state when a host is brutally shut down or rebooted while a 

1192 # reboot/pause/unpause is scheduled on client side 

1193 elif (current_power_state == power_state.RUNNING and 

1194 instance.task_state in [task_states.REBOOT_STARTED, 

1195 task_states.REBOOT_STARTED_HARD, 

1196 task_states.REBOOTING_HARD, 

1197 task_states.REBOOTING, 

1198 task_states.PAUSING, 

1199 task_states.UNPAUSING]): 

1200 LOG.warning("Instance in transitional state " 

1201 "(%(task_state)s) at start-up and power state " 

1202 "is (%(power_state)s), clearing task state", 

1203 {'task_state': instance.task_state, 

1204 'power_state': current_power_state}, 

1205 instance=instance) 

1206 instance.task_state = None 

1207 instance.vm_state = vm_states.ACTIVE 

1208 instance.save() 

1209 elif (current_power_state == power_state.PAUSED and 

1210 instance.task_state == task_states.UNPAUSING): 

1211 LOG.warning("Instance in transitional state " 

1212 "(%(task_state)s) at start-up and power state " 

1213 "is (%(power_state)s), clearing task state " 

1214 "and unpausing the instance", 

1215 {'task_state': instance.task_state, 

1216 'power_state': current_power_state}, 

1217 instance=instance) 

1218 try: 

1219 self.unpause_instance(context, instance) 

1220 except NotImplementedError: 

1221 # Some virt drivers don't support pause and unpause 

1222 pass 

1223 except Exception: 

1224 LOG.exception('Failed to unpause instance', instance=instance) 

1225 return 

1226 

1227 if instance.task_state == task_states.POWERING_OFF: 

1228 try: 

1229 LOG.debug("Instance in transitional state %s at start-up " 

1230 "retrying stop request", 

1231 instance.task_state, instance=instance) 

1232 self.stop_instance(context, instance, True) 

1233 except Exception: 

1234 # we don't want an exception to block the init_host 

1235 LOG.exception('Failed to stop instance', instance=instance) 

1236 return 

1237 

1238 if instance.task_state == task_states.POWERING_ON: 

1239 try: 

1240 LOG.debug("Instance in transitional state %s at start-up " 

1241 "retrying start request", 

1242 instance.task_state, instance=instance) 

1243 self.start_instance(context, instance) 

1244 except Exception: 

1245 # we don't want an exception to block the init_host 

1246 LOG.exception('Failed to start instance', instance=instance) 

1247 return 

1248 

1249 net_info = instance.get_network_info() 

1250 try: 

1251 self.driver.plug_vifs(instance, net_info) 

1252 except NotImplementedError as e: 

1253 LOG.debug(e, instance=instance) 

1254 except exception.VirtualInterfacePlugException: 

1255 # NOTE(mriedem): If we get here, it could be because the vif_type 

1256 # in the cache is "binding_failed" or "unbound". 

1257 # The periodic task _heal_instance_info_cache checks for this 

1258 # condition. It should fix this by binding the ports again when 

1259 # it gets to this instance. 

1260 LOG.exception('Virtual interface plugging failed for instance. ' 

1261 'The port binding:host_id may need to be manually ' 

1262 'updated.', instance=instance) 

1263 self._set_instance_obj_error_state(instance) 

1264 return 

1265 except exception.PciDeviceNotFoundById: 

1266 # This is bug 1981813 where the bound port vnic_type has changed 

1267 # from direct to macvtap. Nova does not support that and it 

1268 # already printed an ERROR when the change is detected during 

1269 # _heal_instance_info_cache. Now we print an ERROR again and skip 

1270 # plugging the vifs but let the service startup continue to init 

1271 # the other instances 

1272 LOG.exception( 

1273 'Virtual interface plugging failed for instance. Probably the ' 

1274 'vnic_type of the bound port has been changed. Nova does not ' 

1275 'support such change.', 

1276 instance=instance 

1277 ) 

1278 return 

1279 

1280 if instance.task_state == task_states.RESIZE_MIGRATING: 

1281 # We crashed during resize/migration, so roll back for safety 

1282 try: 

1283 # NOTE(mriedem): check old_vm_state for STOPPED here, if it's 

1284 # not in system_metadata we default to True for backwards 

1285 # compatibility 

1286 power_on = (instance.system_metadata.get('old_vm_state') != 

1287 vm_states.STOPPED) 

1288 

1289 block_dev_info = self._get_instance_block_device_info(context, 

1290 instance) 

1291 

1292 migration = objects.Migration.get_by_id_and_instance( 

1293 context, instance.migration_context.migration_id, 

1294 instance.uuid) 

1295 self.driver.finish_revert_migration(context, instance, 

1296 net_info, migration, block_dev_info, power_on) 

1297 

1298 except Exception: 

1299 LOG.exception('Failed to revert crashed migration', 

1300 instance=instance) 

1301 finally: 

1302 LOG.info('Instance found in migrating state during ' 

1303 'startup. Resetting task_state', 

1304 instance=instance) 

1305 instance.task_state = None 

1306 instance.save() 

1307 if instance.task_state == task_states.MIGRATING: 

1308 # Live migration did not complete, but instance is on this 

1309 # host. Abort ongoing migration if still running and reset state. 

1310 self._reset_live_migration(context, instance) 

1311 

1312 db_state = instance.power_state 

1313 drv_state = self._get_power_state(instance) 

1314 expect_running = (db_state == power_state.RUNNING and 

1315 drv_state != db_state) 

1316 

1317 LOG.debug('Current state is %(drv_state)s, state in DB is ' 

1318 '%(db_state)s.', 

1319 {'drv_state': drv_state, 'db_state': db_state}, 

1320 instance=instance) 

1321 

1322 if expect_running and CONF.resume_guests_state_on_host_boot: 

1323 self._resume_guests_state(context, instance, net_info) 

1324 

1325 def _resume_guests_state(self, context, instance, net_info): 

1326 LOG.info('Rebooting instance after nova-compute restart.', 

1327 instance=instance) 

1328 block_device_info = \ 

1329 self._get_instance_block_device_info(context, instance) 

1330 

1331 share_info = self._get_share_info(context, instance) 

1332 self._mount_all_shares(context, instance, share_info) 

1333 

1334 try: 

1335 self.driver.resume_state_on_host_boot( 

1336 context, instance, net_info, share_info, block_device_info) 

1337 except NotImplementedError: 

1338 LOG.warning('Hypervisor driver does not support ' 

1339 'resume guests', instance=instance) 

1340 except Exception: 

1341 # NOTE(vish): The instance failed to resume, so we set the 

1342 # instance to error and attempt to continue. 

1343 LOG.warning('Failed to resume instance', 

1344 instance=instance) 

1345 self._set_instance_obj_error_state(instance) 

1346 

1347 def _retry_reboot(self, instance, current_power_state): 

1348 current_task_state = instance.task_state 

1349 retry_reboot = False 

1350 reboot_type = compute_utils.get_reboot_type(current_task_state, 

1351 current_power_state) 

1352 

1353 pending_soft = ( 

1354 current_task_state == task_states.REBOOT_PENDING and 

1355 instance.vm_state in vm_states.ALLOW_SOFT_REBOOT) 

1356 pending_hard = ( 

1357 current_task_state == task_states.REBOOT_PENDING_HARD and 

1358 instance.vm_state in vm_states.ALLOW_HARD_REBOOT) 

1359 started_not_running = (current_task_state in 

1360 [task_states.REBOOTING, 

1361 task_states.REBOOTING_HARD, 

1362 task_states.REBOOT_STARTED, 

1363 task_states.REBOOT_STARTED_HARD] and 

1364 current_power_state != power_state.RUNNING) 

1365 

1366 if pending_soft or pending_hard or started_not_running: 

1367 retry_reboot = True 

1368 

1369 return retry_reboot, reboot_type 

1370 

1371 def handle_lifecycle_event(self, event): 

1372 LOG.info("VM %(state)s (Lifecycle Event)", 

1373 {'state': event.get_name()}, 

1374 instance_uuid=event.get_instance_uuid()) 

1375 context = nova.context.get_admin_context(read_deleted='yes') 

1376 vm_power_state = None 

1377 event_transition = event.get_transition() 

1378 if event_transition == virtevent.EVENT_LIFECYCLE_STOPPED: 

1379 vm_power_state = power_state.SHUTDOWN 

1380 elif event_transition == virtevent.EVENT_LIFECYCLE_STARTED: 

1381 vm_power_state = power_state.RUNNING 

1382 elif event_transition in ( 

1383 virtevent.EVENT_LIFECYCLE_PAUSED, 

1384 virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED, 

1385 virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED): 

1386 vm_power_state = power_state.PAUSED 

1387 elif event_transition == virtevent.EVENT_LIFECYCLE_RESUMED: 

1388 vm_power_state = power_state.RUNNING 

1389 elif event_transition == virtevent.EVENT_LIFECYCLE_SUSPENDED: 

1390 vm_power_state = power_state.SUSPENDED 

1391 else: 

1392 LOG.warning("Unexpected lifecycle event: %d", event_transition) 

1393 

1394 migrate_finish_statuses = { 

1395 # This happens on the source node and indicates live migration 

1396 # entered post-copy mode. 

1397 virtevent.EVENT_LIFECYCLE_POSTCOPY_STARTED: 'running (post-copy)', 

1398 # Suspended for offline migration. 

1399 virtevent.EVENT_LIFECYCLE_MIGRATION_COMPLETED: 'running' 

1400 } 

1401 

1402 expected_attrs = [] 

1403 if event_transition in migrate_finish_statuses: 

1404 # Join on info_cache since that's needed in migrate_instance_start. 

1405 expected_attrs.append('info_cache') 

1406 instance = objects.Instance.get_by_uuid(context, 

1407 event.get_instance_uuid(), 

1408 expected_attrs=expected_attrs) 

1409 

1410 # Note(lpetrut): The event may be delayed, thus not reflecting 

1411 # the current instance power state. In that case, ignore the event. 

1412 current_power_state = self._get_power_state(instance) 

1413 if current_power_state == vm_power_state: 

1414 LOG.debug('Synchronizing instance power state after lifecycle ' 

1415 'event "%(event)s"; current vm_state: %(vm_state)s, ' 

1416 'current task_state: %(task_state)s, current DB ' 

1417 'power_state: %(db_power_state)s, VM power_state: ' 

1418 '%(vm_power_state)s', 

1419 {'event': event.get_name(), 

1420 'vm_state': instance.vm_state, 

1421 'task_state': instance.task_state, 

1422 'db_power_state': instance.power_state, 

1423 'vm_power_state': vm_power_state}, 

1424 instance_uuid=instance.uuid) 

1425 self._sync_instance_power_state(context, 

1426 instance, 

1427 vm_power_state) 

1428 

1429 # The following checks are for live migration. We want to activate 

1430 # the port binding for the destination host before the live migration 

1431 # is resumed on the destination host in order to reduce network 

1432 # downtime. Otherwise the ports are bound to the destination host 

1433 # in post_live_migration_at_destination. 

1434 # TODO(danms): Explore options for using a different live migration 

1435 # specific callback for this instead of piggy-backing on the 

1436 # handle_lifecycle_event callback. 

1437 if (instance.task_state == task_states.MIGRATING and 

1438 event_transition in migrate_finish_statuses): 

1439 status = migrate_finish_statuses[event_transition] 

1440 try: 

1441 migration = objects.Migration.get_by_instance_and_status( 

1442 context, instance.uuid, status) 

1443 LOG.debug('Binding ports to destination host: %s', 

1444 migration.dest_compute, instance=instance) 

1445 # For neutron, migrate_instance_start will activate the 

1446 # destination host port bindings, if there are any created by 

1447 # conductor before live migration started. 

1448 self.network_api.migrate_instance_start( 

1449 context, instance, migration) 

1450 except exception.MigrationNotFoundByStatus: 

1451 LOG.warning("Unable to find migration record with status " 

1452 "'%s' for instance. Port binding will happen in " 

1453 "post live migration.", status, instance=instance) 

1454 

1455 def handle_events(self, event): 

1456 if isinstance(event, virtevent.LifecycleEvent): 1456 ↛ 1463: line 1456 didn't jump to line 1463 because the condition on line 1456 was always true

1457 try: 

1458 self.handle_lifecycle_event(event) 

1459 except exception.InstanceNotFound: 

1460 LOG.debug("Event %s arrived for non-existent instance. The " 

1461 "instance was probably deleted.", event) 

1462 else: 

1463 LOG.debug("Ignoring event %s", event) 

1464 

1465 def init_virt_events(self): 

1466 if CONF.workarounds.handle_virt_lifecycle_events: 

1467 self.driver.register_event_listener(self.handle_events) 

1468 else: 

1469 # NOTE(mriedem): If the _sync_power_states periodic task is 

1470 # disabled we should emit a warning in the logs. 

1471 if CONF.sync_power_state_interval < 0: 1471 ↛ 1472: line 1471 didn't jump to line 1472 because the condition on line 1471 was never true

1472 LOG.warning('Instance lifecycle events from the compute ' 

1473 'driver have been disabled. Note that lifecycle ' 

1474 'changes to an instance outside of the compute ' 

1475 'service will not be synchronized ' 

1476 'automatically since the _sync_power_states ' 

1477 'periodic task is also disabled.') 

1478 else: 

1479 LOG.info('Instance lifecycle events from the compute ' 

1480 'driver have been disabled. Note that lifecycle ' 

1481 'changes to an instance outside of the compute ' 

1482 'service will only be synchronized by the ' 

1483 '_sync_power_states periodic task.') 

1484 

1485 def _get_nodes(self, context): 

1486 """Query the ComputeNode objects from the DB that are reported by the 

1487 hypervisor. 

1488 

1489 :param context: the request context 

1490 :return: a dict of ComputeNode objects keyed by the UUID of the given 

1491 node. 

1492 """ 

1493 try: 

1494 node_ids = self.driver.get_nodenames_by_uuid() 

1495 except exception.VirtDriverNotReady: 

1496 LOG.warning( 

1497 "Virt driver is not ready. If this is the first time this " 

1498 "service is starting on this host, then you can ignore " 

1499 "this warning.") 

1500 return {} 

1501 

1502 nodes = objects.ComputeNodeList.get_all_by_uuids(context, 

1503 list(node_ids.keys())) 

1504 if not nodes: 

1505 # NOTE(danms): This should only happen if the compute_id is 

1506 # pre-provisioned on a host that has never started. 

1507 LOG.warning('Compute nodes %s for host %s were not found in the ' 

1508 'database. If this is the first time this service is ' 

1509 'starting on this host, then you can ignore this ' 

1510 'warning.', 

1511 list(node_ids.keys()), self.host) 

1512 return {} 

1513 

1514 for node in nodes: 

1515 if node.hypervisor_hostname != node_ids.get(node.uuid): 

1516 raise exception.InvalidConfiguration( 

1517 ('My compute node %s has hypervisor_hostname %s ' 

1518 'but virt driver reports it should be %s. Possible ' 

1519 'rename detected, refusing to start!') % ( 

1520 node.uuid, node.hypervisor_hostname, 

1521 node_ids.get(node.uuid))) 

1522 

1523 return {n.uuid: n for n in nodes} 

1524 

1525 def _ensure_existing_node_identity(self, service_ref): 

1526 """If we are upgrading from an older service version, we need 

1527 to write our node identity uuid (if not already done) based on 

1528 nodes assigned to us in the database. 

1529 """ 

1530 if 'ironic' in CONF.compute_driver.lower(): 

1531 # We do not persist a single local node identity for 

1532 # ironic 

1533 return 

1534 

1535 if service_ref.version >= service_obj.NODE_IDENTITY_VERSION: 

1536 # Already new enough, nothing to do here, but make sure that we 

1537 # have a UUID file already, as this is not our first time starting. 

1538 if nova.virt.node.read_local_node_uuid() is None: 

1539 raise exception.InvalidConfiguration( 

1540 ('No local node identity found, but this is not our ' 

1541 'first startup on this host. Refusing to start after ' 

1542 'potentially having lost that state!')) 

1543 return 

1544 

1545 if nova.virt.node.read_local_node_uuid(): 

1546 # We already have a local node identity, no migration needed 

1547 return 

1548 

1549 context = nova.context.get_admin_context() 

1550 db_nodes = objects.ComputeNodeList.get_all_by_host(context, self.host) 

1551 if not db_nodes: 

1552 # This means we have no nodes in the database (that we 

1553 # know of) and thus have no need to record an existing 

1554 # UUID. That is unexpected, so refuse to start. 

1555 raise exception.InvalidConfiguration( 

1556 ('Upgrading from service version %i but found no ' 

1557 'nodes in the database for host %s to persist ' 

1558 'locally; Possible rename detected, ' 

1559 'refusing to start!') % ( 

1560 service_ref.version, self.host)) 

1561 

1562 if len(db_nodes) > 1: 

1563 # If this happens we can't do the right thing, so raise an 

1564 # exception to abort host startup 

1565 LOG.warning('Multiple nodes found in the database for host %s; ' 

1566 'unable to persist local node identity automatically', self.host) 

1567 raise exception.InvalidConfiguration( 

1568 'Multiple nodes found in database, manual node uuid ' 

1569 'configuration required') 

1570 

1571 nova.virt.node.write_local_node_uuid(db_nodes[0].uuid) 

1572 

1573 def _check_for_host_rename(self, nodes_by_uuid): 

1574 if 'ironic' in CONF.compute_driver.lower(): 

1575 # Ironic (currently) rebalances nodes at various times, and as 

1576 # such, nodes being discovered as assigned to this host with a 

1577 # different hostname is not surprising. Skip this check for 

1578 # ironic. 

1579 return 

1580 for node in nodes_by_uuid.values(): 

1581 if node.host != self.host: 

1582 raise exception.InvalidConfiguration( 

1583 'My node %s has host %r but my host is %r; ' 

1584 'Possible rename detected, refusing to start!' % ( 

1585 node.uuid, node.host, self.host)) 

1586 LOG.debug('Verified node %s matches my host %s', 

1587 node.uuid, self.host) 

1588 

1589 def _sanity_check_new_host(self): 

1590 instances_on_hv = self.driver.list_instance_uuids() 

1591 if len(instances_on_hv) > 0: 

1592 # This means we have instances on our hypervisor, but we think 

1593 # we are a new host (i.e. we created a new service record). That 

1594 # likely means we're pointed at an empty database or the wrong 

1595 # cell. 

1596 raise exception.InvalidConfiguration( 

1597 'My hypervisor has existing instances, but I appear to be ' 

1598 'a new service in this database. Possible database ' 

1599 'configuration error, refusing to start!') 

1600 

1601 def init_host(self, service_ref): 

1602 """Initialization for a standalone compute service.""" 

1603 

1604 if service_ref: 

1605 # If we are an existing service, check to see if we need 

1606 # to record a locally-persistent node identity because 

1607 # we have upgraded from a previous version. 

1608 self._ensure_existing_node_identity(service_ref) 

1609 else: 

1610 # If we are a new service (in the database), make sure we have no 

1611 # instances on our hypervisor as we would expect. 

1612 self._sanity_check_new_host() 

1613 

1614 if CONF.pci.device_spec: 

1615 # Simply loading the PCI passthrough spec will do a bunch of 

1616 # validation that would otherwise wait until the PciDevTracker is 

1617 # constructed when updating available resources for the compute 

1618 # node(s) in the resource tracker, effectively killing that task. 

1619 # So load up the spec when starting the compute service to 

1620 # flush any invalid configuration early, so we can kill the service 

1621 # if the configuration is wrong. 

1622 whitelist.Whitelist(CONF.pci.device_spec) 

1623 

1624 nova.conf.neutron.register_dynamic_opts(CONF) 

1625 # Even if only libvirt uses them, make it available for all drivers 

1626 nova.conf.devices.register_dynamic_opts(CONF) 

1627 

1628 # Override the number of concurrent disk operations allowed if the 

1629 # user has specified a limit. 

1630 if CONF.compute.max_concurrent_disk_ops != 0: 1630 ↛ 1631: line 1630 didn't jump to line 1631 because the condition on line 1630 was never true

1631 compute_utils.disk_ops_semaphore = \ 

1632 eventlet.semaphore.BoundedSemaphore( 

1633 CONF.compute.max_concurrent_disk_ops) 

1634 

1635 if CONF.compute.max_disk_devices_to_attach == 0: 

1636 msg = _('[compute]max_disk_devices_to_attach has been set to 0, ' 

1637 'which will prevent instances from being able to boot. ' 

1638 'Set -1 for unlimited or set >= 1 to limit the maximum ' 

1639 'number of disk devices.') 

1640 raise exception.InvalidConfiguration(msg) 

1641 

1642 self.driver.init_host(host=self.host) 

1643 

1644 # NOTE(gibi): At this point the compute_nodes of the resource tracker 

1645 # has not been populated yet so we cannot rely on the resource tracker 

1646 # here. 

1647 context = nova.context.get_admin_context() 

1648 nodes_by_uuid = self._get_nodes(context) 

1649 

1650 # NOTE(danms): Check for a possible host rename and abort 

1651 # startup before we start mucking with instances we think are 

1652 # ours. 

1653 self._check_for_host_rename(nodes_by_uuid) 

1654 

1655 instances = objects.InstanceList.get_by_host( 

1656 context, self.host, 

1657 expected_attrs=['info_cache', 'metadata', 'numa_topology']) 

1658 

1659 self.init_virt_events() 

1660 

1661 self._validate_pinning_configuration(instances) 

1662 self._validate_vtpm_configuration(instances) 

1663 

1664 # NOTE(gibi): If the ironic and vcenter virt drivers' slow start time 

1665 # becomes problematic here, then we should consider adding a config 

1666 # option or a driver flag to tell us if we should thread 

1667 # _destroy_evacuated_instances and 

1668 # _error_out_instances_whose_build_was_interrupted out in the 

1669 # background on startup 

1670 try: 

1671 # checking that instance was not already evacuated to other host 

1672 evacuated_instances = self._destroy_evacuated_instances( 

1673 context, nodes_by_uuid) 

1674 

1675 # Initialise instances on the host that are not evacuating 

1676 for instance in instances: 

1677 if instance.uuid not in evacuated_instances: 

1678 self._init_instance(context, instance) 

1679 

1680 # NOTE(gibi): collect all the instance uuids that were in some way 

1681 # already handled above, either by _init_instance or by 

1682 # _destroy_evacuated_instances. This way we can limit the scope of 

1683 # the _error_out_instances_whose_build_was_interrupted call to look 

1684 # only for instances that have allocations on this node and not 

1685 # handled by the above calls. 

1686 already_handled = {instance.uuid for instance in instances}.union( 

1687 evacuated_instances) 

1688 self._error_out_instances_whose_build_was_interrupted( 

1689 context, already_handled, nodes_by_uuid.keys()) 

1690 

1691 finally: 

1692 if instances: 

1693 # We only send the instance info to the scheduler on startup 

1694 # if there is anything to send, otherwise this host might 

1695 # not be mapped yet in a cell and the scheduler may have 

1696 # issues dealing with the information. Later changes to 

1697 # instances on this host will update the scheduler, or the 

1698 # _sync_scheduler_instance_info periodic task will. 

1699 self._update_scheduler_instance_info(context, instances) 

1700 

1701 def _error_out_instances_whose_build_was_interrupted( 

1702 self, context, already_handled_instances, node_uuids): 

1703 """If there are instances in BUILDING state that are not 

1704 assigned to this host but have allocations in placement towards 

1705 this compute, it means the nova-compute service was 

1706 restarted while those instances waited for the resource claim 

1707 to finish and for _set_instance_host_and_node() to update the 

1708 instance.host field. We need to push them to ERROR state here to 

1709 prevent keeping them in BUILDING state forever. 

1710 

1711 :param context: The request context 

1712 :param already_handled_instances: The set of instance UUIDs that the 

1713 host initialization process already handled in some way. 

1714 :param node_uuids: The list of compute node uuids handled by this 

1715 service 

1716 """ 

1717 

1718 # Strategy: 

1719 # 1) Get the allocations from placement for our compute node(s) 

1720 # 2) Remove the already handled instances from the consumer list; 

1721 # they are either already initialized or need to be skipped. 

1722 # 3) Check which remaining consumer is an instance in BUILDING state 

1723 # and push it to ERROR state. 

1724 

1725 LOG.info( 

1726 "Looking for unclaimed instances stuck in BUILDING status for " 

1727 "nodes managed by this host") 

1728 for cn_uuid in node_uuids: 

1729 try: 

1730 f = self.reportclient.get_allocations_for_resource_provider 

1731 allocations = f(context, cn_uuid).allocations 

1732 except (exception.ResourceProviderAllocationRetrievalFailed, 

1733 keystone_exception.ClientException) as e: 

1734 LOG.error( 

1735 "Could not retrieve compute node resource provider %s and " 

1736 "therefore unable to error out any instances stuck in " 

1737 "BUILDING state. Error: %s", cn_uuid, str(e)) 

1738 continue 

1739 

1740 not_handled_consumers = (set(allocations) - 

1741 already_handled_instances) 

1742 

1743 if not not_handled_consumers: 

1744 continue 

1745 

1746 filters = { 

1747 'vm_state': vm_states.BUILDING, 

1748 'uuid': not_handled_consumers 

1749 } 

1750 

1751 instances = objects.InstanceList.get_by_filters( 

1752 context, filters, expected_attrs=[]) 

1753 

1754 for instance in instances: 

1755 LOG.debug( 

1756 "Instance spawn was interrupted before instance_claim, " 

1757 "setting instance to ERROR state", instance=instance) 

1758 self._set_instance_obj_error_state( 

1759 instance, clean_task_state=True) 

1760 
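The three-step strategy in the comment above reduces to plain set arithmetic over the allocation consumers reported by placement. A minimal sketch with hypothetical UUIDs and an assumed BUILDING lookup (not part of manager.py):

allocations = {'inst-a': {}, 'inst-b': {}, 'inst-c': {}}  # consumers on this node
already_handled = {'inst-a'}                              # initialized or evacuated earlier

not_handled = set(allocations) - already_handled          # {'inst-b', 'inst-c'}
# Of those, only instances still in BUILDING would be pushed to ERROR;
# assume a DB query reported inst-b as BUILDING and inst-c as ACTIVE.
building = {'inst-b'}
print(sorted(not_handled & building))                     # ['inst-b']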

1761 def cleanup_host(self): 

1762 self.driver.register_event_listener(None) 

1763 self.instance_events.cancel_all_events() 

1764 self.driver.cleanup_host(host=self.host) 

1765 self._cleanup_live_migrations_in_pool() 

1766 

1767 def _cleanup_live_migrations_in_pool(self): 

1768 # Shutdown the pool so we don't get new requests. 

1769 self._live_migration_executor.shutdown(wait=False) 

1770 # For any queued migrations, cancel the migration and update 

1771 # its status. 

1772 for migration, future in self._waiting_live_migrations.values(): 

1773 # If we got here before the Future was submitted then we need 

1774 # to move on since there isn't anything we can do. 

1775 if future is None: 

1776 continue 

1777 if future.cancel(): 1777 ↛ 1782: line 1777 didn't jump to line 1782 because the condition on line 1777 was always true

1778 self._set_migration_status(migration, 'cancelled') 

1779 LOG.info('Successfully cancelled queued live migration.', 

1780 instance_uuid=migration.instance_uuid) 

1781 else: 

1782 LOG.warning('Unable to cancel live migration.', 

1783 instance_uuid=migration.instance_uuid) 

1784 self._waiting_live_migrations.clear() 

1785 

1786 def pre_start_hook(self, service_ref): 

1787 """After the service is initialized, but before we fully bring 

1788 the service up by listening on RPC queues, make sure to update 

1789 our available resources (and indirectly our available nodes). 

1790 """ 

1791 self.service_ref = service_ref 

1792 self.rt.set_service_ref(service_ref) 

1793 self.update_available_resource(nova.context.get_admin_context(), 

1794 startup=True) 

1795 

1796 def _get_power_state(self, instance): 

1797 """Retrieve the power state for the given instance.""" 

1798 LOG.debug('Checking state', instance=instance) 

1799 try: 

1800 return self.driver.get_info(instance, use_cache=False).state 

1801 except exception.InstanceNotFound: 

1802 return power_state.NOSTATE 

1803 

1804 def _await_block_device_map_created(self, context, vol_id): 

1805 # TODO(yamahata): creating volume simultaneously 

1806 # reduces creation time? 

1807 # TODO(yamahata): eliminate dumb polling 

1808 start = time.time() 

1809 retries = CONF.block_device_allocate_retries 

1810 # (1) if the configured value is 0, one attempt should be made 

1811 # (2) if the configured value is > 0, then the total number attempts 

1812 # is (retries + 1) 

1813 attempts = 1 

1814 if retries >= 1: 

1815 attempts = retries + 1 

1816 for attempt in range(1, attempts + 1): 

1817 volume = self.volume_api.get(context, vol_id) 

1818 volume_status = volume['status'] 

1819 if volume_status not in ['creating', 'downloading']: 

1820 if volume_status == 'available': 

1821 return attempt 

1822 LOG.warning("Volume id: %(vol_id)s finished being " 

1823 "created but its status is %(vol_status)s.", 

1824 {'vol_id': vol_id, 

1825 'vol_status': volume_status}) 

1826 break 

1827 greenthread.sleep(CONF.block_device_allocate_retries_interval) 

1828 raise exception.VolumeNotCreated(volume_id=vol_id, 

1829 seconds=int(time.time() - start), 

1830 attempts=attempt, 

1831 volume_status=volume_status) 

1832 
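The retry-count rule spelled out in the comment above (a configured value of 0 still means one attempt, N > 0 means N + 1 attempts, with a fixed sleep between polls) is easy to get wrong. A minimal standalone sketch, with a hypothetical status callback standing in for the volume API:

import time

def poll_volume(get_status, retries, interval):
    # retries == 0 -> exactly one attempt; retries >= 1 -> retries + 1 attempts.
    attempts = retries + 1 if retries >= 1 else 1
    for attempt in range(1, attempts + 1):
        status = get_status()
        if status not in ('creating', 'downloading'):
            return attempt, status
        time.sleep(interval)
    raise TimeoutError('volume still %s after %d attempts' % (status, attempts))

statuses = iter(['creating', 'creating', 'available'])
print(poll_volume(lambda: next(statuses), retries=3, interval=0))  # (3, 'available')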

1833 def _decode_files(self, injected_files): 

1834 """Base64 decode the list of files to inject.""" 

1835 if not injected_files: 

1836 return [] 

1837 

1838 def _decode(f): 

1839 path, contents = f 

1840 # Py3 raises binascii.Error instead of TypeError as in Py27 

1841 try: 

1842 decoded = base64.b64decode(contents) 

1843 return path, decoded 

1844 except (TypeError, binascii.Error): 

1845 raise exception.Base64Exception(path=path) 

1846 

1847 return [_decode(f) for f in injected_files] 

1848 
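The comment inside _decode above notes that Python 3 raises binascii.Error where Python 2 raised TypeError for malformed base64 input, which is why both are trapped. A tiny self-contained illustration, using a generic error type rather than Nova's Base64Exception:

import base64
import binascii

def decode_injected_file(path, contents):
    try:
        return path, base64.b64decode(contents)
    except (TypeError, binascii.Error):
        raise ValueError('file %s is not valid base64' % path)

print(decode_injected_file('/etc/motd', base64.b64encode(b'hello')))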

1849 def _validate_instance_group_policy(self, context, instance, 

1850 scheduler_hints=None): 

1851 

1852 if CONF.workarounds.disable_group_policy_check_upcall: 

1853 return 

1854 

1855 # NOTE(russellb) Instance group policy is enforced by the scheduler. 

1856 # However, there is a race condition with the enforcement of 

1857 # the policy. Since more than one instance may be scheduled at the 

1858 # same time, it's possible that more than one instance with an 

1859 # anti-affinity policy may end up here. It's also possible that 

1860 # multiple instances with an affinity policy could end up on different 

1861 # hosts. This is a validation step to make sure that starting the 

1862 # instance here doesn't violate the policy. 

1863 if scheduler_hints is not None: 

1864 # only go through here if scheduler_hints is provided, 

1865 # even if it is empty. 

1866 group_hint = scheduler_hints.get('group') 

1867 if not group_hint: 

1868 return 

1869 else: 

1870 # The RequestSpec stores scheduler_hints as key=list pairs 

1871 # so we need to check the type on the value and pull the 

1872 # single entry out. The API request schema validates that 

1873 # the 'group' hint is a single value. 

1874 if isinstance(group_hint, list): 

1875 group_hint = group_hint[0] 

1876 try: 

1877 group = objects.InstanceGroup.get_by_hint( 

1878 context, group_hint 

1879 ) 

1880 except exception.InstanceGroupNotFound: 

1881 return 

1882 else: 

1883 # TODO(ganso): a call to DB can be saved by adding request_spec 

1884 # to rpcapi payload of live_migration, pre_live_migration and 

1885 # check_can_live_migrate_destination 

1886 try: 

1887 group = objects.InstanceGroup.get_by_instance_uuid( 

1888 context, instance.uuid 

1889 ) 

1890 except exception.InstanceGroupNotFound: 

1891 return 

1892 

1893 @utils.synchronized(group['uuid']) 

1894 def _do_validation(context, instance, group): 

1895 if group.policy and 'anti-affinity' == group.policy: 

1896 

1897 # instances on host 

1898 instances_uuids = objects.InstanceList.get_uuids_by_host( 

1899 context, self.host) 

1900 ins_on_host = set(instances_uuids) 

1901 

1902 # instance param is just for logging, the nodename obtained is 

1903 # not actually related to the instance at all 

1904 nodename = self._get_nodename(instance) 

1905 

1906 # instances being migrated to host 

1907 migrations = ( 

1908 objects.MigrationList.get_in_progress_by_host_and_node( 

1909 context, self.host, nodename)) 

1910 migration_vm_uuids = {mig.instance_uuid for mig in migrations} 

1911 

1912 total_instances = migration_vm_uuids | ins_on_host 

1913 

1914 # refresh group to get updated members within locked block 

1915 group = objects.InstanceGroup.get_by_uuid(context, 

1916 group['uuid']) 

1917 members = set(group.members) 

1918 # Determine the set of instance group members on this host 

1919 # which are not the instance in question. This is used to 

1920 # determine how many other members from the same anti-affinity 

1921 # group can be on this host. 

1922 members_on_host = (total_instances & members - 

1923 set([instance.uuid])) 

1924 rules = group.rules 

1925 if rules and 'max_server_per_host' in rules: 1925 ↛ 1928: line 1925 didn't jump to line 1928 because the condition on line 1925 was always true

1926 max_server = rules['max_server_per_host'] 

1927 else: 

1928 max_server = 1 

1929 if len(members_on_host) >= max_server: 

1930 raise exception.GroupAffinityViolation( 

1931 instance_uuid=instance.uuid, policy='Anti-affinity') 

1932 

1933 # NOTE(ganso): The check for affinity below does not work and it 

1934 # can easily be violated because the lock is taken on different 

1935 # compute hosts. 

1936 # The only fix seems to be a DB lock to perform the check whenever 

1937 # setting the host field to an instance. 

1938 elif group.policy and 'affinity' == group.policy: 1938 ↛ 1939: line 1938 didn't jump to line 1939 because the condition on line 1938 was never true

1939 group_hosts = group.get_hosts(exclude=[instance.uuid]) 

1940 if group_hosts and self.host not in group_hosts: 

1941 raise exception.GroupAffinityViolation( 

1942 instance_uuid=instance.uuid, policy='Affinity') 

1943 

1944 _do_validation(context, instance, group) 

1945 
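The anti-affinity branch above is essentially set arithmetic: union the instances already on the host with those migrating in, intersect with the group's members, drop the instance being validated, and compare what remains with max_server_per_host. A worked sketch with hypothetical UUIDs:

on_host = {'uuid-1', 'uuid-2'}
migrating_in = {'uuid-3'}
group_members = {'uuid-2', 'uuid-3', 'uuid-4'}
this_instance = 'uuid-4'
max_server_per_host = 1

members_on_host = ((on_host | migrating_in) & group_members) - {this_instance}
# {'uuid-2', 'uuid-3'}: two group members already here, so placing uuid-4
# with max_server_per_host=1 would violate the anti-affinity policy.
print(sorted(members_on_host), len(members_on_host) >= max_server_per_host)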

1946 def _log_original_error(self, exc_info, instance_uuid): 

1947 LOG.error('Error: %s', exc_info[1], instance_uuid=instance_uuid, 

1948 exc_info=exc_info) 

1949 

1950 @periodic_task.periodic_task 

1951 def _check_instance_build_time(self, context): 

1952 """Ensure that instances are not stuck in build.""" 

1953 timeout = CONF.instance_build_timeout 

1954 if timeout == 0: 

1955 return 

1956 

1957 filters = {'vm_state': vm_states.BUILDING, 

1958 'host': self.host} 

1959 

1960 building_insts = objects.InstanceList.get_by_filters(context, 

1961 filters, expected_attrs=[], use_slave=True) 

1962 

1963 for instance in building_insts: 

1964 if timeutils.is_older_than(instance.created_at, timeout): 

1965 self._set_instance_obj_error_state(instance) 

1966 LOG.warning("Instance build timed out. Set to error " 

1967 "state.", instance=instance) 

1968 

1969 def _check_instance_exists(self, instance): 

1970 """Ensure an instance with the same name is not already present.""" 

1971 if self.driver.instance_exists(instance): 

1972 raise exception.InstanceExists(name=instance.name) 

1973 

1974 def _allocate_network_async(self, context, instance, requested_networks, 

1975 security_groups, resource_provider_mapping, 

1976 network_arqs): 

1977 """Method used to allocate networks in the background. 

1978 

1979 Broken out for testing. 

1980 """ 

1981 # First check to see if we're specifically not supposed to allocate 

1982 # networks because if so, we can exit early. 

1983 if requested_networks and requested_networks.no_allocate: 

1984 LOG.debug("Not allocating networking since 'none' was specified.", 

1985 instance=instance) 

1986 return network_model.NetworkInfo([]) 

1987 

1988 LOG.debug("Allocating IP information in the background.", 

1989 instance=instance) 

1990 retries = CONF.network_allocate_retries 

1991 attempts = retries + 1 

1992 retry_time = 1 

1993 bind_host_id = self.driver.network_binding_host_id(context, instance) 

1994 for attempt in range(1, attempts + 1): 1994 ↛ exit: line 1994 didn't return from function '_allocate_network_async' because the loop on line 1994 didn't complete

1995 try: 

1996 nwinfo = self.network_api.allocate_for_instance( 

1997 context, instance, 

1998 requested_networks=requested_networks, 

1999 security_groups=security_groups, 

2000 bind_host_id=bind_host_id, 

2001 resource_provider_mapping=resource_provider_mapping, 

2002 network_arqs=network_arqs) 

2003 LOG.debug('Instance network_info: |%s|', nwinfo, 

2004 instance=instance) 

2005 instance.system_metadata['network_allocated'] = 'True' 

2006 # NOTE(JoshNang) do not save the instance here, as it can cause 

2007 # races. The caller shares a reference to instance and waits 

2008 # for this async greenthread to finish before calling 

2009 # instance.save(). 

2010 return nwinfo 

2011 except Exception as e: 

2012 log_info = {'attempt': attempt, 

2013 'attempts': attempts} 

2014 if attempt == attempts: 

2015 LOG.exception('Instance failed network setup ' 

2016 'after %(attempts)d attempt(s)', 

2017 log_info) 

2018 raise e 

2019 LOG.warning('Instance failed network setup ' 

2020 '(attempt %(attempt)d of %(attempts)d)', 

2021 log_info, instance=instance) 

2022 time.sleep(retry_time) 

2023 retry_time *= 2 

2024 if retry_time > 30: 

2025 retry_time = 30 

2026 # Not reached. 

2027 
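The retry loop above sleeps between attempts with a doubling delay that is capped at 30 seconds. A minimal sketch of that backoff schedule (hypothetical helper, not the Nova network API):

def backoff_schedule(failures, start=1, cap=30):
    """Yield the sleep taken after each failed attempt: 1, 2, 4, ... capped."""
    delay = start
    for _ in range(failures):
        yield delay
        delay = min(delay * 2, cap)

print(list(backoff_schedule(7)))  # [1, 2, 4, 8, 16, 30, 30]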

2028 def _build_networks_for_instance(self, context, instance, 

2029 requested_networks, security_groups, resource_provider_mapping, 

2030 network_arqs): 

2031 

2032 # If we're here from a reschedule the network may already be allocated. 

2033 if strutils.bool_from_string( 

2034 instance.system_metadata.get('network_allocated', 'False')): 

2035 # NOTE(alex_xu): network_allocated being True means the network 

2036 # resources were already allocated during a previous scheduling 

2037 # attempt and then cleaned up afterwards. After rescheduling, the 

2038 # network resources need to be set up on the new host. 

2039 self.network_api.setup_instance_network_on_host( 

2040 context, instance, instance.host) 

2041 return self.network_api.get_instance_nw_info(context, instance) 

2042 

2043 network_info = self._allocate_network(context, instance, 

2044 requested_networks, security_groups, 

2045 resource_provider_mapping, 

2046 network_arqs) 

2047 

2048 return network_info 

2049 

2050 def _allocate_network(self, context, instance, requested_networks, 

2051 security_groups, resource_provider_mapping, 

2052 network_arqs): 

2053 """Start network allocation asynchronously. Return an instance 

2054 of NetworkInfoAsyncWrapper that can be used to retrieve the 

2055 allocated networks when the operation has finished. 

2056 """ 

2057 # NOTE(comstud): Since we're allocating networks asynchronously, 

2058 # this task state has little meaning, as we won't be in this 

2059 # state for very long. 

2060 instance.vm_state = vm_states.BUILDING 

2061 instance.task_state = task_states.NETWORKING 

2062 instance.save(expected_task_state=[None]) 

2063 

2064 return network_model.NetworkInfoAsyncWrapper( 

2065 self._allocate_network_async, context, instance, 

2066 requested_networks, security_groups, resource_provider_mapping, 

2067 network_arqs) 

2068 

2069 def _default_root_device_name(self, instance, image_meta, root_bdm): 

2070 """Gets a default root device name from the driver. 

2071 

2072 :param nova.objects.Instance instance: 

2073 The instance for which to get the root device name. 

2074 :param nova.objects.ImageMeta image_meta: 

2075 The metadata of the image of the instance. 

2076 :param nova.objects.BlockDeviceMapping root_bdm: 

2077 The description of the root device. 

2078 :returns: str -- The default root device name. 

2079 :raises: InternalError, TooManyDiskDevices 

2080 """ 

2081 try: 

2082 return self.driver.default_root_device_name(instance, 

2083 image_meta, 

2084 root_bdm) 

2085 except NotImplementedError: 

2086 return compute_utils.get_next_device_name(instance, []) 

2087 

2088 def _default_device_names_for_instance(self, instance, 

2089 root_device_name, 

2090 *block_device_lists): 

2091 """Default the missing device names in the BDM from the driver. 

2092 

2093 :param nova.objects.Instance instance: 

2094 The instance for which to get default device names. 

2095 :param str root_device_name: The root device name. 

2096 :param list block_device_lists: List of block device mappings. 

2097 :returns: None 

2098 :raises: InternalError, TooManyDiskDevices 

2099 """ 

2100 try: 

2101 self.driver.default_device_names_for_instance(instance, 

2102 root_device_name, 

2103 *block_device_lists) 

2104 except NotImplementedError: 

2105 compute_utils.default_device_names_for_instance( 

2106 instance, root_device_name, *block_device_lists) 

2107 

2108 def _get_device_name_for_instance(self, instance, bdms, block_device_obj): 

2109 """Get the next device name from the driver, based on the BDM. 

2110 

2111 :param nova.objects.Instance instance: 

2112 The instance whose volume is requesting a device name. 

2113 :param nova.objects.BlockDeviceMappingList bdms: 

2114 The block device mappings for the instance. 

2115 :param nova.objects.BlockDeviceMapping block_device_obj: 

2116 A block device mapping containing info about the requested block 

2117 device. 

2118 :returns: The next device name. 

2119 :raises: InternalError, TooManyDiskDevices 

2120 """ 

2121 # NOTE(ndipanov): Copy obj to avoid changing the original 

2122 block_device_obj = block_device_obj.obj_clone() 

2123 try: 

2124 return self.driver.get_device_name_for_instance( 

2125 instance, bdms, block_device_obj) 

2126 except NotImplementedError: 

2127 return compute_utils.get_device_name_for_instance( 

2128 instance, bdms, block_device_obj.get("device_name")) 

2129 

2130 def _default_block_device_names(self, instance, image_meta, block_devices): 

2131 """Verify that all the devices have the device_name set. If not, 

2132 provide a default name. 

2133 

2134 It also ensures that there is a root_device_name and is set to the 

2135 first block device in the boot sequence (boot_index=0). 

2136 """ 

2137 root_bdm = block_device.get_root_bdm(block_devices) 

2138 if not root_bdm: 

2139 return 

2140 

2141 # Get the root_device_name from the root BDM or the instance 

2142 root_device_name = None 

2143 update_root_bdm = False 

2144 

2145 if root_bdm.device_name: 

2146 root_device_name = root_bdm.device_name 

2147 instance.root_device_name = root_device_name 

2148 elif instance.root_device_name: 

2149 root_device_name = instance.root_device_name 

2150 root_bdm.device_name = root_device_name 

2151 update_root_bdm = True 

2152 else: 

2153 root_device_name = self._default_root_device_name(instance, 

2154 image_meta, 

2155 root_bdm) 

2156 

2157 instance.root_device_name = root_device_name 

2158 root_bdm.device_name = root_device_name 

2159 update_root_bdm = True 

2160 

2161 if update_root_bdm: 

2162 root_bdm.save() 

2163 

2164 ephemerals = [] 

2165 swap = [] 

2166 block_device_mapping = [] 

2167 image = [] 

2168 

2169 for device in block_devices: 

2170 if block_device.new_format_is_ephemeral(device): 

2171 ephemerals.append(device) 

2172 

2173 if block_device.new_format_is_swap(device): 

2174 swap.append(device) 

2175 

2176 if driver_block_device.is_block_device_mapping(device): 

2177 block_device_mapping.append(device) 

2178 

2179 if driver_block_device.is_local_image(device): 2179 ↛ 2180: line 2179 didn't jump to line 2180 because the condition on line 2179 was never true

2180 image.append(device) 

2181 

2182 self._default_device_names_for_instance(instance, 

2183 root_device_name, 

2184 image, 

2185 ephemerals, 

2186 swap, 

2187 block_device_mapping) 

2188 

2189 def _add_missing_dev_names(self, bdms, instance): 

2190 for bdm in bdms: 

2191 if bdm.device_name is not None: 

2192 continue 

2193 

2194 device_name = self._get_device_name_for_instance(instance, 

2195 bdms, bdm) 

2196 values = {'device_name': device_name} 

2197 bdm.update(values) 

2198 bdm.save() 

2199 

2200 def _prep_block_device(self, context, instance, bdms): 

2201 """Set up the block device for an instance with error logging.""" 

2202 try: 

2203 self._add_missing_dev_names(bdms, instance) 

2204 block_device_info = driver.get_block_device_info(instance, bdms) 

2205 mapping = driver.block_device_info_get_mapping(block_device_info) 

2206 driver_block_device.attach_block_devices( 

2207 mapping, context, instance, self.volume_api, self.driver, 

2208 wait_func=self._await_block_device_map_created) 

2209 

2210 return block_device_info 

2211 

2212 except exception.OverQuota as e: 

2213 LOG.warning('Failed to create block device for instance due' 

2214 ' to exceeding volume related resource quota.' 

2215 ' Error: %s', e.message, instance=instance) 

2216 raise 

2217 

2218 except Exception as ex: 

2219 LOG.exception('Instance failed block device setup', 

2220 instance=instance) 

2221 # InvalidBDM will eventually result in a BuildAbortException when 

2222 # booting from volume, and will be recorded as an instance fault. 

2223 # Maintain the original exception message which most likely has 

2224 # useful details which the standard InvalidBDM error message lacks. 

2225 raise exception.InvalidBDM(str(ex)) 

2226 

2227 def _update_instance_after_spawn(self, instance, 

2228 vm_state=vm_states.ACTIVE): 

2229 instance.power_state = self._get_power_state(instance) 

2230 instance.vm_state = vm_state 

2231 instance.task_state = None 

2232 # NOTE(sean-k-mooney): configdrive.update_instance checks 

2233 # instance.launched_at to determine if it is the first or 

2234 # subsequent spawn of an instance. We need to call update_instance 

2235 # first before setting instance.launched_at or instance.config_drive 

2236 # will never be set to true based on the value of force_config_drive. 

2237 # As a result the config drive will be lost on a hard reboot of the 

2238 # instance even when force_config_drive=true. see bug #1835822. 

2239 configdrive.update_instance(instance) 

2240 instance.launched_at = timeutils.utcnow() 

2241 

2242 def _update_scheduler_instance_info(self, context, instance): 

2243 """Sends an InstanceList with created or updated Instance objects to 

2244 the Scheduler client. 

2245 

2246 In the case of init_host, the value passed will already be an 

2247 InstanceList. Other calls will send individual Instance objects that 

2248 have been created or resized. In this case, we create an InstanceList 

2249 object containing that Instance. 

2250 """ 

2251 if not self.send_instance_updates: 

2252 return 

2253 if isinstance(instance, obj_instance.Instance): 

2254 instance = objects.InstanceList(objects=[instance]) 

2255 context = context.elevated() 

2256 self.query_client.update_instance_info(context, self.host, 

2257 instance) 

2258 

2259 def _delete_scheduler_instance_info(self, context, instance_uuid): 

2260 """Sends the uuid of the deleted Instance to the Scheduler client.""" 

2261 if not self.send_instance_updates: 

2262 return 

2263 context = context.elevated() 

2264 self.query_client.delete_instance_info(context, self.host, 

2265 instance_uuid) 

2266 

2267 @periodic_task.periodic_task(spacing=CONF.scheduler_instance_sync_interval) 

2268 def _sync_scheduler_instance_info(self, context): 

2269 if not self.send_instance_updates: 

2270 return 

2271 context = context.elevated() 

2272 instances = objects.InstanceList.get_by_host(context, self.host, 

2273 expected_attrs=[], 

2274 use_slave=True) 

2275 uuids = [instance.uuid for instance in instances] 

2276 self.query_client.sync_instance_info(context, self.host, uuids) 

2277 

2278 def _notify_about_instance_usage(self, context, instance, event_suffix, 

2279 network_info=None, extra_usage_info=None, 

2280 fault=None, best_effort=False): 

2281 compute_utils.notify_about_instance_usage( 

2282 self.notifier, context, instance, event_suffix, 

2283 network_info=network_info, 

2284 extra_usage_info=extra_usage_info, fault=fault, 

2285 best_effort=best_effort) 

2286 

2287 def _deallocate_network(self, context, instance, 

2288 requested_networks=None): 

2289 # If we were told not to allocate networks let's save ourselves 

2290 # the trouble of calling the network API. 

2291 if requested_networks and requested_networks.no_allocate: 

2292 LOG.debug("Skipping network deallocation for instance since " 

2293 "networking was not requested.", instance=instance) 

2294 return 

2295 

2296 LOG.debug('Deallocating network for instance', instance=instance) 

2297 with timeutils.StopWatch() as timer: 

2298 self.network_api.deallocate_for_instance( 

2299 context, instance, requested_networks=requested_networks) 

2300 # nova-network does an rpc call so we're OK tracking time spent here 

2301 LOG.info('Took %0.2f seconds to deallocate network for instance.', 

2302 timer.elapsed(), instance=instance) 

2303 

2304 def _get_instance_block_device_info(self, context, instance, 

2305 refresh_conn_info=False, 

2306 bdms=None): 

2307 """Transform block devices to the driver block_device format.""" 

2308 

2309 if bdms is None: 

2310 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( 

2311 context, instance.uuid) 

2312 block_device_info = driver.get_block_device_info(instance, bdms) 

2313 

2314 if not refresh_conn_info: 

2315 # if the block_device_mapping has no value in connection_info 

2316 # (returned as None), don't include in the mapping 

2317 block_device_info['block_device_mapping'] = [ 

2318 bdm for bdm in driver.block_device_info_get_mapping( 

2319 block_device_info) 

2320 if bdm.get('connection_info')] 

2321 else: 

2322 driver_block_device.refresh_conn_infos( 

2323 driver.block_device_info_get_mapping(block_device_info), 

2324 context, instance, self.volume_api, self.driver) 

2325 

2326 return block_device_info 

2327 

2328 def _build_failed(self, node): 

2329 if CONF.compute.consecutive_build_service_disable_threshold: 

2330 # NOTE(danms): Update our counter, but wait for the next 

2331 # update_available_resource() periodic to flush it to the DB 

2332 self.rt.build_failed(node) 

2333 

2334 def _build_succeeded(self, node): 

2335 self.rt.build_succeeded(node) 

2336 

2337 @wrap_exception() 

2338 @reverts_task_state 

2339 @wrap_instance_fault 

2340 def build_and_run_instance(self, context, instance, image, request_spec, 

2341 filter_properties, accel_uuids, admin_password=None, 

2342 injected_files=None, requested_networks=None, 

2343 security_groups=None, block_device_mapping=None, 

2344 node=None, limits=None, host_list=None): 

2345 

2346 @utils.synchronized(instance.uuid) 

2347 def _locked_do_build_and_run_instance(*args, **kwargs): 

2348 # NOTE(danms): We grab the semaphore with the instance uuid 

2349 # locked because we could wait in line to build this instance 

2350 # for a while and we want to make sure that nothing else tries 

2351 # to do anything with this instance while we wait. 

2352 with self._build_semaphore: 

2353 try: 

2354 result = self._do_build_and_run_instance(*args, **kwargs) 

2355 except Exception: 

2356 # NOTE(mriedem): This should really only happen if 

2357 # _decode_files in _do_build_and_run_instance fails, and 

2358 # that's before a guest is spawned so it's OK to remove 

2359 # allocations for the instance for this node from Placement 

2360 # below as there is no guest consuming resources anyway. 

2361 # The _decode_files case could be handled more specifically 

2362 # but that's left for another day. 

2363 result = build_results.FAILED 

2364 raise 

2365 finally: 

2366 if result == build_results.FAILED: 

2367 # Remove the allocation records from Placement for the 

2368 # instance if the build failed. The instance.host is 

2369 # likely set to None in _do_build_and_run_instance 

2370 # which means if the user deletes the instance, it 

2371 # will be deleted in the API, not the compute service. 

2372 # Setting the instance.host to None in 

2373 # _do_build_and_run_instance means that the 

2374 # ResourceTracker will no longer consider this instance 

2375 # to be claiming resources against it, so we want to 

2376 # reflect that same thing in Placement. No need to 

2377 # call this for a reschedule, as the allocations will 

2378 # have already been removed in 

2379 # self._do_build_and_run_instance(). 

2380 self.reportclient.delete_allocation_for_instance( 

2381 context, instance.uuid, force=True) 

2382 

2383 if result in (build_results.FAILED_BY_POLICY, 

2384 build_results.RESCHEDULED_BY_POLICY): 

2385 return 

2386 if result in (build_results.FAILED, 

2387 build_results.RESCHEDULED): 

2388 self._build_failed(node) 

2389 else: 

2390 self._build_succeeded(node) 

2391 

2392 # NOTE(danms): We spawn here to return the RPC worker thread back to 

2393 # the pool. Since what follows could take a really long time, we don't 

2394 # want to tie up RPC workers. 

2395 utils.spawn_n(_locked_do_build_and_run_instance, 

2396 context, instance, image, request_spec, 

2397 filter_properties, admin_password, injected_files, 

2398 requested_networks, security_groups, 

2399 block_device_mapping, node, limits, host_list, 

2400 accel_uuids) 

2401 

2402 def _check_device_tagging(self, requested_networks, block_device_mapping): 

2403 tagging_requested = False 

2404 if requested_networks: 

2405 for net in requested_networks: 

2406 if 'tag' in net and net.tag is not None: 

2407 tagging_requested = True 

2408 break 

2409 if block_device_mapping and not tagging_requested: 

2410 for bdm in block_device_mapping: 

2411 if 'tag' in bdm and bdm.tag is not None: 

2412 tagging_requested = True 

2413 break 

2414 if (tagging_requested and 

2415 not self.driver.capabilities.get('supports_device_tagging', 

2416 False)): 

2417 raise exception.BuildAbortException('Attempt to boot guest with ' 

2418 'tagged devices on host that ' 

2419 'does not support tagging.') 

2420 

2421 def _check_trusted_certs(self, instance): 

2422 if (instance.trusted_certs and 

2423 not self.driver.capabilities.get('supports_trusted_certs', 

2424 False)): 

2425 raise exception.BuildAbortException( 

2426 'Trusted image certificates provided on host that does not ' 

2427 'support certificate validation.') 

2428 

2429 @wrap_exception() 

2430 @reverts_task_state 

2431 @wrap_instance_event(prefix='compute') 

2432 @wrap_instance_fault 

2433 def _do_build_and_run_instance(self, context, instance, image, 

2434 request_spec, filter_properties, admin_password, injected_files, 

2435 requested_networks, security_groups, block_device_mapping, 

2436 node=None, limits=None, host_list=None, accel_uuids=None): 

2437 

2438 try: 

2439 LOG.debug('Starting instance...', instance=instance) 

2440 instance.vm_state = vm_states.BUILDING 

2441 instance.task_state = None 

2442 instance.save(expected_task_state= 

2443 (task_states.SCHEDULING, None)) 

2444 except exception.InstanceNotFound: 

2445 msg = 'Instance disappeared before build.' 

2446 LOG.debug(msg, instance=instance) 

2447 return build_results.FAILED 

2448 except exception.UnexpectedTaskStateError as e: 

2449 LOG.debug(e.format_message(), instance=instance) 

2450 return build_results.FAILED 

2451 

2452 # b64 decode the files to inject: 

2453 decoded_files = self._decode_files(injected_files) 

2454 

2455 if limits is None: 

2456 limits = {} 

2457 

2458 if node is None: 

2459 node = self._get_nodename(instance, refresh=True) 

2460 

2461 try: 

2462 with timeutils.StopWatch() as timer: 

2463 self._build_and_run_instance(context, instance, image, 

2464 decoded_files, admin_password, requested_networks, 

2465 security_groups, block_device_mapping, node, limits, 

2466 filter_properties, request_spec, accel_uuids) 

2467 LOG.info('Took %0.2f seconds to build instance.', 

2468 timer.elapsed(), instance=instance) 

2469 return build_results.ACTIVE 

2470 except exception.RescheduledException as e: 

2471 retry = filter_properties.get('retry') 

2472 if not retry: 

2473 # no retry information, do not reschedule. 

2474 LOG.debug("Retry info not present, will not reschedule", 

2475 instance=instance) 

2476 self._cleanup_allocated_networks(context, instance, 

2477 requested_networks) 

2478 compute_utils.add_instance_fault_from_exc(context, 

2479 instance, e, sys.exc_info(), 

2480 fault_message=e.kwargs['reason']) 

2481 self._nil_out_instance_obj_host_and_node(instance) 

2482 self._set_instance_obj_error_state(instance, 

2483 clean_task_state=True) 

2484 if isinstance(e, exception.RescheduledByPolicyException): 

2485 return build_results.FAILED_BY_POLICY 

2486 return build_results.FAILED 

2487 LOG.debug(e.format_message(), instance=instance) 

2488 # This will be used for logging the exception 

2489 retry['exc'] = traceback.format_exception(*sys.exc_info()) 

2490 # This will be used for setting the instance fault message 

2491 retry['exc_reason'] = e.kwargs['reason'] 

2492 

2493 self._cleanup_allocated_networks(context, instance, 

2494 requested_networks) 

2495 

2496 self._nil_out_instance_obj_host_and_node(instance) 

2497 instance.task_state = task_states.SCHEDULING 

2498 instance.save() 

2499 # The instance will have already claimed resources from this host 

2500 # before this build was attempted. Now that it has failed, we need 

2501 # to unclaim those resources before casting to the conductor, so 

2502 # that if there are alternate hosts available for a retry, it can 

2503 # claim resources on that new host for the instance. 

2504 self.reportclient.delete_allocation_for_instance( 

2505 context, instance.uuid, force=True) 

2506 

2507 self.compute_task_api.build_instances(context, [instance], 

2508 image, filter_properties, admin_password, 

2509 injected_files, requested_networks, security_groups, 

2510 block_device_mapping, request_spec=request_spec, 

2511 host_lists=[host_list]) 

2512 

2513 if isinstance(e, exception.RescheduledByPolicyException): 

2514 return build_results.RESCHEDULED_BY_POLICY 

2515 

2516 return build_results.RESCHEDULED 

2517 except (exception.InstanceNotFound, 

2518 exception.UnexpectedDeletingTaskStateError): 

2519 msg = 'Instance disappeared during build.' 

2520 LOG.debug(msg, instance=instance) 

2521 self._cleanup_allocated_networks(context, instance, 

2522 requested_networks) 

2523 return build_results.FAILED 

2524 except Exception as e: 

2525 if isinstance(e, exception.BuildAbortException): 

2526 LOG.error(e.format_message(), instance=instance) 

2527 else: 

2528 # Should not reach here. 

2529 LOG.exception('Unexpected build failure, not rescheduling ' 

2530 'build.', instance=instance) 

2531 self._cleanup_allocated_networks(context, instance, 

2532 requested_networks) 

2533 self._cleanup_volumes(context, instance, 

2534 block_device_mapping, raise_exc=False) 

2535 compute_utils.add_instance_fault_from_exc(context, instance, 

2536 e, sys.exc_info()) 

2537 self._nil_out_instance_obj_host_and_node(instance) 

2538 self._set_instance_obj_error_state(instance, clean_task_state=True) 

2539 return build_results.FAILED 

2540 

2541 @staticmethod 

2542 def _get_scheduler_hints(filter_properties, request_spec=None): 

2543 """Helper method to get scheduler hints. 

2544 

2545 This method prefers to get the hints out of the request spec, but that 

2546 might not be provided. Conductor will pass request_spec down to the 

2547 first compute chosen for a build but older computes will not pass 

2548 the request_spec to conductor's build_instances method for a 

2549 reschedule, so if we're on a host via a retry, request_spec may not 

2550 be provided, so we need to fall back to the filter_properties 

2551 to get scheduler hints. 

2552 """ 

2553 hints = {} 

2554 if request_spec is not None and 'scheduler_hints' in request_spec: 

2555 hints = request_spec.scheduler_hints 

2556 if not hints: 

2557 hints = filter_properties.get('scheduler_hints') or {} 

2558 return hints 

2559 
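The precedence the docstring describes (take hints from the RequestSpec when present, otherwise fall back to filter_properties on an older-style reschedule) can be exercised with plain dicts; a hedged sketch, not the real RequestSpec objects:

def pick_scheduler_hints(filter_properties, request_spec=None):
    hints = {}
    if request_spec is not None and 'scheduler_hints' in request_spec:
        hints = request_spec['scheduler_hints']
    if not hints:
        hints = filter_properties.get('scheduler_hints') or {}
    return hints

# New-style build: hints come from the request spec.
print(pick_scheduler_hints({'scheduler_hints': {'group': ['old']}},
                           {'scheduler_hints': {'group': ['new']}}))
# Reschedule from an older compute with no request spec: fall back.
print(pick_scheduler_hints({'scheduler_hints': {'group': ['old']}}))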

2560 @staticmethod 

2561 def _get_request_group_mapping(request_spec): 

2562 """Return request group resource - provider mapping. This is currently 

2563 used for Neutron ports that have resource request due to the port 

2564 having QoS minimum bandwidth policy rule attached. 

2565 

2566 :param request_spec: A RequestSpec object or None 

2567 :returns: A dict keyed by RequestGroup requester_id, currently Neutron 

2568 port_id, to resource provider UUID that provides resource for that 

2569 RequestGroup. Or None if the request_spec was None. 

2570 """ 

2571 # TODO(sbauza): Remove this conditional once we only support 

2572 # RPC API 6.0 

2573 if request_spec: 

2574 return request_spec.get_request_group_mapping() 

2575 else: 

2576 return None 

2577 

2578 def _build_and_run_instance(self, context, instance, image, injected_files, 

2579 admin_password, requested_networks, security_groups, 

2580 block_device_mapping, node, limits, filter_properties, 

2581 request_spec=None, accel_uuids=None): 

2582 

2583 image_name = image.get('name') 

2584 self._notify_about_instance_usage(context, instance, 'create.start', 

2585 extra_usage_info={'image_name': image_name}) 

2586 compute_utils.notify_about_instance_create( 

2587 context, instance, self.host, 

2588 phase=fields.NotificationPhase.START, 

2589 bdms=block_device_mapping) 

2590 

2591 # NOTE(mikal): cache the keystone roles associated with the instance 

2592 # at boot time for later reference 

2593 instance.system_metadata.update( 

2594 {'boot_roles': ','.join(context.roles)}) 

2595 

2596 self._check_device_tagging(requested_networks, block_device_mapping) 

2597 self._check_trusted_certs(instance) 

2598 

2599 provider_mapping = self._get_request_group_mapping(request_spec) 

2600 

2601 if provider_mapping: 

2602 try: 

2603 compute_utils.update_pci_request_with_placement_allocations( 

2604 context, 

2605 self.reportclient, 

2606 instance.pci_requests.requests, 

2607 provider_mapping, 

2608 ) 

2609 except (exception.AmbiguousResourceProviderForPCIRequest, 

2610 exception.UnexpectedResourceProviderNameForPCIRequest 

2611 ) as e: 

2612 raise exception.BuildAbortException( 

2613 reason=str(e), instance_uuid=instance.uuid) 

2614 

2615 # TODO(Luyao) cut over to get_allocs_for_consumer 

2616 allocs = self.reportclient.get_allocations_for_consumer( 

2617 context, instance.uuid) 

2618 

2619 try: 

2620 scheduler_hints = self._get_scheduler_hints(filter_properties, 

2621 request_spec) 

2622 with self.rt.instance_claim(context, instance, node, allocs, 

2623 limits): 

2624 # NOTE(russellb) It's important that this validation be done 

2625 # *after* the resource tracker instance claim, as that is where 

2626 # the host is set on the instance. 

2627 self._validate_instance_group_policy(context, instance, 

2628 scheduler_hints) 

2629 image_meta = objects.ImageMeta.from_dict(image) 

2630 

2631 with self._build_resources(context, instance, 

2632 requested_networks, security_groups, image_meta, 

2633 block_device_mapping, provider_mapping, 

2634 accel_uuids) as resources: 

2635 instance.vm_state = vm_states.BUILDING 

2636 instance.task_state = task_states.SPAWNING 

2637 # NOTE(JoshNang) This also saves the changes to the 

2638 # instance from _allocate_network_async, as they aren't 

2639 # saved in that function to prevent races. 

2640 instance.save(expected_task_state= 

2641 task_states.BLOCK_DEVICE_MAPPING) 

2642 block_device_info = resources['block_device_info'] 

2643 network_info = resources['network_info'] 

2644 accel_info = resources['accel_info'] 

2645 LOG.debug('Start spawning the instance on the hypervisor.', 

2646 instance=instance) 

2647 with timeutils.StopWatch() as timer: 

2648 self.driver.spawn(context, instance, image_meta, 

2649 injected_files, admin_password, 

2650 allocs, network_info=network_info, 

2651 block_device_info=block_device_info, 

2652 accel_info=accel_info) 

2653 LOG.info('Took %0.2f seconds to spawn the instance on ' 

2654 'the hypervisor.', timer.elapsed(), 

2655 instance=instance) 

2656 except (exception.InstanceNotFound, 

2657 exception.UnexpectedDeletingTaskStateError) as e: 

2658 with excutils.save_and_reraise_exception(): 

2659 self._notify_about_instance_usage(context, instance, 

2660 'create.error', fault=e) 

2661 compute_utils.notify_about_instance_create( 

2662 context, instance, self.host, 

2663 phase=fields.NotificationPhase.ERROR, exception=e, 

2664 bdms=block_device_mapping) 

2665 except exception.ComputeResourcesUnavailable as e: 

2666 LOG.debug(e.format_message(), instance=instance) 

2667 self._notify_about_instance_usage(context, instance, 

2668 'create.error', fault=e) 

2669 compute_utils.notify_about_instance_create( 

2670 context, instance, self.host, 

2671 phase=fields.NotificationPhase.ERROR, exception=e, 

2672 bdms=block_device_mapping) 

2673 raise exception.RescheduledException( 

2674 instance_uuid=instance.uuid, reason=e.format_message()) 

2675 except exception.BuildAbortException as e: 

2676 with excutils.save_and_reraise_exception(): 

2677 LOG.debug(e.format_message(), instance=instance) 

2678 self._notify_about_instance_usage(context, instance, 

2679 'create.error', fault=e) 

2680 compute_utils.notify_about_instance_create( 

2681 context, instance, self.host, 

2682 phase=fields.NotificationPhase.ERROR, exception=e, 

2683 bdms=block_device_mapping) 

2684 except exception.NoMoreFixedIps as e: 

2685             LOG.warning('No more fixed IPs to be allocated', 

2686 instance=instance) 

2687 self._notify_about_instance_usage(context, instance, 

2688 'create.error', fault=e) 

2689 compute_utils.notify_about_instance_create( 

2690 context, instance, self.host, 

2691 phase=fields.NotificationPhase.ERROR, exception=e, 

2692 bdms=block_device_mapping) 

2693 msg = _('Failed to allocate the network(s) with error %s, ' 

2694 'not rescheduling.') % e.format_message() 

2695 raise exception.BuildAbortException(instance_uuid=instance.uuid, 

2696 reason=msg) 

2697 except (exception.ExternalNetworkAttachForbidden, 

2698 exception.VirtualInterfaceCreateException, 

2699 exception.VirtualInterfaceMacAddressException, 

2700 exception.FixedIpInvalidOnHost, 

2701 exception.UnableToAutoAllocateNetwork, 

2702 exception.NetworksWithQoSPolicyNotSupported) as e: 

2703 LOG.exception('Failed to allocate network(s)', 

2704 instance=instance) 

2705 self._notify_about_instance_usage(context, instance, 

2706 'create.error', fault=e) 

2707 compute_utils.notify_about_instance_create( 

2708 context, instance, self.host, 

2709 phase=fields.NotificationPhase.ERROR, exception=e, 

2710 bdms=block_device_mapping) 

2711 msg = _('Failed to allocate the network(s), not rescheduling.') 

2712 raise exception.BuildAbortException(instance_uuid=instance.uuid, 

2713 reason=msg) 

2714 except (exception.FlavorDiskTooSmall, 

2715 exception.FlavorMemoryTooSmall, 

2716 exception.ImageNotActive, 

2717 exception.ImageUnacceptable, 

2718 exception.InvalidDiskInfo, 

2719 exception.InvalidDiskFormat, 

2720 cursive_exception.SignatureVerificationError, 

2721 exception.CertificateValidationFailed, 

2722 exception.VolumeEncryptionNotSupported, 

2723 exception.InvalidInput, 

2724 # TODO(mriedem): We should be validating RequestedVRamTooHigh 

2725 # in the API during server create and rebuild. 

2726 exception.RequestedVRamTooHigh) as e: 

2727 self._notify_about_instance_usage(context, instance, 

2728 'create.error', fault=e) 

2729 compute_utils.notify_about_instance_create( 

2730 context, instance, self.host, 

2731 phase=fields.NotificationPhase.ERROR, exception=e, 

2732 bdms=block_device_mapping) 

2733 raise exception.BuildAbortException(instance_uuid=instance.uuid, 

2734 reason=e.format_message()) 

2735 except exception.GroupAffinityViolation as e: 

2736 LOG.exception('Failed to build and run instance', 

2737 instance=instance) 

2738 self._notify_about_instance_usage(context, instance, 

2739 'create.error', fault=e) 

2740 compute_utils.notify_about_instance_create( 

2741 context, instance, self.host, 

2742 phase=fields.NotificationPhase.ERROR, exception=e, 

2743 bdms=block_device_mapping) 

2744 raise exception.RescheduledByPolicyException( 

2745 instance_uuid=instance.uuid, reason=str(e)) 

2746 except Exception as e: 

2747 LOG.exception('Failed to build and run instance', 

2748 instance=instance) 

2749 self._notify_about_instance_usage(context, instance, 

2750 'create.error', fault=e) 

2751 compute_utils.notify_about_instance_create( 

2752 context, instance, self.host, 

2753 phase=fields.NotificationPhase.ERROR, exception=e, 

2754 bdms=block_device_mapping) 

2755 raise exception.RescheduledException( 

2756 instance_uuid=instance.uuid, reason=str(e)) 

2757 

2758 # NOTE(alaski): This is only useful during reschedules, remove it now. 

2759 instance.system_metadata.pop('network_allocated', None) 

2760 

2761 # If CONF.default_access_ip_network_name is set, grab the 

2762 # corresponding network and set the access ip values accordingly. 

2763 network_name = CONF.default_access_ip_network_name 

2764 if (network_name and not instance.access_ip_v4 and 

2765 not instance.access_ip_v6): 

2766 # Note that when there are multiple ips to choose from, an 

2767 # arbitrary one will be chosen. 

2768                 for vif in network_info:  2768 ↛ 2777 (line 2768 didn't jump to line 2777 because the loop on line 2768 didn't complete)

2769                     if vif['network']['label'] == network_name:  2769 ↛ 2768 (line 2769 didn't jump to line 2768 because the condition on line 2769 was always true)

2770 for ip in vif.fixed_ips(): 

2771 if not instance.access_ip_v4 and ip['version'] == 4: 

2772 instance.access_ip_v4 = ip['address'] 

2773 if not instance.access_ip_v6 and ip['version'] == 6: 

2774 instance.access_ip_v6 = ip['address'] 

2775 break 

2776 

2777 self._update_instance_after_spawn(instance) 

2778 

2779 try: 

2780 instance.save(expected_task_state=task_states.SPAWNING) 

2781 except (exception.InstanceNotFound, 

2782 exception.UnexpectedDeletingTaskStateError) as e: 

2783 with excutils.save_and_reraise_exception(): 

2784 self._notify_about_instance_usage(context, instance, 

2785 'create.error', fault=e) 

2786 compute_utils.notify_about_instance_create( 

2787 context, instance, self.host, 

2788 phase=fields.NotificationPhase.ERROR, exception=e, 

2789 bdms=block_device_mapping) 

2790 

2791 self._update_scheduler_instance_info(context, instance) 

2792 self._notify_about_instance_usage(context, instance, 'create.end', 

2793 extra_usage_info={'message': _('Success')}, 

2794 network_info=network_info) 

2795 compute_utils.notify_about_instance_create(context, instance, 

2796 self.host, phase=fields.NotificationPhase.END, 

2797 bdms=block_device_mapping) 

2798 

2799 def _build_resources_cleanup(self, instance, network_info): 

2800 # Make sure the async call finishes 

2801         if network_info is not None:  2801 ↛ 2805 (line 2801 didn't jump to line 2805 because the condition on line 2801 was always true)

2802 network_info.wait(do_raise=False) 

2803 self.driver.clean_networks_preparation(instance, 

2804 network_info) 

2805 self.driver.failed_spawn_cleanup(instance) 

2806 

2807 @contextlib.contextmanager 

2808 def _build_resources(self, context, instance, requested_networks, 

2809 security_groups, image_meta, block_device_mapping, 

2810 resource_provider_mapping, accel_uuids): 

2811 resources = {} 

2812 network_info = None 

2813 spec_arqs = {} 

2814 network_arqs = {} 

2815 

2816 try: 

2817 if accel_uuids: 

2818 arqs = self._get_bound_arq_resources( 

2819 context, instance, accel_uuids) 

2820 spec_arqs, network_arqs = self._split_network_arqs( 

2821 arqs, requested_networks) 

2822 LOG.debug("ARQs for spec:%s, ARQs for network:%s", 

2823 spec_arqs, network_arqs) 

2824 except (Exception, eventlet.timeout.Timeout) as exc: 

2825 LOG.exception(exc) 

2826             # ARQs may have been created for the instance or its ports.

2827             # The port binding isn't done yet, so unbinding the port

2828             # won't clean up the port ARQs.

2829 compute_utils.delete_arqs_if_needed( 

2830 context, instance, accel_uuids) 

2831 msg = _('Failure getting accelerator requests.') 

2832 raise exception.BuildAbortException( 

2833 reason=msg, instance_uuid=instance.uuid) 

2834 

2835 try: 

2836 LOG.debug('Start building networks asynchronously for instance.', 

2837 instance=instance) 

2838 network_info = self._build_networks_for_instance(context, instance, 

2839 requested_networks, security_groups, 

2840 resource_provider_mapping, network_arqs) 

2841 resources['network_info'] = network_info 

2842 except (exception.InstanceNotFound, 

2843 exception.UnexpectedDeletingTaskStateError): 

2844 raise 

2845 except exception.UnexpectedTaskStateError as e: 

2846 raise exception.BuildAbortException(instance_uuid=instance.uuid, 

2847 reason=e.format_message()) 

2848 except Exception: 

2849             # Because this allocation is async, any failures are likely to occur

2850 # when the driver accesses network_info during spawn(). 

2851 LOG.exception('Failed to allocate network(s)', 

2852 instance=instance) 

2853 msg = _('Failed to allocate the network(s), not rescheduling.') 

2854 raise exception.BuildAbortException(instance_uuid=instance.uuid, 

2855 reason=msg) 

2856 

2857 try: 

2858 # Perform any driver preparation work for the driver. 

2859 self.driver.prepare_for_spawn(instance) 

2860 

2861 # Depending on a virt driver, some network configuration is 

2862 # necessary before preparing block devices. 

2863 self.driver.prepare_networks_before_block_device_mapping( 

2864 instance, network_info) 

2865 

2866 # Verify that all the BDMs have a device_name set and assign a 

2867 # default to the ones missing it with the help of the driver. 

2868 self._default_block_device_names(instance, image_meta, 

2869 block_device_mapping) 

2870 

2871 LOG.debug('Start building block device mappings for instance.', 

2872 instance=instance) 

2873 instance.vm_state = vm_states.BUILDING 

2874 instance.task_state = task_states.BLOCK_DEVICE_MAPPING 

2875 instance.save() 

2876 

2877 block_device_info = self._prep_block_device(context, instance, 

2878 block_device_mapping) 

2879 resources['block_device_info'] = block_device_info 

2880 except (exception.InstanceNotFound, 

2881 exception.UnexpectedDeletingTaskStateError, 

2882 exception.ComputeResourcesUnavailable): 

2883 with excutils.save_and_reraise_exception(): 

2884 self._build_resources_cleanup(instance, network_info) 

2885 except (exception.UnexpectedTaskStateError, 

2886 exception.InstanceUnacceptable, 

2887 exception.OverQuota, exception.InvalidBDM) as e: 

2888 self._build_resources_cleanup(instance, network_info) 

2889 raise exception.BuildAbortException(instance_uuid=instance.uuid, 

2890 reason=e.format_message()) 

2891 except Exception: 

2892 LOG.exception('Failure prepping block device', 

2893 instance=instance) 

2894 self._build_resources_cleanup(instance, network_info) 

2895 msg = _('Failure prepping block device.') 

2896 raise exception.BuildAbortException(instance_uuid=instance.uuid, 

2897 reason=msg) 

2898 

2899 resources['accel_info'] = list(spec_arqs.values()) 

2900 try: 

2901 yield resources 

2902 except Exception as exc: 

2903 with excutils.save_and_reraise_exception() as ctxt: 

2904 if not isinstance(exc, ( 

2905 exception.InstanceNotFound, 

2906 exception.UnexpectedDeletingTaskStateError)): 

2907 LOG.exception('Instance failed to spawn', 

2908 instance=instance) 

2909 # Make sure the async call finishes 

2910                 if network_info is not None:  2910 ↛ 2916 (line 2910 didn't jump to line 2916 because the condition on line 2910 was always true)

2911 network_info.wait(do_raise=False) 

2912                     # If network_info is empty we're likely here because of a

2913                     # network allocation failure. Since nothing can be reused on

2914                     # rescheduling, it's better to deallocate the network to

2915                     # eliminate the chance of orphaned ports in neutron.

2916                 deallocate_networks = not network_info 

2917 try: 

2918 self._shutdown_instance(context, instance, 

2919 block_device_mapping, requested_networks, 

2920 try_deallocate_networks=deallocate_networks) 

2921 except Exception as exc2: 

2922 ctxt.reraise = False 

2923 LOG.warning('Could not clean up failed build,' 

2924 ' not rescheduling. Error: %s', 

2925 str(exc2)) 

2926 raise exception.BuildAbortException( 

2927 instance_uuid=instance.uuid, 

2928 reason=str(exc)) 

2929 finally: 

2930 # Call Cyborg to delete accelerator requests 

2931             if accel_uuids:  2931 ↛ 2935 (line 2931 didn't jump to line 2935 because the condition on line 2931 was never true)

2932                 # ARQs may have been created for the instance or its ports.

2933                 # The port binding may not have succeeded, and unbinding

2934                 # the port won't clean up the port ARQs.

2935 compute_utils.delete_arqs_if_needed( 

2936 context, instance, accel_uuids) 

2937 

2938 def _get_bound_arq_resources(self, context, instance, arq_uuids): 

2939 """Get bound accelerator requests. 

2940 

2941 The ARQ binding was kicked off in the conductor as an async 

2942 operation. Here we wait for the notification from Cyborg. 

2943 

2944 If the notification arrived before this point, which can happen 

2945 in many/most cases (see [1]), it will be lost. To handle that, 

2946 we use exit_wait_early. 

2947 [1] https://review.opendev.org/#/c/631244/46/nova/compute/ 

2948 manager.py@2627 

2949 

2950 :param instance: instance object 

2951 :param arq_uuids: List of accelerator request (ARQ) UUIDs. 

2952 :returns: List of ARQs for which bindings have completed, 

2953 successfully or otherwise 

2954 """ 

2955 

2956 cyclient = cyborg.get_client(context) 

2957 if arq_uuids is None: 

2958 arqs = cyclient.get_arqs_for_instance(instance.uuid) 

2959 arq_uuids = [arq['uuid'] for arq in arqs] 

2960 events = [('accelerator-request-bound', arq_uuid) 

2961 for arq_uuid in arq_uuids] 

2962 

2963 timeout = CONF.arq_binding_timeout 

2964 with self.virtapi.wait_for_instance_event( 

2965 instance, events, deadline=timeout): 

2966 resolved_arqs = cyclient.get_arqs_for_instance( 

2967 instance.uuid, only_resolved=True) 

2968 # Events for these resolved ARQs may have already arrived. 

2969 # Such 'early' events need to be ignored. 

2970 early_events = [('accelerator-request-bound', arq['uuid']) 

2971 for arq in resolved_arqs] 

2972 if early_events: 

2973 self.virtapi.exit_wait_early(early_events) 

2974 

2975 # Since a timeout in wait_for_instance_event will raise, we get 

2976 # here only if all binding events have been received. 

2977 resolved_uuids = [arq['uuid'] for arq in resolved_arqs] 

2978 if sorted(resolved_uuids) != sorted(arq_uuids): 

2979 # Query Cyborg to get all. 

2980 arqs = cyclient.get_arqs_for_instance(instance.uuid) 

2981 else: 

2982 arqs = resolved_arqs 

2983 return arqs 

2984 
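    # A rough sketch of how the wait above plays out (uuids hypothetical):
    # with arq_uuids = ['a1', 'a2', 'a3'] and Cyborg reporting 'a1' and 'a2'
    # as already resolved when the wait starts, exit_wait_early() is called
    # for those two events and the context manager only blocks on the
    # 'accelerator-request-bound' event for 'a3', up to the
    # CONF.arq_binding_timeout deadline.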

2985 def _split_network_arqs(self, arqs, requested_networks): 

2986         """Split ARQs requested by flavor extra specs from those requested by ports.

2987 

2988         Return a tuple of ARQ groups: (spec_arqs, port_arqs)

2989 Each item in the tuple is a dict like: 

2990 { 

2991 arq1_uuid: arq1 

2992 } 

2993 """ 

2994 port_arqs = {} 

2995 spec_arqs = {} 

2996 port_arqs_uuids = [req_net.arq_uuid for req_net in requested_networks] 

2997 for arq in arqs: 

2998 if arq['uuid'] in port_arqs_uuids: 

2999 port_arqs.update({arq['uuid']: arq}) 

3000 else: 

3001 spec_arqs.update({arq['uuid']: arq}) 

3002 

3003 return spec_arqs, port_arqs 

3004 
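    # A worked example of the split above (uuids hypothetical, ARQ contents
    # abbreviated): with arqs = [{'uuid': 'a1'}, {'uuid': 'a2'}] and a
    # requested_networks entry carrying arq_uuid='a2', this returns
    # ({'a1': {'uuid': 'a1'}}, {'a2': {'uuid': 'a2'}}), i.e. 'a1' was
    # requested via flavor extra specs and 'a2' by a port.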

3005 def _cleanup_allocated_networks(self, context, instance, 

3006 requested_networks): 

3007 """Cleanup networks allocated for instance. 

3008 

3009 :param context: nova request context 

3010 :param instance: nova.objects.instance.Instance object 

3011 :param requested_networks: nova.objects.NetworkRequestList 

3012 """ 

3013 LOG.debug('Unplugging VIFs for instance', instance=instance) 

3014 

3015 network_info = instance.get_network_info() 

3016 

3017 # NOTE(stephenfin) to avoid nova destroying the instance without 

3018 # unplugging the interface, refresh network_info if it is empty. 

3019         if not network_info:  3019 ↛ 3032 (line 3019 didn't jump to line 3032 because the condition on line 3019 was always true)

3020 try: 

3021 network_info = self.network_api.get_instance_nw_info( 

3022 context, instance, 

3023 ) 

3024 except Exception as exc: 

3025 LOG.warning( 

3026 'Failed to update network info cache when cleaning up ' 

3027                     'allocated networks. Stale VIFs may be left on this host. ' 

3028 'Error: %s', str(exc) 

3029 ) 

3030 return 

3031 

3032 try: 

3033 self.driver.unplug_vifs(instance, network_info) 

3034 except NotImplementedError: 

3035 # This is an optional method so ignore things if it doesn't exist 

3036 LOG.debug( 

3037 'Virt driver does not provide unplug_vifs method, so it ' 

3038                 'is not possible to determine if VIFs should be unplugged.' 

3039 ) 

3040 except Exception as exc: 

3041 # It's possible that the instance never got as far as plugging 

3042 # VIFs, in which case we would see an exception which can be 

3043 # mostly ignored 

3044 LOG.warning( 

3045 'Cleaning up VIFs failed for instance. Error: %s', 

3046 str(exc), instance=instance, 

3047 ) 

3048 else: 

3049 LOG.debug('Unplugged VIFs for instance', instance=instance) 

3050 

3051 try: 

3052 self._deallocate_network(context, instance, requested_networks) 

3053 except Exception: 

3054 LOG.exception('Failed to deallocate networks', instance=instance) 

3055 return 

3056 

3057 instance.system_metadata['network_allocated'] = 'False' 

3058 try: 

3059 instance.save() 

3060 except exception.InstanceNotFound: 

3061 # NOTE(alaski): It's possible that we're cleaning up the networks 

3062 # because the instance was deleted. If that's the case then this 

3063 # exception will be raised by instance.save() 

3064 pass 

3065 

3066 def _try_deallocate_network(self, context, instance, 

3067 requested_networks=None): 

3068 

3069 # During auto-scale cleanup, we could be deleting a large number 

3070 # of servers at the same time and overloading parts of the system, 

3071 # so we retry a few times in case of connection failures to the 

3072 # networking service. 

3073 @loopingcall.RetryDecorator( 

3074 max_retry_count=3, inc_sleep_time=2, max_sleep_time=12, 

3075 exceptions=(keystone_exception.connection.ConnectFailure,)) 

3076 def _deallocate_network_with_retries(): 

3077 try: 

3078 self._deallocate_network( 

3079 context, instance, requested_networks) 

3080 except keystone_exception.connection.ConnectFailure as e: 

3081 # Provide a warning that something is amiss. 

3082 with excutils.save_and_reraise_exception(): 

3083 LOG.warning('Failed to deallocate network for instance; ' 

3084 'retrying. Error: %s', str(e), 

3085 instance=instance) 

3086 
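        # For illustration, assuming RetryDecorator grows the sleep interval
        # by inc_sleep_time on each attempt up to max_sleep_time: a call that
        # keeps hitting ConnectFailure would sleep roughly 2s, 4s and 6s
        # between the retries before the exception is finally re-raised.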

3087 try: 

3088 # tear down allocated network structure 

3089 _deallocate_network_with_retries() 

3090 except Exception as ex: 

3091 with excutils.save_and_reraise_exception(): 

3092 LOG.error('Failed to deallocate network for instance. ' 

3093 'Error: %s', ex, instance=instance) 

3094 self._set_instance_obj_error_state(instance) 

3095 

3096 def _get_power_off_values(self, instance, clean_shutdown): 

3097 """Get the timing configuration for powering down this instance.""" 

3098 if clean_shutdown: 

3099 timeout = compute_utils.get_value_from_system_metadata(instance, 

3100 key='image_os_shutdown_timeout', type=int, 

3101 default=CONF.shutdown_timeout) 

3102 retry_interval = CONF.compute.shutdown_retry_interval 

3103 else: 

3104 timeout = 0 

3105 retry_interval = 0 

3106 

3107 return timeout, retry_interval 

3108 
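    # For illustration (values assumed, not from the source): with
    # clean_shutdown=True and 'image_os_shutdown_timeout' set to 120 in the
    # instance system metadata, this returns
    # (120, CONF.compute.shutdown_retry_interval); with clean_shutdown=False
    # it always returns (0, 0), i.e. an immediate hard power off.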

3109 def _power_off_instance(self, context, instance, clean_shutdown=True): 

3110 """Power off an instance on this host.""" 

3111 share_info = self._get_share_info(context, instance) 

3112 timeout, retry_interval = self._get_power_off_values( 

3113 instance, clean_shutdown) 

3114 self.driver.power_off(instance, timeout, retry_interval) 

3115 share_info.deactivate_all() 

3116 self._umount_all_shares(context, instance, share_info) 

3117 

3118 def _shutdown_instance(self, context, instance, 

3119 bdms, requested_networks=None, notify=True, 

3120 try_deallocate_networks=True): 

3121 """Shutdown an instance on this host. 

3122 

3123         :param context: security context 

3124         :param instance: a nova.objects.Instance object 

3125         :param bdms: the block devices for the instance to be torn 

3126                      down 

3127         :param requested_networks: the networks on which the instance 

3128                                    has ports 

3129         :param notify: true if a final usage notification should be 

3130                        emitted 

3131         :param try_deallocate_networks: false if we should avoid 

3132                                         trying to tear down networking 

3133 """ 

3134 context = context.elevated() 

3135 LOG.info('Terminating instance', instance=instance) 

3136 

3137 if notify: 

3138 self._notify_about_instance_usage(context, instance, 

3139 "shutdown.start") 

3140 compute_utils.notify_about_instance_action(context, instance, 

3141 self.host, action=fields.NotificationAction.SHUTDOWN, 

3142 phase=fields.NotificationPhase.START, bdms=bdms) 

3143 

3144 network_info = instance.get_network_info() 

3145 

3146 share_info = self._get_share_info( 

3147 context, instance, check_status=False 

3148 ) 

3149 

3150 # NOTE(arnaudmorin) to avoid nova destroying the instance without 

3151 # unplugging the interface, refresh network_info if it is empty. 

3152 if not network_info: 

3153 network_info = self.network_api.get_instance_nw_info( 

3154 context, instance) 

3155 

3156 # NOTE(vish) get bdms before destroying the instance 

3157 vol_bdms = [bdm for bdm in bdms if bdm.is_volume] 

3158 block_device_info = self._get_instance_block_device_info( 

3159 context, instance, bdms=bdms) 

3160 

3161 # NOTE(melwitt): attempt driver destroy before releasing ip, may 

3162 # want to keep ip allocated for certain failures 

3163 try: 

3164 LOG.debug('Start destroying the instance on the hypervisor.', 

3165 instance=instance) 

3166 with timeutils.StopWatch() as timer: 

3167 self.driver.destroy(context, instance, network_info, 

3168 block_device_info) 

3169 LOG.info('Took %0.2f seconds to destroy the instance on the ' 

3170 'hypervisor.', timer.elapsed(), instance=instance) 

3171 except exception.InstancePowerOffFailure: 

3172 # if the instance can't power off, don't release the ip 

3173 with excutils.save_and_reraise_exception(): 

3174 pass 

3175 except Exception: 

3176 with excutils.save_and_reraise_exception(): 

3177 # deallocate ip and fail without proceeding to 

3178 # volume api calls, preserving current behavior 

3179                 if try_deallocate_networks:  3179 ↛ 3183 (line 3179 didn't jump to line 3183)

3180 self._try_deallocate_network(context, instance, 

3181 requested_networks) 

3182 

3183 if try_deallocate_networks: 

3184 self._try_deallocate_network(context, instance, requested_networks) 

3185 

3186 timer.restart() 

3187 

3188 for share in share_info: 

3189             # If we fail to umount or deny the share we may be left with a

3190             # dangling share_mapping in the DB (a share_mapping entry whose

3191             # instance does not exist anymore).

3192 try: 

3193 self._umount_share(context, instance, share) 

3194 share.deactivate() 

3195 self.deny_share(context, instance, share) 

3196 except ( 

3197 exception.ShareUmountError, 

3198 exception.ShareNotFound, 

3199 exception.ShareAccessNotFound, 

3200 exception.ShareAccessRemovalError, 

3201 ): 

3202 LOG.warning("An error occurred while unmounting or " 

3203 "denying the share '%s'. This error is ignored to " 

3204 "proceed with instance removal. " 

3205 "Consequently, there may be a dangling " 

3206 "share_mapping.", share.share_id) 

3207 

3208 connector = None 

3209 for bdm in vol_bdms: 

3210 try: 

3211 if bdm.attachment_id: 

3212 self.volume_api.attachment_delete(context, 

3213 bdm.attachment_id) 

3214 else: 

3215 # NOTE(vish): actual driver detach done in driver.destroy, 

3216 # so just tell cinder that we are done with it. 

3217 if connector is None: 

3218 connector = self.driver.get_volume_connector(instance) 

3219 self.volume_api.terminate_connection(context, 

3220 bdm.volume_id, 

3221 connector) 

3222 self.volume_api.detach(context, bdm.volume_id, 

3223 instance.uuid) 

3224 

3225 except exception.VolumeAttachmentNotFound as exc: 

3226 LOG.debug('Ignoring VolumeAttachmentNotFound: %s', exc, 

3227 instance=instance) 

3228 except exception.DiskNotFound as exc: 

3229 LOG.debug('Ignoring DiskNotFound: %s', exc, 

3230 instance=instance) 

3231 except exception.VolumeNotFound as exc: 

3232 LOG.debug('Ignoring VolumeNotFound: %s', exc, 

3233 instance=instance) 

3234 except (cinder_exception.EndpointNotFound, 

3235 keystone_exception.EndpointNotFound) as exc: 

3236 LOG.warning('Ignoring EndpointNotFound for ' 

3237 'volume %(volume_id)s: %(exc)s', 

3238 {'exc': exc, 'volume_id': bdm.volume_id}, 

3239 instance=instance) 

3240 except cinder_exception.ClientException as exc: 

3241 LOG.warning('Ignoring unknown cinder exception for ' 

3242 'volume %(volume_id)s: %(exc)s', 

3243 {'exc': exc, 'volume_id': bdm.volume_id}, 

3244 instance=instance) 

3245 except Exception as exc: 

3246 LOG.warning('Ignoring unknown exception for ' 

3247 'volume %(volume_id)s: %(exc)s', 

3248 {'exc': exc, 'volume_id': bdm.volume_id}, 

3249 instance=instance) 

3250 if vol_bdms: 

3251 LOG.info('Took %(time).2f seconds to detach %(num)s volumes ' 

3252 'for instance.', 

3253 {'time': timer.elapsed(), 'num': len(vol_bdms)}, 

3254 instance=instance) 

3255 

3256 if notify: 

3257 self._notify_about_instance_usage(context, instance, 

3258 "shutdown.end") 

3259 compute_utils.notify_about_instance_action(context, instance, 

3260 self.host, action=fields.NotificationAction.SHUTDOWN, 

3261 phase=fields.NotificationPhase.END, bdms=bdms) 

3262 

3263 def _cleanup_volumes(self, context, instance, bdms, raise_exc=True, 

3264 detach=True): 

3265 original_exception = None 

3266 for bdm in bdms: 

3267 if detach and bdm.volume_id: 

3268 try: 

3269 LOG.debug("Detaching volume: %s", bdm.volume_id, 

3270 instance_uuid=instance.uuid) 

3271 destroy = bdm.delete_on_termination 

3272 self._detach_volume(context, bdm, instance, 

3273 destroy_bdm=destroy) 

3274 except Exception as exc: 

3275 original_exception = exc 

3276 LOG.warning('Failed to detach volume: %(volume_id)s ' 

3277 'due to %(exc)s', 

3278 {'volume_id': bdm.volume_id, 'exc': exc}) 

3279 

3280 if bdm.volume_id and bdm.delete_on_termination: 

3281 try: 

3282 LOG.debug("Deleting volume: %s", bdm.volume_id, 

3283 instance_uuid=instance.uuid) 

3284 self.volume_api.delete(context, bdm.volume_id) 

3285 except Exception as exc: 

3286 original_exception = exc 

3287 LOG.warning('Failed to delete volume: %(volume_id)s ' 

3288 'due to %(exc)s', 

3289 {'volume_id': bdm.volume_id, 'exc': exc}) 

3290 if original_exception is not None and raise_exc: 

3291 raise original_exception 

3292 

3293 def _delete_instance(self, context, instance, bdms): 

3294 """Delete an instance on this host. 

3295 

3296 :param context: nova request context 

3297 :param instance: nova.objects.instance.Instance object 

3298 :param bdms: nova.objects.block_device.BlockDeviceMappingList object 

3299 """ 

3300 events = self.instance_events.clear_events_for_instance(instance) 

3301         if events:  3301 ↛ 3302 (line 3301 didn't jump to line 3302 because the condition on line 3301 was never true)

3302 LOG.debug('Events pending at deletion: %(events)s', 

3303 {'events': ','.join(events.keys())}, 

3304 instance=instance) 

3305 self._notify_about_instance_usage(context, instance, 

3306 "delete.start") 

3307 compute_utils.notify_about_instance_action(context, instance, 

3308 self.host, action=fields.NotificationAction.DELETE, 

3309 phase=fields.NotificationPhase.START, bdms=bdms) 

3310 

3311 self._shutdown_instance(context, instance, bdms) 

3312 

3313 # NOTE(vish): We have already deleted the instance, so we have 

3314 # to ignore problems cleaning up the volumes. It 

3315 # would be nice to let the user know somehow that 

3316 # the volume deletion failed, but it is not 

3317 # acceptable to have an instance that can not be 

3318 # deleted. Perhaps this could be reworked in the 

3319 # future to set an instance fault the first time 

3320 # and to only ignore the failure if the instance 

3321 # is already in ERROR. 

3322 

3323 # NOTE(ameeda): The volumes have already been detached during 

3324 # the above _shutdown_instance() call and this is 

3325 # why detach is not requested from 

3326 # _cleanup_volumes() in this case 

3327 

3328 self._cleanup_volumes(context, instance, bdms, 

3329 raise_exc=False, detach=False) 

3330 # Delete Cyborg ARQs if the instance has a device profile. 

3331 compute_utils.delete_arqs_if_needed(context, instance) 

3332 # if a delete task succeeded, always update vm state and task 

3333 # state without expecting task state to be DELETING 

3334 instance.vm_state = vm_states.DELETED 

3335 instance.task_state = None 

3336 instance.power_state = power_state.NOSTATE 

3337 instance.terminated_at = timeutils.utcnow() 

3338 instance.save() 

3339 

3340 self._complete_deletion(context, instance) 

3341 # only destroy the instance in the db if the _complete_deletion 

3342 # doesn't raise and therefore allocation is successfully 

3343 # deleted in placement 

3344 instance.destroy() 

3345 

3346 self._notify_about_instance_usage(context, instance, "delete.end") 

3347 compute_utils.notify_about_instance_action(context, instance, 

3348 self.host, action=fields.NotificationAction.DELETE, 

3349 phase=fields.NotificationPhase.END, bdms=bdms) 

3350 

3351 @wrap_exception() 

3352 @reverts_task_state 

3353 @wrap_instance_event(prefix='compute') 

3354 @wrap_instance_fault 

3355 def terminate_instance(self, context, instance, bdms): 

3356 """Terminate an instance on this host.""" 

3357 @utils.synchronized(instance.uuid) 

3358 def do_terminate_instance(instance, bdms): 

3359 # NOTE(mriedem): If we are deleting the instance while it was 

3360 # booting from volume, we could be racing with a database update of 

3361 # the BDM volume_id. Since the compute API passes the BDMs over RPC 

3362 # to compute here, the BDMs may be stale at this point. So check 

3363 # for any volume BDMs that don't have volume_id set and if we 

3364 # detect that, we need to refresh the BDM list before proceeding. 

3365 # TODO(mriedem): Move this into _delete_instance and make the bdms 

3366 # parameter optional. 

3367 for bdm in list(bdms): 

3368 if bdm.is_volume and not bdm.volume_id: 

3369 LOG.debug('There are potentially stale BDMs during ' 

3370 'delete, refreshing the BlockDeviceMappingList.', 

3371 instance=instance) 

3372 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( 

3373 context, instance.uuid) 

3374 break 

3375 try: 

3376 self._delete_instance(context, instance, bdms) 

3377 except exception.InstanceNotFound: 

3378 LOG.info("Instance disappeared during terminate", 

3379 instance=instance) 

3380 except Exception: 

3381                 # As we're trying to delete, always go to ERROR if something 

3382 # goes wrong that _delete_instance can't handle. 

3383 with excutils.save_and_reraise_exception(): 

3384 LOG.exception('Setting instance vm_state to ERROR', 

3385 instance=instance) 

3386 self._set_instance_obj_error_state(instance) 

3387 

3388 do_terminate_instance(instance, bdms) 

3389 

3390 # NOTE(johannes): This is probably better named power_off_instance 

3391 # so it matches the driver method, but because of other issues, we 

3392 # can't use that name in grizzly. 

3393 @wrap_exception() 

3394 @reverts_task_state 

3395 @wrap_instance_event(prefix='compute') 

3396 @wrap_instance_fault 

3397 def stop_instance(self, context, instance, clean_shutdown): 

3398         """Stop an instance on this host.""" 

3399 

3400 @utils.synchronized(instance.uuid) 

3401 def do_stop_instance(): 

3402 current_power_state = self._get_power_state(instance) 

3403 

3404 LOG.debug('Stopping instance; current vm_state: %(vm_state)s, ' 

3405 'current task_state: %(task_state)s, current DB ' 

3406 'power_state: %(db_power_state)s, current VM ' 

3407 'power_state: %(current_power_state)s', 

3408 {'vm_state': instance.vm_state, 

3409 'task_state': instance.task_state, 

3410 'db_power_state': instance.power_state, 

3411 'current_power_state': current_power_state}, 

3412 instance_uuid=instance.uuid) 

3413 

3414 # NOTE(mriedem): If the instance is already powered off, we are 

3415 # possibly tearing down and racing with other operations, so we can 

3416 # expect the task_state to be None if something else updates the 

3417 # instance and we're not locking it. 

3418 expected_task_state = [task_states.POWERING_OFF] 

3419 # The list of power states is from _sync_instance_power_state. 

3420 if current_power_state in (power_state.NOSTATE, 

3421 power_state.SHUTDOWN, 

3422 power_state.CRASHED): 

3423 LOG.info('Instance is already powered off in the ' 

3424 'hypervisor when stop is called.', 

3425 instance=instance) 

3426 expected_task_state.append(None) 

3427 

3428 self._notify_about_instance_usage(context, instance, 

3429 "power_off.start") 

3430 

3431 compute_utils.notify_about_instance_action(context, instance, 

3432 self.host, action=fields.NotificationAction.POWER_OFF, 

3433 phase=fields.NotificationPhase.START) 

3434 

3435 self._power_off_instance(context, instance, clean_shutdown) 

3436 instance.power_state = self._get_power_state(instance) 

3437 instance.vm_state = vm_states.STOPPED 

3438 instance.task_state = None 

3439 instance.save(expected_task_state=expected_task_state) 

3440 self._notify_about_instance_usage(context, instance, 

3441 "power_off.end") 

3442 

3443 compute_utils.notify_about_instance_action(context, instance, 

3444 self.host, action=fields.NotificationAction.POWER_OFF, 

3445 phase=fields.NotificationPhase.END) 

3446 

3447 do_stop_instance() 

3448 

3449 def _power_on(self, context, instance): 

3450 network_info = self.network_api.get_instance_nw_info(context, instance) 

3451 block_device_info = self._get_instance_block_device_info(context, 

3452 instance) 

3453 accel_info = self._get_accel_info(context, instance) 

3454 

3455 share_info = self._get_share_info(context, instance) 

3456 

3457 self._mount_all_shares(context, instance, share_info) 

3458 self.driver.power_on(context, instance, 

3459 network_info, 

3460 block_device_info, accel_info, share_info) 

3461 share_info.activate_all() 

3462 

3463 def _delete_snapshot_of_shelved_instance(self, context, instance, 

3464 snapshot_id): 

3465 """Delete snapshot of shelved instance.""" 

3466 try: 

3467 self.image_api.delete(context, snapshot_id) 

3468 except (exception.ImageNotFound, 

3469 exception.ImageNotAuthorized) as exc: 

3470 LOG.warning("Failed to delete snapshot " 

3471 "from shelved instance (%s).", 

3472 exc.format_message(), instance=instance) 

3473 except Exception: 

3474             LOG.exception("Something went wrong when trying to " 

3475                           "delete the snapshot from the shelved instance.", 

3476 instance=instance) 

3477 

3478 # NOTE(johannes): This is probably better named power_on_instance 

3479 # so it matches the driver method, but because of other issues, we 

3480 # can't use that name in grizzly. 

3481 @wrap_exception() 

3482 @reverts_task_state 

3483 @wrap_instance_event(prefix='compute') 

3484 @wrap_instance_fault 

3485 def start_instance(self, context, instance): 

3486         """Start an instance on this host.""" 

3487 self._notify_about_instance_usage(context, instance, "power_on.start") 

3488 compute_utils.notify_about_instance_action(context, instance, 

3489 self.host, action=fields.NotificationAction.POWER_ON, 

3490 phase=fields.NotificationPhase.START) 

3491 self._power_on(context, instance) 

3492 instance.power_state = self._get_power_state(instance) 

3493 instance.vm_state = vm_states.ACTIVE 

3494 instance.task_state = None 

3495 

3496 # Delete an image(VM snapshot) for a shelved instance 

3497 snapshot_id = instance.system_metadata.get('shelved_image_id') 

3498 if snapshot_id: 

3499 self._delete_snapshot_of_shelved_instance(context, instance, 

3500 snapshot_id) 

3501 

3502 # Delete system_metadata for a shelved instance 

3503 compute_utils.remove_shelved_keys_from_system_metadata(instance) 

3504 

3505 instance.save(expected_task_state=task_states.POWERING_ON) 

3506 self._notify_about_instance_usage(context, instance, "power_on.end") 

3507 compute_utils.notify_about_instance_action(context, instance, 

3508 self.host, action=fields.NotificationAction.POWER_ON, 

3509 phase=fields.NotificationPhase.END) 

3510 

3511 @messaging.expected_exceptions(NotImplementedError, 

3512 exception.TriggerCrashDumpNotSupported, 

3513 exception.InstanceNotRunning) 

3514 @wrap_exception() 

3515 @wrap_instance_event(prefix='compute') 

3516 @wrap_instance_fault 

3517 def trigger_crash_dump(self, context, instance): 

3518 """Trigger crash dump in an instance.""" 

3519 

3520 self._notify_about_instance_usage(context, instance, 

3521 "trigger_crash_dump.start") 

3522 compute_utils.notify_about_instance_action(context, instance, 

3523 self.host, action=fields.NotificationAction.TRIGGER_CRASH_DUMP, 

3524 phase=fields.NotificationPhase.START) 

3525 

3526 # This method does not change task_state and power_state because the 

3527 # effect of a trigger depends on user's configuration. 

3528 self.driver.trigger_crash_dump(instance) 

3529 

3530 self._notify_about_instance_usage(context, instance, 

3531 "trigger_crash_dump.end") 

3532 compute_utils.notify_about_instance_action(context, instance, 

3533 self.host, action=fields.NotificationAction.TRIGGER_CRASH_DUMP, 

3534 phase=fields.NotificationPhase.END) 

3535 

3536 @wrap_exception() 

3537 @reverts_task_state 

3538 @wrap_instance_event(prefix='compute') 

3539 @wrap_instance_fault 

3540 def soft_delete_instance(self, context, instance): 

3541 """Soft delete an instance on this host.""" 

3542 with compute_utils.notify_about_instance_delete( 

3543 self.notifier, context, instance, 'soft_delete', 

3544 source=fields.NotificationSource.COMPUTE): 

3545 try: 

3546 self.driver.soft_delete(instance) 

3547 except NotImplementedError: 

3548 # Fallback to just powering off the instance if the 

3549 # hypervisor doesn't implement the soft_delete method 

3550 self._power_off_instance( 

3551 context, instance, clean_shutdown=False 

3552 ) 

3553 instance.power_state = self._get_power_state(instance) 

3554 instance.vm_state = vm_states.SOFT_DELETED 

3555 instance.task_state = None 

3556 instance.save(expected_task_state=[task_states.SOFT_DELETING]) 

3557 

3558 @wrap_exception() 

3559 @reverts_task_state 

3560 @wrap_instance_event(prefix='compute') 

3561 @wrap_instance_fault 

3562 def restore_instance(self, context, instance): 

3563 """Restore a soft-deleted instance on this host.""" 

3564 self._notify_about_instance_usage(context, instance, "restore.start") 

3565 compute_utils.notify_about_instance_action(context, instance, 

3566 self.host, action=fields.NotificationAction.RESTORE, 

3567 phase=fields.NotificationPhase.START) 

3568 try: 

3569 self.driver.restore(instance) 

3570 except NotImplementedError: 

3571 # Fallback to just powering on the instance if the hypervisor 

3572 # doesn't implement the restore method 

3573 self._power_on(context, instance) 

3574 instance.power_state = self._get_power_state(instance) 

3575 instance.vm_state = vm_states.ACTIVE 

3576 instance.task_state = None 

3577 instance.save(expected_task_state=task_states.RESTORING) 

3578 self._notify_about_instance_usage(context, instance, "restore.end") 

3579 compute_utils.notify_about_instance_action(context, instance, 

3580 self.host, action=fields.NotificationAction.RESTORE, 

3581 phase=fields.NotificationPhase.END) 

3582 

3583 @staticmethod 

3584 def _set_migration_status(migration, status): 

3585 """Set the status, and guard against a None being passed in. 

3586 

3587 This is useful as some of the compute RPC calls will not pass 

3588 a migration object in older versions. The check can be removed when 

3589 we move past 4.x major version of the RPC API. 

3590 """ 

3591 if migration: 

3592 migration.status = status 

3593 migration.save() 

3594 

3595 @staticmethod 

3596 def _reimage_failed_callback(event_name, instance): 

3597 msg = ('Cinder reported failure during reimaging ' 

3598 'with %(event)s for instance %(uuid)s') 

3599 msg_args = {'event': event_name, 'uuid': instance.uuid} 

3600 LOG.error(msg, msg_args) 

3601 raise exception.ReimageException(msg % msg_args) 

3602 

3603 def _detach_root_volume(self, context, instance, root_bdm): 

3604 volume_id = root_bdm.volume_id 

3605 mp = root_bdm.device_name 

3606 old_connection_info = jsonutils.loads(root_bdm.connection_info) 

3607 try: 

3608 self.driver.detach_volume(context, old_connection_info, 

3609 instance, root_bdm.device_name) 

3610 except exception.DiskNotFound as err: 

3611 LOG.warning('Ignoring DiskNotFound exception while ' 

3612 'detaching volume %(volume_id)s from ' 

3613 '%(mp)s : %(err)s', 

3614 {'volume_id': volume_id, 'mp': mp, 

3615 'err': err}, instance=instance) 

3616 except exception.DeviceDetachFailed: 

3617 with excutils.save_and_reraise_exception(): 

3618 LOG.warning('Guest refused to detach volume %(vol)s', 

3619 {'vol': volume_id}, instance=instance) 

3620 self.volume_api.roll_detaching(context, volume_id) 

3621 except Exception: 

3622 with excutils.save_and_reraise_exception(): 

3623 LOG.exception('Failed to detach volume ' 

3624 '%(volume_id)s from %(mp)s', 

3625 {'volume_id': volume_id, 'mp': mp}, 

3626 instance=instance) 

3627 self.volume_api.roll_detaching(context, volume_id) 

3628 

3629 def _rebuild_volume_backed_instance(self, context, instance, bdms, 

3630 image_id): 

3631 # Get root bdm and attachment ID associated to it 

3632 root_bdm = compute_utils.get_root_bdm(context, instance, bdms) 

3633 old_attachment_id = root_bdm.attachment_id 

3634 

3635 # Create a new attachment and delete the previous attachment 

3636 # We create a new attachment first to keep the volume in 

3637 # reserved state after old attachment is deleted and avoid any 

3638 # races in between the attachment create and delete. 

3639 attachment_id = None 

3640 try: 

3641 attachment_id = self.volume_api.attachment_create( 

3642 context, root_bdm.volume_id, instance.uuid)['id'] 

3643 self._detach_root_volume(context, instance, root_bdm) 

3644 root_bdm.attachment_id = attachment_id 

3645 root_bdm.save() 

3646 self.volume_api.attachment_delete(context, 

3647 old_attachment_id) 

3648 except exception.InstanceNotFound: 

3649 # This means we failed to save the new attachment because 

3650 # the instance is deleted, so (try to) delete it and abort. 

3651 try: 

3652 self.volume_api.attachment_delete(context, 

3653 attachment_id) 

3654 except cinder_exception.ClientException: 

3655 LOG.error('Failed to delete new attachment %s', 

3656 attachment_id) 

3657 msg = _('Failed to rebuild volume backed instance.') 

3658 raise exception.BuildAbortException( 

3659 instance_uuid=instance.uuid, reason=msg) 

3660 except cinder_exception.ClientException: 

3661 if attachment_id: 

3662 LOG.error('Failed to delete old attachment %s', 

3663 old_attachment_id) 

3664 else: 

3665 LOG.error('Failed to create new attachment') 

3666 msg = _('Failed to rebuild volume backed instance.') 

3667 raise exception.BuildAbortException( 

3668 instance_uuid=instance.uuid, reason=msg) 

3669 events = [('volume-reimaged', root_bdm.volume_id)] 

3670 

3671 # Get the image requested for rebuild 

3672 try: 

3673 image = self.image_api.get(context, image_id) 

3674 except exception.ImageNotFound: 

3675 msg = _('Image %s not found.') % image_id 

3676 LOG.error(msg) 

3677 raise exception.BuildAbortException( 

3678 instance_uuid=instance.uuid, reason=msg) 

3679 image_size = int(math.ceil(float(image.get('size')) / units.Gi)) 

3680 deadline = CONF.reimage_timeout_per_gb * image_size 

3681 error_cb = self._reimage_failed_callback 

3682 
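        # For illustration (config value assumed, not from the source): a
        # 2.5 GiB image rounds up to image_size = 3, so with
        # reimage_timeout_per_gb = 20 the deadline used below would be
        # 60 seconds.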

3683 # Call cinder to perform reimage operation and wait until an 

3684 # external event is triggered. 

3685 try: 

3686 with self.virtapi.wait_for_instance_event(instance, events, 

3687 deadline=deadline, 

3688 error_callback=error_cb): 

3689 self.volume_api.reimage_volume( 

3690 context, root_bdm.volume_id, image_id, 

3691 reimage_reserved=True) 

3692 

3693 except Exception as ex: 

3694 LOG.error('Failed to rebuild volume backed instance: %s', 

3695 str(ex), instance=instance) 

3696 msg = _('Failed to rebuild volume backed instance.') 

3697 raise exception.BuildAbortException( 

3698 instance_uuid=instance.uuid, reason=msg) 

3699 

3700 def _rebuild_default_impl( 

3701 self, context, instance, image_meta, injected_files, 

3702 admin_password, allocations, bdms, detach_block_devices, 

3703 attach_block_devices, network_info=None, evacuate=False, 

3704 block_device_info=None, preserve_ephemeral=False, 

3705 accel_uuids=None, reimage_boot_volume=False): 

3706         if preserve_ephemeral:  3706 ↛ 3709 (line 3706 didn't jump to line 3709 because the condition on line 3706 was never true)

3707 # The default code path does not support preserving ephemeral 

3708 # partitions. 

3709 raise exception.PreserveEphemeralNotSupported() 

3710 

3711 accel_info = [] 

3712 detach_root_bdm = not reimage_boot_volume 

3713 if evacuate: 

3714             if instance.flavor.extra_specs.get('accel:device_profile'):  3714 ↛ 3715 (line 3714 didn't jump to line 3715 because the condition on line 3714 was never true)

3715 try: 

3716 accel_info = self._get_bound_arq_resources( 

3717 context, instance, accel_uuids or []) 

3718 except (Exception, eventlet.timeout.Timeout) as exc: 

3719 LOG.exception(exc) 

3720 self._build_resources_cleanup(instance, network_info) 

3721 msg = _('Failure getting accelerator resources.') 

3722 raise exception.BuildAbortException( 

3723 instance_uuid=instance.uuid, reason=msg) 

3724 detach_block_devices(context, bdms, 

3725 detach_root_bdm=detach_root_bdm) 

3726 else: 

3727 self._power_off_instance(context, instance, clean_shutdown=True) 

3728 detach_block_devices(context, bdms, 

3729 detach_root_bdm=detach_root_bdm) 

3730 if reimage_boot_volume: 

3731                 # Previously, the calls reaching here were for image-backed 

3732                 # instance rebuilds and didn't have a root bdm, so now we 

3733                 # need to handle the root bdm case as well. 

3734                 # For the root BDM, we are doing attach/detach operations 

3735                 # manually as we want to maintain a 'reserved' state 

3736                 # throughout the reimage process from the cinder side, so 

3737                 # we are excluding the root BDM from certain operations 

3738                 # here, i.e. deleting its mapping before the destroy call. 

3739 block_device_info_copy = copy.deepcopy(block_device_info) 

3740 root_bdm = compute_utils.get_root_bdm(context, instance, bdms) 

3741 mapping = block_device_info_copy["block_device_mapping"] 

3742 # drop root bdm from the mapping 

3743 mapping = [ 

3744 bdm for bdm in mapping 

3745 if bdm["volume_id"] != root_bdm.volume_id 

3746 ] 

3747 self.driver.destroy(context, instance, 

3748 network_info=network_info, 

3749 block_device_info=block_device_info_copy) 

3750 else: 

3751 self.driver.destroy(context, instance, 

3752 network_info=network_info, 

3753 block_device_info=block_device_info) 

3754 try: 

3755 accel_info = self._get_accel_info(context, instance) 

3756 except Exception as exc: 

3757 LOG.exception(exc) 

3758 self._build_resources_cleanup(instance, network_info) 

3759 msg = _('Failure getting accelerator resources.') 

3760 raise exception.BuildAbortException( 

3761 instance_uuid=instance.uuid, reason=msg) 

3762 if reimage_boot_volume: 

3763 is_volume_backed = compute_utils.is_volume_backed_instance( 

3764 context, instance, bdms) 

3765 if is_volume_backed: 

3766 self._rebuild_volume_backed_instance( 

3767 context, instance, bdms, image_meta.id) 

3768 

3769 instance.task_state = task_states.REBUILD_BLOCK_DEVICE_MAPPING 

3770 instance.save(expected_task_state=[task_states.REBUILDING]) 

3771 

3772 new_block_device_info = attach_block_devices(context, instance, bdms) 

3773 

3774 instance.task_state = task_states.REBUILD_SPAWNING 

3775 instance.save( 

3776 expected_task_state=[task_states.REBUILD_BLOCK_DEVICE_MAPPING]) 

3777 

3778 with instance.mutated_migration_context(): 

3779 self.driver.spawn(context, instance, image_meta, injected_files, 

3780 admin_password, allocations, 

3781 network_info=network_info, 

3782 block_device_info=new_block_device_info, 

3783 accel_info=accel_info) 

3784 

3785 def _notify_instance_rebuild_error(self, context, instance, error, bdms): 

3786 self._notify_about_instance_usage(context, instance, 

3787 'rebuild.error', fault=error) 

3788 compute_utils.notify_about_instance_rebuild( 

3789 context, instance, self.host, 

3790 phase=fields.NotificationPhase.ERROR, exception=error, bdms=bdms) 

3791 

3792 @messaging.expected_exceptions(exception.PreserveEphemeralNotSupported, 

3793 exception.BuildAbortException) 

3794 @wrap_exception() 

3795 @reverts_task_state 

3796 @wrap_instance_event(prefix='compute') 

3797 @wrap_instance_fault 

3798 def rebuild_instance(self, context, instance, orig_image_ref, image_ref, 

3799 injected_files, new_pass, orig_sys_metadata, 

3800 bdms, recreate, on_shared_storage, 

3801 preserve_ephemeral, migration, 

3802 scheduled_node, limits, request_spec, accel_uuids, 

3803 reimage_boot_volume=None, target_state=None): 

3804 """Destroy and re-make this instance. 

3805 

3806 A 'rebuild' effectively purges all existing data from the system and 

3807 remakes the VM with given 'metadata' and 'personalities'. 

3808 

3809 :param context: `nova.RequestContext` object 

3810 :param instance: Instance object 

3811 :param orig_image_ref: Original image_ref before rebuild 

3812 :param image_ref: New image_ref for rebuild 

3813 :param injected_files: Files to inject 

3814 :param new_pass: password to set on rebuilt instance 

3815 :param orig_sys_metadata: instance system metadata from pre-rebuild 

3816 :param bdms: block-device-mappings to use for rebuild 

3817 :param recreate: True if the instance is being evacuated (e.g. the 

3818 hypervisor it was on failed) - cleanup of old state will be 

3819 skipped. 

3820         :param on_shared_storage: True if instance files are on shared storage. 

3821 If not provided then information from the 

3822 driver will be used to decide if the instance 

3823 files are available or not on the target host 

3824 :param preserve_ephemeral: True if the default ephemeral storage 

3825 partition must be preserved on rebuild 

3826 :param migration: a Migration object if one was created for this 

3827 rebuild operation (if it's a part of evacuate) 

3828 :param scheduled_node: A node of the host chosen by the scheduler. If a 

3829 host was specified by the user, this will be 

3830 None 

3831 :param limits: Overcommit limits set by the scheduler. If a host was 

3832 specified by the user, this will be None 

3833 :param request_spec: a RequestSpec object used to schedule the instance 

3834 :param accel_uuids: a list of cyborg ARQ uuids 

3835 :param reimage_boot_volume: Boolean to specify whether the user has 

3836 explicitly requested to rebuild a boot 

3837 volume or None if RPC version is <=6.0 

3838 :param target_state: Set a target state for the evacuated instance or 

3839 None if RPC version is <=6.1. 

3840 

3841 """ 

3842 # recreate=True means the instance is being evacuated from a failed 

3843 # host to a new destination host (this host). The 'recreate' variable 

3844 # name is confusing, so rename it to evacuate here at the top, which 

3845 # is simpler than renaming a parameter in an RPC versioned method. 

3846 evacuate = recreate 

3847 context = context.elevated() 

3848 

3849 if evacuate: 

3850 LOG.info("Evacuating instance", instance=instance) 

3851 else: 

3852 LOG.info("Rebuilding instance", instance=instance) 

3853 

3854 if evacuate: 

3855 # This is an evacuation to a new host, so we need to perform a 

3856 # resource claim. 

3857 rebuild_claim = self.rt.rebuild_claim 

3858 else: 

3859 # This is a rebuild to the same host, so we don't need to make 

3860 # a claim since the instance is already on this host. 

3861 rebuild_claim = claims.NopClaim 

3862 

3863 if image_ref: 

3864 image_meta = objects.ImageMeta.from_image_ref( 

3865 context, self.image_api, image_ref) 

3866 elif evacuate: 

3867 # For evacuate the API does not send down the image_ref since the 

3868 # image does not change so just get it from what was stashed in 

3869 # the instance system_metadata when the instance was created (or 

3870 # last rebuilt). This also works for volume-backed instances. 

3871 image_meta = instance.image_meta 

3872 else: 

3873 image_meta = objects.ImageMeta() 

3874 

3875 # NOTE(mriedem): On an evacuate, we need to update 

3876         # the instance's host and node properties to reflect its 

3877 # destination node for the evacuate. 

3878 if not scheduled_node: 

3879 if evacuate: 

3880 try: 

3881 compute_node = self._get_compute_info(context, self.host) 

3882 scheduled_node = compute_node.hypervisor_hostname 

3883 except exception.ComputeHostNotFound as e: 

3884 # This means we were asked to rebuild one of our own 

3885 # instances, or another instance as a target of an 

3886 # evacuation, but we are unable to find a matching compute 

3887 # node. 

3888 LOG.exception('Failed to get compute_info for %s', 

3889 self.host) 

3890 self._set_migration_status(migration, 'failed') 

3891 self._notify_instance_rebuild_error(context, instance, e, 

3892 bdms) 

3893 raise exception.InstanceFaultRollback( 

3894 inner_exception=exception.BuildAbortException( 

3895 instance_uuid=instance.uuid, 

3896 reason=e.format_message())) 

3897 

3898 else: 

3899 scheduled_node = instance.node 

3900 

3901 allocs = self.reportclient.get_allocations_for_consumer( 

3902 context, instance.uuid) 

3903 

3904 # If the resource claim or group policy validation fails before we 

3905 # do anything to the guest or its networking/volumes we want to keep 

3906 # the current status rather than put the instance into ERROR status. 

3907 instance_state = instance.vm_state 

3908 with self._error_out_instance_on_exception( 

3909 context, instance, instance_state=instance_state): 

3910 try: 

3911 self._do_rebuild_instance_with_claim( 

3912 context, instance, orig_image_ref, 

3913 image_meta, injected_files, new_pass, orig_sys_metadata, 

3914 bdms, evacuate, on_shared_storage, preserve_ephemeral, 

3915 migration, request_spec, allocs, rebuild_claim, 

3916 scheduled_node, limits, accel_uuids, reimage_boot_volume, 

3917 target_state) 

3918 except (exception.ComputeResourcesUnavailable, 

3919 exception.RescheduledException) as e: 

3920 if isinstance(e, exception.ComputeResourcesUnavailable): 

3921 LOG.debug("Could not rebuild instance on this host, not " 

3922 "enough resources available.", instance=instance) 

3923 else: 

3924 # RescheduledException is raised by the late server group 

3925 # policy check during evacuation if a parallel scheduling 

3926 # violated the policy. 

3927 # We catch the RescheduledException here but we don't have 

3928 # the plumbing to do an actual reschedule so we abort the 

3929 # operation. 

3930 LOG.debug("Could not rebuild instance on this host, " 

3931 "late server group check failed.", 

3932 instance=instance) 

3933 # NOTE(ndipanov): We just abort the build for now and leave a 

3934 # migration record for potential cleanup later 

3935 self._set_migration_status(migration, 'failed') 

3936 # Since the claim failed, we need to remove the allocation 

3937 # created against the destination node. Note that we can only 

3938 # get here when evacuating to a destination node. Rebuilding 

3939 # on the same host (not evacuate) uses the NopClaim which will 

3940 # not raise ComputeResourcesUnavailable. 

3941 self.rt.delete_allocation_for_evacuated_instance( 

3942 context, instance, scheduled_node, node_type='destination') 

3943 self._notify_instance_rebuild_error(context, instance, e, bdms) 

3944 # Wrap this in InstanceFaultRollback so that the 

3945 # _error_out_instance_on_exception context manager keeps the 

3946 # vm_state unchanged. 

3947 raise exception.InstanceFaultRollback( 

3948 inner_exception=exception.BuildAbortException( 

3949 instance_uuid=instance.uuid, 

3950 reason=e.format_message())) 

3951 except (exception.InstanceNotFound, 

3952 exception.UnexpectedDeletingTaskStateError) as e: 

3953 LOG.debug('Instance was deleted while rebuilding', 

3954 instance=instance) 

3955 self._set_migration_status(migration, 'failed') 

3956 self._notify_instance_rebuild_error(context, instance, e, bdms) 

3957 except Exception as e: 

3958 self._set_migration_status(migration, 'failed') 

3959 if evacuate or scheduled_node is not None: 

3960 self.rt.delete_allocation_for_evacuated_instance( 

3961 context, instance, scheduled_node, 

3962 node_type='destination') 

3963 self._notify_instance_rebuild_error(context, instance, e, bdms) 

3964 raise 

3965 else: 

3966 # NOTE(gibi): Let the resource tracker set the instance 

3967 # host and drop the migration context as we need to hold the 

3968 # COMPUTE_RESOURCE_SEMAPHORE to avoid the race with 

3969 # _update_available_resources. See bug 1896463. 

3970 self.rt.finish_evacuation(instance, scheduled_node, migration) 

3971 

3972 def _do_rebuild_instance_with_claim( 

3973 self, context, instance, orig_image_ref, image_meta, 

3974 injected_files, new_pass, orig_sys_metadata, bdms, evacuate, 

3975 on_shared_storage, preserve_ephemeral, migration, request_spec, 

3976 allocations, rebuild_claim, scheduled_node, limits, accel_uuids, 

3977 reimage_boot_volume, target_state): 

3978 """Helper to avoid deep nesting in the top-level method.""" 

3979 

3980 provider_mapping = None 

3981 if evacuate: 

3982 provider_mapping = self._get_request_group_mapping(request_spec) 

3983 

3984 if provider_mapping: 

3985 compute_utils.update_pci_request_with_placement_allocations( 

3986 context, 

3987 self.reportclient, 

3988 instance.pci_requests.requests, 

3989 provider_mapping, 

3990 ) 

3991 

3992 claim_context = rebuild_claim( 

3993 context, instance, scheduled_node, allocations, 

3994 limits=limits, image_meta=image_meta, migration=migration) 

3995 

3996 with claim_context: 

3997 self._do_rebuild_instance( 

3998 context, instance, orig_image_ref, image_meta, injected_files, 

3999 new_pass, orig_sys_metadata, bdms, evacuate, on_shared_storage, 

4000 preserve_ephemeral, migration, request_spec, allocations, 

4001 provider_mapping, accel_uuids, reimage_boot_volume, 

4002 target_state) 

4003 

4004 @staticmethod 

4005 def _get_image_name(image_meta): 

4006 if image_meta.obj_attr_is_set("name"): 

4007 return image_meta.name 

4008 else: 

4009 return '' 

4010 

4011 def _do_rebuild_instance( 

4012 self, context, instance, orig_image_ref, image_meta, 

4013 injected_files, new_pass, orig_sys_metadata, bdms, evacuate, 

4014 on_shared_storage, preserve_ephemeral, migration, request_spec, 

4015 allocations, request_group_resource_providers_mapping, 

4016 accel_uuids, reimage_boot_volume, target_state): 

4017 orig_vm_state = instance.vm_state 

4018 

4019 if evacuate: 

4020 if target_state and orig_vm_state != vm_states.ERROR: 4020 ↛ 4023 (line 4020 didn't jump to line 4023 because the condition on line 4020 was never true)

4021 # This will ensure that at destination the instance will have 

4022 # the desired state. 

4023 if target_state not in vm_states.ALLOW_TARGET_STATES: 

4024 raise exception.InstanceEvacuateNotSupportedTargetState( 

4025 target_state=target_state) 

4026 orig_vm_state = target_state 

4027 

4028 if request_spec: 

4029 # NOTE(gibi): Do a late check of server group policy as 

4030 # parallel scheduling could violate such policy. This will 

4031 # cause the evacuate to fail as rebuild does not implement 

4032 # reschedule. 

4033 hints = self._get_scheduler_hints({}, request_spec) 

4034 self._validate_instance_group_policy(context, instance, hints) 

4035 

4036 if not self.driver.capabilities.get("supports_evacuate", False): 

4037 raise exception.InstanceEvacuateNotSupported 

4038 

4039 self._check_instance_exists(instance) 

4040 

4041 if on_shared_storage is None: 

4042 LOG.debug('on_shared_storage is not provided, using driver ' 

4043 'information to decide if the instance needs to ' 

4044 'be evacuated') 

4045 on_shared_storage = self.driver.instance_on_disk(instance) 

4046 

4047 elif (on_shared_storage != 

4048 self.driver.instance_on_disk(instance)): 

4049 # To cover the case where the admin expects the instance files to 

4050 # be on shared storage but they are not accessible, and vice versa 

4051 raise exception.InvalidSharedStorage( 

4052 _("Invalid state of instance files on shared" 

4053 " storage")) 

4054 

4055 if on_shared_storage: 

4056 LOG.info('disk on shared storage, evacuating using' 

4057 ' existing disk') 

4058 elif instance.image_ref: 

4059 orig_image_ref = instance.image_ref 

4060 LOG.info("disk not on shared storage, evacuating from " 

4061 "image: '%s'", str(orig_image_ref)) 

4062 else: 

4063 LOG.info('disk on volume, evacuating using existing ' 

4064 'volume') 

4065 

4066 # We check trusted certs capabilities for both evacuate (rebuild on 

4067 # another host) and rebuild (rebuild on the same host) because for 

4068 # evacuate we need to make sure an instance with trusted certs can 

4069 # have the image verified with those certs during rebuild, and for 

4070 # rebuild we could be rebuilding a server that started out with no 

4071 # trusted certs on this host, and then was rebuilt with trusted certs 

4072 # for a new image, in which case we need to validate that new image 

4073 # with the trusted certs during the rebuild. 

4074 self._check_trusted_certs(instance) 

4075 

4076 # This instance.exists message should contain the original 

4077 # image_ref, not the new one. Since the DB has been updated 

4078 # to point to the new one... we have to override it. 

4079 orig_image_ref_url = self.image_api.generate_image_url(orig_image_ref, 

4080 context) 

4081 extra_usage_info = {'image_ref_url': orig_image_ref_url} 

4082 compute_utils.notify_usage_exists( 

4083 self.notifier, context, instance, self.host, 

4084 current_period=True, system_metadata=orig_sys_metadata, 

4085 extra_usage_info=extra_usage_info) 

4086 

4087 # This message should contain the new image_ref 

4088 extra_usage_info = {'image_name': self._get_image_name(image_meta)} 

4089 self._notify_about_instance_usage(context, instance, 

4090 "rebuild.start", extra_usage_info=extra_usage_info) 

4091 # NOTE: image_name is not included in the versioned notification 

4092 # because we already provide the image_uuid in the notification 

4093 # payload and the image details can be looked up via the uuid. 

4094 compute_utils.notify_about_instance_rebuild( 

4095 context, instance, self.host, 

4096 phase=fields.NotificationPhase.START, 

4097 bdms=bdms) 

4098 

4099 instance.power_state = self._get_power_state(instance) 

4100 instance.task_state = task_states.REBUILDING 

4101 instance.save(expected_task_state=[task_states.REBUILDING]) 

4102 

4103 if evacuate: 

4104 self.network_api.setup_networks_on_host( 

4105 context, instance, self.host) 

4106 # For nova-network this is needed to move floating IPs 

4107 # For neutron this updates the host in the port binding 

4108 # TODO(cfriesen): this network_api call and the one above 

4109 # are so similar, we should really try to unify them. 

4110 self.network_api.setup_instance_network_on_host( 

4111 context, instance, self.host, migration, 

4112 provider_mappings=request_group_resource_providers_mapping) 

4113 # TODO(mriedem): Consider decorating setup_instance_network_on_host 

4114 # with @api.refresh_cache and then we wouldn't need this explicit 

4115 # call to get_instance_nw_info. 

4116 network_info = self.network_api.get_instance_nw_info(context, 

4117 instance) 

4118 else: 

4119 network_info = instance.get_network_info() 

4120 

4121 if bdms is None: 

4122 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( 

4123 context, instance.uuid) 

4124 

4125 block_device_info = \ 

4126 self._get_instance_block_device_info( 

4127 context, instance, bdms=bdms) 

4128 

4129 def detach_block_devices(context, bdms, detach_root_bdm=True): 

4130 for bdm in bdms: 

4131 # Previously, the calls made to this method by the rebuild 

4132 # instance operation were for image-backed instances, which 

4133 # assumed we only had attached volumes and no root BDM. 

4134 # Now we also need to handle the root BDM case, which we do 

4135 # manually, so we skip the attachment create/delete calls 

4136 # here. 

4137 # The detach_root_bdm parameter is only passed while 

4138 # rebuilding a volume-backed instance, so we don't have 

4139 # to worry about other callers as they won't satisfy this 

4140 # condition. 

4141 # For the evacuate case, detach_root_bdm is always True 

4142 # since the reimage_boot_volume parameter is not passed in 

4143 # this case, so this branch will not be executed. 

4144 if not detach_root_bdm and bdm.is_root: 4144 ↛ 4145 (line 4144 didn't jump to line 4145 because the condition on line 4144 was never true)

4145 continue 

4146 if bdm.is_volume: 4146 ↛ 4130 (line 4146 didn't jump to line 4130 because the condition on line 4146 was always true)

4147 # NOTE (ildikov): Having the attachment_id set in the BDM 

4148 # means that it's the new Cinder attach/detach flow 

4149 # (available from v3.44). In that case we explicitly 

4150 # attach and detach the volumes through attachment level 

4151 # operations. In this scenario _detach_volume will delete 

4152 # the existing attachment which would make the volume 

4153 # status change to 'available' if we don't pre-create 

4154 # another empty attachment before deleting the old one. 

4155 attachment_id = None 

4156 if bdm.attachment_id: 4156 ↛ 4157 (line 4156 didn't jump to line 4157 because the condition on line 4156 was never true)

4157 attachment_id = self.volume_api.attachment_create( 

4158 context, bdm['volume_id'], instance.uuid)['id'] 

4159 self._detach_volume(context, bdm, instance, 

4160 destroy_bdm=False) 

4161 if attachment_id: 4161 ↛ 4162 (line 4161 didn't jump to line 4162 because the condition on line 4161 was never true)

4162 bdm.attachment_id = attachment_id 

4163 bdm.save() 

4164 

4165 files = self._decode_files(injected_files) 

4166 

4167 kwargs = dict( 

4168 context=context, 

4169 instance=instance, 

4170 image_meta=image_meta, 

4171 injected_files=files, 

4172 admin_password=new_pass, 

4173 allocations=allocations, 

4174 bdms=bdms, 

4175 detach_block_devices=detach_block_devices, 

4176 attach_block_devices=self._prep_block_device, 

4177 block_device_info=block_device_info, 

4178 network_info=network_info, 

4179 preserve_ephemeral=preserve_ephemeral, 

4180 evacuate=evacuate, 

4181 accel_uuids=accel_uuids, 

4182 reimage_boot_volume=reimage_boot_volume) 

4183 try: 

4184 with instance.mutated_migration_context(): 

4185 self.driver.rebuild(**kwargs) 

4186 except NotImplementedError: 

4187 # NOTE(rpodolyaka): driver doesn't provide specialized version 

4188 # of rebuild, fall back to the default implementation 

4189 self._rebuild_default_impl(**kwargs) 

4190 self._update_instance_after_spawn(instance) 

4191 instance.save(expected_task_state=[task_states.REBUILD_SPAWNING]) 

4192 

4193 if orig_vm_state == vm_states.STOPPED: 

4194 LOG.info("bringing vm to original state: '%s'", 

4195 orig_vm_state, instance=instance) 

4196 instance.vm_state = vm_states.ACTIVE 

4197 instance.task_state = task_states.POWERING_OFF 

4198 instance.progress = 0 

4199 instance.save() 

4200 self.stop_instance(context, instance, False) 

4201 # TODO(melwitt): We should clean up instance console tokens here in the 

4202 # case of evacuate. The instance is on a new host and will need to 

4203 # establish a new console connection. 

4204 self._update_scheduler_instance_info(context, instance) 

4205 self._notify_about_instance_usage( 

4206 context, instance, "rebuild.end", 

4207 network_info=network_info, 

4208 extra_usage_info=extra_usage_info) 

4209 compute_utils.notify_about_instance_rebuild( 

4210 context, instance, self.host, 

4211 phase=fields.NotificationPhase.END, 

4212 bdms=bdms) 

4213 

4214 def _handle_bad_volumes_detached(self, context, instance, bad_devices, 

4215 block_device_info): 

4216 """Handle cases where the virt-layer had to detach non-working volumes 

4217 in order to complete an operation. 

4218 """ 

4219 for bdm in block_device_info['block_device_mapping']: 

4220 if bdm.get('mount_device') in bad_devices: 

4221 try: 

4222 volume_id = bdm['connection_info']['data']['volume_id'] 

4223 except KeyError: 

4224 continue 

4225 

4226 # NOTE(sirp): ideally we'd just call 

4227 # `compute_api.detach_volume` here but since that hits the 

4228 # DB directly, that's off limits from within the 

4229 # compute-manager. 

4230 # 

4231 # API-detach 

4232 LOG.info("Detaching from volume api: %s", volume_id) 

4233 self.volume_api.begin_detaching(context, volume_id) 

4234 

4235 # Manager-detach 

4236 self.detach_volume(context, volume_id, instance) 

4237 
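# ----------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of nova/compute/
# manager.py): _handle_bad_volumes_detached above maps the device paths
# reported as bad by the virt layer back to Cinder volume IDs before
# detaching them. The helper name and the sample data below are
# hypothetical; only the dict shape mirrors block_device_info.
def _volumes_for_bad_devices(block_device_info, bad_devices):
    """Return the volume IDs whose mount device was reported as bad."""
    volume_ids = []
    for bdm in block_device_info['block_device_mapping']:
        if bdm.get('mount_device') in bad_devices:
            try:
                volume_ids.append(
                    bdm['connection_info']['data']['volume_id'])
            except KeyError:
                # No volume ID recorded for this mapping: nothing to detach.
                continue
    return volume_ids

# Usage sketch (hypothetical data):
#   info = {'block_device_mapping': [
#       {'mount_device': '/dev/vdb',
#        'connection_info': {'data': {'volume_id': 'vol-1'}}}]}
#   _volumes_for_bad_devices(info, ['/dev/vdb'])  ->  ['vol-1']
# ----------------------------------------------------------------------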

4238 def _get_accel_info(self, context, instance): 

4239 dp_name = instance.flavor.extra_specs.get('accel:device_profile') 

4240 if dp_name: 

4241 cyclient = cyborg.get_client(context) 

4242 accel_info = cyclient.get_arqs_for_instance(instance.uuid) 

4243 else: 

4244 accel_info = [] 

4245 return accel_info 

4246 

4247 def _delete_dangling_bdms(self, context, instance, bdms): 

4248 """Deletes dangling or stale attachments for volume from 

4249 Nova and Cinder DB so both service DBs can be in sync. 

4250 

4251 Retrieves volume attachments from the Nova block_device_mapping 

4252 table and verifies them with the Cinder volume_attachment table. 

4253 If attachment is not present in any one of the DBs, delete 

4254 attachments from the other DB. 

4255 

4256 :param context: The nova request context. 

4257 :param instance: instance object. 

4258 :param bdms: BlockDeviceMappingList list object. 

4259 """ 

4260 

4261 try: 

4262 cinder_attachments = self.volume_api.attachment_get_all( 

4263 context, instance.uuid) 

4264 except (keystone_exception.EndpointNotFound, 

4265 cinder_exception.ClientException): 

4266 # if cinder is not deployed we never need to check for 

4267 # attachments as there cannot be dangling bdms. 

4268 # if we cannot connect to cinder we cannot check for dangling 

4269 # bdms so we skip the check. Intermittent connection issues 

4270 # to cinder should not cause instance reboot to fail. 

4271 return 

4272 

4273 # attachments present in nova DB, ones nova knows about 

4274 nova_attachments = [] 

4275 bdms_to_delete = [] 

4276 for bdm in bdms.objects: 

4277 if bdm.volume_id and bdm.attachment_id: 4277 ↛ 4276 (line 4277 didn't jump to line 4276 because the condition on line 4277 was always true)

4278 try: 

4279 self.volume_api.attachment_get(context, bdm.attachment_id) 

4280 except exception.VolumeAttachmentNotFound: 

4281 LOG.info( 

4282 f"Removing stale volume attachment " 

4283 f"'{bdm.attachment_id}' from instance for " 

4284 f"volume '{bdm.volume_id}'.", instance=instance) 

4285 bdm.destroy() 

4286 bdms_to_delete.append(bdm) 

4287 else: 

4288 nova_attachments.append(bdm.attachment_id) 

4289 

4290 cinder_attachments = [each['id'] for each in cinder_attachments] 

4291 

4292 if len(set(cinder_attachments) - set(nova_attachments)): 

4293 LOG.info( 

4294 "Removing stale volume attachments of instance from " 

4295 "Cinder", instance=instance) 

4296 for each_attach in set(cinder_attachments) - set(nova_attachments): 

4298 # delete only cinder-known attachments from the cinder DB. 

4298 LOG.debug( 

4299 f"Removing attachment '{each_attach}'", instance=instance) 

4300 self.volume_api.attachment_delete(context, each_attach) 

4301 

4302 # refresh bdms object 

4303 for bdm in bdms_to_delete: 

4304 bdms.objects.remove(bdm) 

4305 
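# ----------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of nova/compute/
# manager.py): the reconciliation performed by _delete_dangling_bdms is
# essentially a set difference between the attachment IDs Nova tracks in
# its BDMs and the attachment IDs Cinder reports for the instance. The
# helper name and the IDs below are hypothetical.
def _stale_cinder_attachments(nova_attachment_ids, cinder_attachment_ids):
    """Attachments known to Cinder but no longer tracked by Nova."""
    return set(cinder_attachment_ids) - set(nova_attachment_ids)

# Usage sketch: Cinder still holds 'att-2' while Nova only tracks 'att-1',
# so 'att-2' would be deleted from the Cinder DB (the mirror-image case, a
# BDM whose attachment is gone from Cinder, is destroyed on the Nova side
# in the loop above).
assert _stale_cinder_attachments(['att-1'], ['att-1', 'att-2']) == {'att-2'}
# ----------------------------------------------------------------------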

4306 def _get_share_info(self, context, instance, check_status=True): 

4307 share_info = objects.ShareMappingList(context) 

4308 

4309 for share_mapping in objects.ShareMappingList.get_by_instance_uuid( 

4310 context, instance.uuid 

4311 ): 

4312 share_info.objects.append(share_mapping) 

4313 

4314 if check_status: 

4315 fsm = fields.ShareMappingStatus 

4316 if ( 

4317 share_mapping.status == fsm.ATTACHING or 

4318 share_mapping.status == fsm.DETACHING 

4319 ): 

4320 # If the share status is attaching it means we are racing 

4321 # with the compute node. The mount is not completed yet or 

4322 # something really bad happened. So we set the instance in 

4323 # error state. 

4324 LOG.error( 

4325 "Share id '%s' attached to server id '%s' is " 

4326 "still in '%s' state. Setting the instance " 

4327 "in error.", 

4328 share_mapping.share_id, 

4329 instance.id, 

4330 share_mapping.status, 

4331 ) 

4332 self._set_instance_obj_error_state( 

4333 instance, clean_task_state=True 

4334 ) 

4335 raise exception.ShareErrorUnexpectedStatus( 

4336 share_id=share_mapping.share_id, 

4337 instance_uuid=instance.id, 

4338 ) 

4339 

4340 if share_mapping.status == fsm.ERROR: 

4341 LOG.warning( 

4342 "Share id '%s' attached to server id '%s' is in " 

4343 "error state.", 

4344 share_mapping.share_id, 

4345 instance.id 

4346 ) 

4347 

4348 return share_info 

4349 

4350 @wrap_exception() 

4351 @reverts_task_state 

4352 @wrap_instance_event(prefix='compute') 

4353 @wrap_instance_fault 

4354 def reboot_instance(self, context, instance, block_device_info, 

4355 reboot_type): 

4356 @utils.synchronized(instance.uuid) 

4357 def do_reboot_instance(context, instance, block_device_info, 

4358 reboot_type): 

4359 self._reboot_instance(context, instance, block_device_info, 

4360 reboot_type) 

4361 do_reboot_instance(context, instance, block_device_info, reboot_type) 

4362 

4363 def _reboot_instance(self, context, instance, block_device_info, 

4364 reboot_type): 

4365 """Reboot an instance on this host.""" 

4366 # acknowledge the request made it to the manager 

4367 if reboot_type == "SOFT": 

4368 instance.task_state = task_states.REBOOT_PENDING 

4369 expected_states = task_states.soft_reboot_states 

4370 else: 

4371 instance.task_state = task_states.REBOOT_PENDING_HARD 

4372 expected_states = task_states.hard_reboot_states 

4373 

4374 context = context.elevated() 

4375 LOG.info("Rebooting instance", instance=instance) 

4376 

4377 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( 

4378 context, instance.uuid) 

4379 

4380 self._delete_dangling_bdms(context, instance, bdms) 

4381 

4382 block_device_info = self._get_instance_block_device_info( 

4383 context, instance, bdms=bdms) 

4384 

4385 network_info = self.network_api.get_instance_nw_info(context, instance) 

4386 

4387 accel_info = self._get_accel_info(context, instance) 

4388 

4389 share_info = self._get_share_info(context, instance) 

4390 

4391 self._notify_about_instance_usage(context, instance, "reboot.start") 

4392 compute_utils.notify_about_instance_action( 

4393 context, instance, self.host, 

4394 action=fields.NotificationAction.REBOOT, 

4395 phase=fields.NotificationPhase.START, 

4396 bdms=bdms 

4397 ) 

4398 

4399 instance.power_state = self._get_power_state(instance) 

4400 instance.save(expected_task_state=expected_states) 

4401 

4402 if instance.power_state != power_state.RUNNING: 

4403 state = instance.power_state 

4404 running = power_state.RUNNING 

4405 LOG.warning('trying to reboot a non-running instance:' 

4406 ' (state: %(state)s expected: %(running)s)', 

4407 {'state': state, 'running': running}, 

4408 instance=instance) 

4409 

4410 def bad_volumes_callback(bad_devices): 

4411 self._handle_bad_volumes_detached( 

4412 context, instance, bad_devices, block_device_info) 

4413 

4414 try: 

4415 # Don't change it out of rescue mode 

4416 if instance.vm_state == vm_states.RESCUED: 4416 ↛ 4417 (line 4416 didn't jump to line 4417 because the condition on line 4416 was never true)

4417 new_vm_state = vm_states.RESCUED 

4418 else: 

4419 new_vm_state = vm_states.ACTIVE 

4420 new_power_state = None 

4421 if reboot_type == "SOFT": 

4422 instance.task_state = task_states.REBOOT_STARTED 

4423 expected_state = task_states.REBOOT_PENDING 

4424 else: 

4425 instance.task_state = task_states.REBOOT_STARTED_HARD 

4426 expected_state = task_states.REBOOT_PENDING_HARD 

4427 instance.save(expected_task_state=expected_state) 

4428 

4429 # Attempt to mount the shares again. 

4430 # Note: The API ref states that soft reboot can only be 

4431 # done if the instance is in ACTIVE state. If the instance 

4432 # is in ACTIVE state it cannot have a share_mapping in ERROR 

4433 # so it is safe to ignore the re-mounting of the share for 

4434 # soft reboot. 

4435 if reboot_type == "HARD": 

4436 self._mount_all_shares(context, instance, share_info) 

4437 

4438 self.driver.reboot(context, instance, 

4439 network_info, 

4440 reboot_type, 

4441 block_device_info=block_device_info, 

4442 accel_info=accel_info, 

4443 share_info=share_info, 

4444 bad_volumes_callback=bad_volumes_callback) 

4445 share_info.activate_all() 

4446 

4447 except Exception as error: 

4448 with excutils.save_and_reraise_exception() as ctxt: 

4449 exc_info = sys.exc_info() 

4450 # if the reboot failed but the VM is running don't 

4451 # put it into an error state 

4452 new_power_state = self._get_power_state(instance) 

4453 if new_power_state == power_state.RUNNING: 

4454 LOG.warning('Reboot failed but instance is running', 

4455 instance=instance) 

4456 compute_utils.add_instance_fault_from_exc(context, 

4457 instance, error, exc_info) 

4458 self._notify_about_instance_usage(context, instance, 

4459 'reboot.error', fault=error) 

4460 compute_utils.notify_about_instance_action( 

4461 context, instance, self.host, 

4462 action=fields.NotificationAction.REBOOT, 

4463 phase=fields.NotificationPhase.ERROR, 

4464 exception=error, bdms=bdms 

4465 ) 

4466 ctxt.reraise = False 

4467 else: 

4468 LOG.error('Cannot reboot instance: %s', error, 

4469 instance=instance) 

4470 self._set_instance_obj_error_state(instance) 

4471 

4472 if not new_power_state: 

4473 new_power_state = self._get_power_state(instance) 

4474 try: 

4475 instance.power_state = new_power_state 

4476 instance.vm_state = new_vm_state 

4477 instance.task_state = None 

4478 instance.save() 

4479 except exception.InstanceNotFound: 

4480 LOG.warning("Instance disappeared during reboot", 

4481 instance=instance) 

4482 

4483 self._notify_about_instance_usage(context, instance, "reboot.end") 

4484 compute_utils.notify_about_instance_action( 

4485 context, instance, self.host, 

4486 action=fields.NotificationAction.REBOOT, 

4487 phase=fields.NotificationPhase.END, 

4488 bdms=bdms 

4489 ) 

4490 
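# ----------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of nova/compute/
# manager.py): _reboot_instance above walks the task_state through a
# pending and a started value that depend on the reboot type. A minimal
# helper showing just that mapping (the helper name is hypothetical):
from nova.compute import task_states  # already imported at module top

def _reboot_task_state_pair(reboot_type):
    """Return the (pending, started) task states for a reboot type."""
    if reboot_type == "SOFT":
        return (task_states.REBOOT_PENDING, task_states.REBOOT_STARTED)
    return (task_states.REBOOT_PENDING_HARD, task_states.REBOOT_STARTED_HARD)

# Usage sketch: the method saves the instance with expected_task_state set
# to the pending value before switching the task_state to the started
# value, mirroring the SOFT/HARD branches above.
# ----------------------------------------------------------------------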

4491 @delete_image_on_error 

4492 def _do_snapshot_instance(self, context, image_id, instance): 

4493 self._snapshot_instance(context, image_id, instance, 

4494 task_states.IMAGE_BACKUP) 

4495 

4496 @wrap_exception() 

4497 @reverts_task_state 

4498 @wrap_instance_event(prefix='compute') 

4499 @wrap_instance_fault 

4500 def backup_instance(self, context, image_id, instance, backup_type, 

4501 rotation): 

4502 """Backup an instance on this host. 

4503 

4504 :param backup_type: daily | weekly 

4505 :param rotation: int representing how many backups to keep around 

4506 """ 

4507 self._do_snapshot_instance(context, image_id, instance) 

4508 self._rotate_backups(context, instance, backup_type, rotation) 

4509 

4510 @wrap_exception() 

4511 @reverts_task_state 

4512 @wrap_instance_event(prefix='compute') 

4513 @wrap_instance_fault 

4514 @delete_image_on_error 

4515 def snapshot_instance(self, context, image_id, instance): 

4516 """Snapshot an instance on this host. 

4517 

4518 :param context: security context 

4519 :param image_id: glance.db.sqlalchemy.models.Image.Id 

4520 :param instance: a nova.objects.instance.Instance object 

4521 """ 

4522 # NOTE(dave-mcnally) the task state will already be set by the API, 

4523 # but if the compute manager has crashed or been restarted before the 

4524 # request got here, the task state may have been cleared, so we set 

4525 # it again and things continue normally 

4526 try: 

4527 instance.task_state = task_states.IMAGE_SNAPSHOT 

4528 instance.save( 

4529 expected_task_state=task_states.IMAGE_SNAPSHOT_PENDING) 

4530 except exception.InstanceNotFound: 

4531 # possibly the instance no longer exists; no point in continuing 

4532 LOG.debug("Instance not found, could not set state %s " 

4533 "for instance.", 

4534 task_states.IMAGE_SNAPSHOT, instance=instance) 

4535 return 

4536 

4537 except exception.UnexpectedDeletingTaskStateError: 

4538 LOG.debug("Instance being deleted, snapshot cannot continue", 

4539 instance=instance) 

4540 return 

4541 

4542 with self._snapshot_semaphore: 

4543 self._snapshot_instance(context, image_id, instance, 

4544 task_states.IMAGE_SNAPSHOT) 

4545 

4546 def _snapshot_instance(self, context, image_id, instance, 

4547 expected_task_state): 

4548 context = context.elevated() 

4549 

4550 instance.power_state = self._get_power_state(instance) 

4551 try: 

4552 instance.save() 

4553 

4554 LOG.info('instance snapshotting', instance=instance) 

4555 

4556 if instance.power_state != power_state.RUNNING: 4556 ↛ 4557 (line 4556 didn't jump to line 4557 because the condition on line 4556 was never true)

4557 state = instance.power_state 

4558 running = power_state.RUNNING 

4559 LOG.warning('trying to snapshot a non-running instance: ' 

4560 '(state: %(state)s expected: %(running)s)', 

4561 {'state': state, 'running': running}, 

4562 instance=instance) 

4563 

4564 self._notify_about_instance_usage( 

4565 context, instance, "snapshot.start") 

4566 compute_utils.notify_about_instance_snapshot(context, instance, 

4567 self.host, phase=fields.NotificationPhase.START, 

4568 snapshot_image_id=image_id) 

4569 

4570 def update_task_state(task_state, 

4571 expected_state=expected_task_state): 

4572 instance.task_state = task_state 

4573 instance.save(expected_task_state=expected_state) 

4574 

4575 with timeutils.StopWatch() as timer: 

4576 self.driver.snapshot(context, instance, image_id, 

4577 update_task_state) 

4578 LOG.info('Took %0.2f seconds to snapshot the instance on ' 

4579 'the hypervisor.', timer.elapsed(), instance=instance) 

4580 

4581 instance.task_state = None 

4582 instance.save(expected_task_state=task_states.IMAGE_UPLOADING) 

4583 

4584 self._notify_about_instance_usage(context, instance, 

4585 "snapshot.end") 

4586 compute_utils.notify_about_instance_snapshot(context, instance, 

4587 self.host, phase=fields.NotificationPhase.END, 

4588 snapshot_image_id=image_id) 

4589 except (exception.InstanceNotFound, 

4590 exception.InstanceNotRunning, 

4591 exception.UnexpectedDeletingTaskStateError): 

4592 # the instance got deleted during the snapshot 

4593 # Quickly bail out of here 

4594 msg = 'Instance disappeared during snapshot' 

4595 LOG.debug(msg, instance=instance) 

4596 try: 

4597 image = self.image_api.get(context, image_id) 

4598 if image['status'] != 'active': 

4599 self.image_api.delete(context, image_id) 

4600 except exception.ImageNotFound: 

4601 LOG.debug('Image not found during clean up %s', image_id) 

4602 except Exception: 

4603 LOG.warning("Error while trying to clean up image %s", 

4604 image_id, instance=instance) 

4605 except exception.ImageNotFound: 

4606 instance.task_state = None 

4607 instance.save() 

4608 LOG.warning("Image not found during snapshot", instance=instance) 

4609 

4610 @messaging.expected_exceptions(NotImplementedError) 

4611 @wrap_exception() 

4612 def volume_snapshot_create(self, context, instance, volume_id, 

4613 create_info): 

4614 try: 

4615 self.driver.volume_snapshot_create(context, instance, volume_id, 

4616 create_info) 

4617 except exception.InstanceNotRunning: 

4618 # Libvirt driver can raise this exception 

4619 LOG.debug('Instance disappeared during volume snapshot create', 

4620 instance=instance) 

4621 

4622 @messaging.expected_exceptions(NotImplementedError) 

4623 @wrap_exception() 

4624 def volume_snapshot_delete(self, context, instance, volume_id, 

4625 snapshot_id, delete_info): 

4626 try: 

4627 self.driver.volume_snapshot_delete(context, instance, volume_id, 

4628 snapshot_id, delete_info) 

4629 except exception.InstanceNotRunning: 

4630 # Libvirt driver can raise this exception 

4631 LOG.debug('Instance disappeared during volume snapshot delete', 

4632 instance=instance) 

4633 

4634 @messaging.expected_exceptions(NotImplementedError) 

4635 @wrap_exception() 

4636 @wrap_instance_event(prefix='compute') 

4637 @wrap_instance_fault 

4638 def allow_share(self, context, instance, share_mapping): 

4639 

4640 @utils.synchronized(share_mapping.share_id) 

4641 def _allow_share(context, instance, share_mapping): 

4642 def _apply_policy(): 

4643 # self.manila_api.lock(share_mapping.share_id) 

4644 # Explicitly locking the share is not needed as 

4645 # create_access_rule() from the sdk will do it if the 

4646 # lock_visibility and lock_deletion flags are passed 

4647 self.manila_api.allow( 

4648 context, 

4649 share_mapping.share_id, 

4650 share_mapping.access_type, 

4651 share_mapping.access_to, 

4652 "rw", 

4653 ) 

4654 

4655 def _wait_policy_to_be_applied(): 

4656 # Ensure the share policy is updated, this will avoid 

4657 # a race condition mounting the share if it is not the case. 

4658 max_retries = CONF.manila.share_apply_policy_timeout 

4659 attempt_count = 0 

4660 while attempt_count < max_retries: 4660 ↛ 4680 (line 4660 didn't jump to line 4680 because the condition on line 4660 was always true)

4661 if self.manila_api.has_access( 4661 ↛ 4673 (line 4661 didn't jump to line 4673 because the condition on line 4661 was always true)

4662 context, 

4663 share_mapping.share_id, 

4664 share_mapping.access_type, 

4665 share_mapping.access_to, 

4666 ): 

4667 LOG.debug( 

4668 "Allow policy set on share %s ", 

4669 share_mapping.share_id, 

4670 ) 

4671 break 

4672 else: 

4673 LOG.debug( 

4674 "Waiting policy to be set on share %s ", 

4675 share_mapping.share_id, 

4676 ) 

4677 time.sleep(1) 

4678 attempt_count += 1 

4679 

4680 if attempt_count >= max_retries: 4680 ↛ 4681 (line 4680 didn't jump to line 4681 because the condition on line 4680 was never true)

4681 raise exception.ShareAccessGrantError( 

4682 share_id=share_mapping.share_id, 

4683 reason="Failed to set allow policy on share, " 

4684 "too many retries", 

4685 ) 

4686 

4687 try: 

4688 compute_utils.notify_about_share_attach_detach( 

4689 context, 

4690 instance, 

4691 instance.host, 

4692 action=fields.NotificationAction.SHARE_ATTACH, 

4693 phase=fields.NotificationPhase.START, 

4694 share_id=share_mapping.share_id 

4695 ) 

4696 

4697 share_mapping.set_access_according_to_protocol() 

4698 

4699 if not self.manila_api.has_access( 4699 ↛ 4709 (line 4699 didn't jump to line 4709 because the condition on line 4699 was always true)

4700 context, 

4701 share_mapping.share_id, 

4702 share_mapping.access_type, 

4703 share_mapping.access_to, 

4704 ): 

4705 _apply_policy() 

4706 _wait_policy_to_be_applied() 

4707 

4708 # Set the share from attaching to inactive 

4709 self._set_share_mapping_status( 

4710 share_mapping, fields.ShareMappingStatus.INACTIVE 

4711 ) 

4712 

4713 compute_utils.notify_about_share_attach_detach( 

4714 context, 

4715 instance, 

4716 instance.host, 

4717 action=fields.NotificationAction.SHARE_ATTACH, 

4718 phase=fields.NotificationPhase.END, 

4719 share_id=share_mapping.share_id 

4720 ) 

4721 

4722 except ( 

4723 exception.ShareNotFound, 

4724 exception.ShareProtocolNotSupported, 

4725 exception.ShareAccessGrantError, 

4726 ) as e: 

4727 self._set_share_mapping_status( 

4728 share_mapping, fields.ShareMappingStatus.ERROR 

4729 ) 

4730 compute_utils.notify_about_share_attach_detach( 

4731 context, 

4732 instance, 

4733 instance.host, 

4734 action=fields.NotificationAction.SHARE_ATTACH, 

4735 phase=fields.NotificationPhase.ERROR, 

4736 share_id=share_mapping.share_id, 

4737 exception=e 

4738 ) 

4739 LOG.error(e.format_message()) 

4740 raise 

4741 except ( 

4742 sdk_exc.BadRequestException, 

4743 ) as e: 

4744 self._set_share_mapping_status( 

4745 share_mapping, fields.ShareMappingStatus.ERROR 

4746 ) 

4747 compute_utils.notify_about_share_attach_detach( 

4748 context, 

4749 instance, 

4750 instance.host, 

4751 action=fields.NotificationAction.SHARE_ATTACH, 

4752 phase=fields.NotificationPhase.ERROR, 

4753 share_id=share_mapping.share_id, 

4754 exception=e 

4755 ) 

4756 LOG.error( 

4757 "%s: %s error from url: %s, %s", 

4758 e.message, 

4759 e.source, 

4760 e.url, 

4761 e.details, 

4762 ) 

4763 raise 

4764 except keystone_exception.http.Unauthorized as e: 

4765 self._set_share_mapping_status( 

4766 share_mapping, fields.ShareMappingStatus.ERROR 

4767 ) 

4768 compute_utils.notify_about_share_attach_detach( 

4769 context, 

4770 instance, 

4771 instance.host, 

4772 action=fields.NotificationAction.SHARE_ATTACH, 

4773 phase=fields.NotificationPhase.ERROR, 

4774 share_id=share_mapping.share_id, 

4775 exception=e 

4776 ) 

4777 LOG.error(e) 

4778 raise 

4779 

4780 _allow_share(context, instance, share_mapping) 

4781 
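# ----------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of nova/compute/
# manager.py): _wait_policy_to_be_applied above is a bounded
# poll-and-sleep loop. A generic, standalone version of the same
# pattern; the helper name and the predicate are hypothetical stand-ins.
import time

def _wait_until(predicate, max_retries, interval=1):
    """Poll predicate() up to max_retries times; True if it became true."""
    for _ in range(max_retries):
        if predicate():
            return True
        time.sleep(interval)
    return False

# Usage sketch: the loop above polls manila_api.has_access() once per
# second for CONF.manila.share_apply_policy_timeout attempts and raises
# ShareAccessGrantError if the access rule never shows up.
# ----------------------------------------------------------------------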

4782 @messaging.expected_exceptions(NotImplementedError) 

4783 @wrap_exception() 

4784 @wrap_instance_event(prefix='compute') 

4785 @wrap_instance_fault 

4786 def deny_share(self, context, instance, share_mapping): 

4787 

4788 @utils.synchronized(share_mapping.share_id) 

4789 def _deny_share(context, instance, share_mapping): 

4790 

4791 def check_share_usage(context, instance_uuid): 

4792 share_mappings_used_by_share = ( 

4793 objects.share_mapping.ShareMappingList.get_by_share_id( 

4794 context, share_mapping.share_id 

4795 ) 

4796 ) 

4797 

4798 # Logic explanation: 

4799 # 

4800 # Warning: here we have the list of share_mappings that use our 

4801 # share (whereas share_mappings is usually the list of share_mappings 

4802 # used by an instance). 

4803 # A share IS NOT used (i.e. it is detachable) if: 

4804 # - The share status is INACTIVE or ERROR on our instance. 

4805 # - The share status is DETACHING on all other instances. 

4806 # +-- reverse the logic, as the function checks whether a share 

4807 # | IS used. 

4808 # v 

4809 return not all( 

4810 ( 

4811 ( 

4812 sm.instance_uuid == instance_uuid and 

4813 ( 

4814 sm.status 

4815 in ( 

4816 fields.ShareMappingStatus.INACTIVE, 

4817 fields.ShareMappingStatus.ERROR, 

4818 ) 

4819 ) 

4820 ) or 

4821 sm.status == fields.ShareMappingStatus.DETACHING 

4822 ) 

4823 for sm in share_mappings_used_by_share 

4824 ) 

4825 

4826 try: 

4827 compute_utils.notify_about_share_attach_detach( 

4828 context, 

4829 instance, 

4830 instance.host, 

4831 action=fields.NotificationAction.SHARE_DETACH, 

4832 phase=fields.NotificationPhase.START, 

4833 share_id=share_mapping.share_id, 

4834 ) 

4835 

4836 still_used = check_share_usage(context, instance.uuid) 

4837 

4838 share_mapping.set_access_according_to_protocol() 

4839 

4840 if not still_used: 

4841 # self.manila_api.unlock(share_mapping.share_id) 

4842 # Explicitly unlocking the share is not needed as 

4843 # delete_access_rule() from the sdk will do it if the 

4844 # "unrestrict" parameter is passed 

4845 self.manila_api.deny( 

4846 context, 

4847 share_mapping.share_id, 

4848 share_mapping.access_type, 

4849 share_mapping.access_to, 

4850 ) 

4851 

4852 share_mapping.delete() 

4853 

4854 compute_utils.notify_about_share_attach_detach( 

4855 context, 

4856 instance, 

4857 instance.host, 

4858 action=fields.NotificationAction.SHARE_DETACH, 

4859 phase=fields.NotificationPhase.END, 

4860 share_id=share_mapping.share_id, 

4861 ) 

4862 

4863 except ( 

4864 exception.ShareAccessRemovalError, 

4865 exception.ShareProtocolNotSupported, 

4866 ) as e: 

4867 self._set_share_mapping_status( 

4868 share_mapping, fields.ShareMappingStatus.ERROR 

4869 ) 

4870 compute_utils.notify_about_share_attach_detach( 

4871 context, 

4872 instance, 

4873 instance.host, 

4874 action=fields.NotificationAction.SHARE_DETACH, 

4875 phase=fields.NotificationPhase.ERROR, 

4876 share_id=share_mapping.share_id, 

4877 exception=e 

4878 ) 

4879 LOG.error(e.format_message()) 

4880 raise 

4881 except keystone_exception.http.Unauthorized as e: 

4882 self._set_share_mapping_status( 

4883 share_mapping, fields.ShareMappingStatus.ERROR 

4884 ) 

4885 compute_utils.notify_about_share_attach_detach( 

4886 context, 

4887 instance, 

4888 instance.host, 

4889 action=fields.NotificationAction.SHARE_DETACH, 

4890 phase=fields.NotificationPhase.ERROR, 

4891 share_id=share_mapping.share_id, 

4892 exception=e 

4893 ) 

4894 LOG.error(e) 

4895 raise 

4896 except (exception.ShareNotFound, exception.ShareAccessNotFound): 

4897 # Ignore the error if for any reason there is nothing to 

4898 # remove from manila, so we can still detach the share. 

4899 share_mapping.delete() 

4900 compute_utils.notify_about_share_attach_detach( 

4901 context, 

4902 instance, 

4903 instance.host, 

4904 action=fields.NotificationAction.SHARE_DETACH, 

4905 phase=fields.NotificationPhase.END, 

4906 share_id=share_mapping.share_id, 

4907 ) 

4908 

4909 _deny_share(context, instance, share_mapping) 

4910 
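# ----------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of nova/compute/
# manager.py): the check_share_usage predicate above, restated over
# plain tuples so the detachability rule is easier to read. The
# lower-case strings stand in for fields.ShareMappingStatus values and
# the helper name is hypothetical.
def _share_still_used(mappings, our_instance_uuid):
    """mappings: iterable of (instance_uuid, status) rows for one share."""
    def detachable(instance_uuid, status):
        return ((instance_uuid == our_instance_uuid and
                 status in ('inactive', 'error')) or
                status == 'detaching')
    # The share is still in use unless every mapping is detachable.
    return not all(detachable(u, s) for u, s in mappings)

# Usage sketch: another instance still has the share attached (status
# 'active'), so the share is still in use and the Manila access rule must
# not be denied yet.
assert _share_still_used(
    [('our-uuid', 'inactive'), ('other-uuid', 'active')], 'our-uuid')
# ----------------------------------------------------------------------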

4911 @wrap_exception() 

4912 def _mount_all_shares(self, context, instance, share_info): 

4913 for share_mapping in share_info: 

4914 self._mount_share(context, instance, share_mapping) 

4915 

4916 @wrap_exception() 

4917 def _umount_all_shares(self, context, instance, share_info): 

4918 for share_mapping in share_info: 

4919 self._umount_share(context, instance, share_mapping) 

4920 

4921 @wrap_exception() 

4922 def _mount_share(self, context, instance, share_mapping): 

4923 

4924 @utils.synchronized(share_mapping.share_id) 

4925 def _mount_share(context, instance, share_mapping): 

4926 try: 

4927 share_mapping.set_access_according_to_protocol() 

4928 

4929 if share_mapping.share_proto == ( 

4930 fields.ShareMappingProto.CEPHFS): 

4931 share_mapping.enhance_with_ceph_credentials(context) 

4932 

4933 LOG.debug("Mounting share %s", share_mapping.share_id) 

4934 self.driver.mount_share(context, instance, share_mapping) 

4935 

4936 except ( 

4937 exception.ShareNotFound, 

4938 exception.ShareProtocolNotSupported, 

4939 exception.ShareMountError, 

4940 ) as e: 

4941 self._set_share_mapping_and_instance_in_error( 

4942 instance, share_mapping 

4943 ) 

4944 LOG.error(e.format_message()) 

4945 raise 

4946 except (sdk_exc.BadRequestException) as e: 

4947 self._set_share_mapping_and_instance_in_error( 

4948 instance, share_mapping 

4949 ) 

4950 LOG.error("%s: %s error from url: %s, %s", e.message, e.source, 

4951 e.url, e.details) 

4952 raise 

4953 

4954 _mount_share(context, instance, share_mapping) 

4955 

4956 @wrap_exception() 

4957 def _umount_share(self, context, instance, share_mapping): 

4958 

4959 @utils.synchronized(share_mapping.share_id) 

4960 def _umount_share(context, instance, share_mapping): 

4961 try: 

4962 share_mapping.set_access_according_to_protocol() 

4963 

4964 if share_mapping.share_proto == ( 

4965 fields.ShareMappingProto.CEPHFS): 

4966 share_mapping.enhance_with_ceph_credentials(context) 

4967 

4968 self.driver.umount_share(context, instance, share_mapping) 

4969 

4970 except ( 

4971 exception.ShareNotFound, 

4972 exception.ShareUmountError, 

4973 exception.ShareProtocolNotSupported, 

4974 ) as e: 

4975 self._set_share_mapping_and_instance_in_error( 

4976 instance, share_mapping 

4977 ) 

4978 LOG.error(e.format_message()) 

4979 raise 

4980 

4981 _umount_share(context, instance, share_mapping) 

4982 

4983 def _set_share_mapping_status(self, share_mapping, status): 

4984 share_mapping.status = status 

4985 share_mapping.save() 

4986 

4987 def _set_share_mapping_and_instance_in_error( 

4988 self, instance, share_mapping 

4989 ): 

4990 share_mapping.status = fields.ShareMappingStatus.ERROR 

4991 share_mapping.save() 

4992 self._set_instance_obj_error_state( 

4993 instance, clean_task_state=True 

4994 ) 

4995 

4996 @wrap_instance_fault 

4997 def _rotate_backups(self, context, instance, backup_type, rotation): 

4998 """Delete excess backups associated to an instance. 

4999 

5000 Instances are allowed a fixed number of backups (the rotation number); 

5001 this method deletes the oldest backups that exceed the rotation 

5002 threshold. 

5003 

5004 :param context: security context 

5005 :param instance: Instance dict 

5006 :param backup_type: a user-defined type, like "daily" or "weekly" etc. 

5007 :param rotation: int representing how many backups to keep around; 

5008 None if rotation shouldn't be used (as in the case of snapshots) 

5009 """ 

5010 filters = {'property-image_type': 'backup', 

5011 'property-backup_type': backup_type, 

5012 'property-instance_uuid': instance.uuid} 

5013 

5014 images = self.image_api.get_all(context, filters=filters, 

5015 sort_key='created_at', sort_dir='desc') 

5016 num_images = len(images) 

5017 LOG.debug("Found %(num_images)d images (rotation: %(rotation)d)", 

5018 {'num_images': num_images, 'rotation': rotation}, 

5019 instance=instance) 

5020 

5021 if num_images > rotation: 5021 ↛ exit (line 5021 didn't return from function '_rotate_backups' because the condition on line 5021 was always true)

5022 # NOTE(sirp): this deletes all backups that exceed the rotation 

5023 # limit 

5024 excess = len(images) - rotation 

5025 LOG.debug("Rotating out %d backups", excess, 

5026 instance=instance) 

5027 for i in range(excess): 

5028 image = images.pop() 

5029 image_id = image['id'] 

5030 LOG.debug("Deleting image %s", image_id, 

5031 instance=instance) 

5032 try: 

5033 self.image_api.delete(context, image_id) 

5034 except exception.ImageNotFound: 

5035 LOG.info("Failed to find image %(image_id)s to " 

5036 "delete", {'image_id': image_id}, 

5037 instance=instance) 

5038 except (exception.ImageDeleteConflict, Exception) as exc: 

5039 LOG.info("Failed to delete image %(image_id)s during " 

5040 "deleting excess backups. " 

5041 "Continuing for next image.. %(exc)s", 

5042 {'image_id': image_id, 'exc': exc}, 

5043 instance=instance) 

5044 
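# ----------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of nova/compute/
# manager.py): the rotation rule in _rotate_backups above, reduced to
# plain data. Images arrive sorted by created_at descending (newest
# first), so the oldest entries at the end of the list are the ones
# rotated out. The helper name and IDs are hypothetical.
def _backups_to_delete(image_ids_newest_first, rotation):
    """Return the backup IDs that exceed the rotation limit."""
    excess = len(image_ids_newest_first) - rotation
    return image_ids_newest_first[-excess:] if excess > 0 else []

# Usage sketch: keep two backups, rotate out the oldest one.
assert _backups_to_delete(['new', 'mid', 'old'], 2) == ['old']
# ----------------------------------------------------------------------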

5045 @wrap_exception() 

5046 @reverts_task_state 

5047 @wrap_instance_event(prefix='compute') 

5048 @wrap_instance_fault 

5049 def set_admin_password(self, context, instance, new_pass): 

5050 """Set the root/admin password for an instance on this host. 

5051 

5052 This is generally only called by API password resets after an 

5053 image has been built. 

5054 

5055 @param context: Nova auth context. 

5056 @param instance: Nova instance object. 

5057 @param new_pass: The admin password for the instance. 

5058 """ 

5059 

5060 context = context.elevated() 

5061 current_power_state = self._get_power_state(instance) 

5062 expected_state = power_state.RUNNING 

5063 

5064 if current_power_state != expected_state: 

5065 instance.task_state = None 

5066 instance.save(expected_task_state=task_states.UPDATING_PASSWORD) 

5067 _msg = _('instance %s is not running') % instance.uuid 

5068 raise exception.InstancePasswordSetFailed( 

5069 instance=instance.uuid, reason=_msg) 

5070 

5071 try: 

5072 self.driver.set_admin_password(instance, new_pass) 

5073 LOG.info("Admin password set", instance=instance) 

5074 instance.task_state = None 

5075 instance.save( 

5076 expected_task_state=task_states.UPDATING_PASSWORD) 

5077 except exception.InstanceAgentNotEnabled: 

5078 with excutils.save_and_reraise_exception(): 

5079 LOG.debug('Guest agent is not enabled for the instance.', 

5080 instance=instance) 

5081 instance.task_state = None 

5082 instance.save( 

5083 expected_task_state=task_states.UPDATING_PASSWORD) 

5084 except exception.SetAdminPasswdNotSupported: 

5085 with excutils.save_and_reraise_exception(): 

5086 LOG.info('set_admin_password is not supported ' 

5087 'by this driver or guest instance.', 

5088 instance=instance) 

5089 instance.task_state = None 

5090 instance.save( 

5091 expected_task_state=task_states.UPDATING_PASSWORD) 

5092 except NotImplementedError: 

5093 LOG.warning('set_admin_password is not implemented ' 

5094 'by this driver or guest instance.', 

5095 instance=instance) 

5096 instance.task_state = None 

5097 instance.save( 

5098 expected_task_state=task_states.UPDATING_PASSWORD) 

5099 raise NotImplementedError(_('set_admin_password is not ' 

5100 'implemented by this driver or guest ' 

5101 'instance.')) 

5102 except exception.UnexpectedTaskStateError: 

5103 # interrupted by another (most likely delete) task 

5104 # do not retry 

5105 raise 

5106 except Exception: 

5107 # Catch all here because this could be anything. 

5108 LOG.exception('set_admin_password failed', instance=instance) 

5109 # We create a new exception here so that we won't 

5110 # potentially reveal password information to the 

5111 # API caller. The real exception is logged above 

5112 _msg = _('error setting admin password') 

5113 raise exception.InstancePasswordSetFailed( 

5114 instance=instance.uuid, reason=_msg) 

5115 

5116 def _get_rescue_image(self, context, instance, rescue_image_ref=None): 

5117 """Determine what image should be used to boot the rescue VM.""" 

5118 # 1. If rescue_image_ref is passed in, use that for rescue. 

5119 # 2. Else, use the base image associated with instance's current image. 

5120 # The idea here is to provide the customer with a rescue 

5121 # environment which they are familiar with. 

5122 # So, if they built their instance off of a Debian image, 

5123 # their rescue VM will also be Debian. 

5124 # 3. As a last resort, use instance's current image. 

5125 if not rescue_image_ref: 

5126 system_meta = utils.instance_sys_meta(instance) 

5127 rescue_image_ref = system_meta.get('image_base_image_ref') 

5128 

5129 if not rescue_image_ref: 

5130 LOG.warning('Unable to find a different image to use for ' 

5131 'rescue VM, using instance\'s current image', 

5132 instance=instance) 

5133 rescue_image_ref = instance.image_ref 

5134 

5135 return objects.ImageMeta.from_image_ref( 

5136 context, self.image_api, rescue_image_ref) 

5137 
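# ----------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of nova/compute/
# manager.py): _get_rescue_image above is a three-step fallback for
# choosing the rescue image. With plain image refs instead of ImageMeta
# objects (the helper name and refs are hypothetical):
def _pick_rescue_image_ref(requested_ref, base_image_ref, current_image_ref):
    """1. explicit rescue ref, 2. stashed base image, 3. current image."""
    return requested_ref or base_image_ref or current_image_ref

# Usage sketch: no explicit rescue image and no stashed
# image_base_image_ref in system metadata, so the instance's own
# image_ref is used (with a warning, as in the method above).
assert _pick_rescue_image_ref(None, None, 'uuid-current') == 'uuid-current'
# ----------------------------------------------------------------------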

5138 @wrap_exception() 

5139 @reverts_task_state 

5140 @wrap_instance_event(prefix='compute') 

5141 @wrap_instance_fault 

5142 def rescue_instance(self, context, instance, rescue_password, 

5143 rescue_image_ref, clean_shutdown): 

5144 context = context.elevated() 

5145 LOG.info('Rescuing', instance=instance) 

5146 

5147 admin_password = (rescue_password if rescue_password else 

5148 utils.generate_password()) 

5149 

5150 network_info = self.network_api.get_instance_nw_info(context, instance) 

5151 

5152 rescue_image_meta = self._get_rescue_image(context, instance, 

5153 rescue_image_ref) 

5154 

5155 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( 

5156 context, instance.uuid) 

5157 block_device_info = self._get_instance_block_device_info( 

5158 context, instance, bdms=bdms) 

5159 

5160 share_info = self._get_share_info(context, instance) 

5161 

5162 extra_usage_info = {'rescue_image_name': 

5163 self._get_image_name(rescue_image_meta)} 

5164 self._notify_about_instance_usage(context, instance, 

5165 "rescue.start", extra_usage_info=extra_usage_info, 

5166 network_info=network_info) 

5167 compute_utils.notify_about_instance_rescue_action( 

5168 context, instance, self.host, rescue_image_ref, 

5169 phase=fields.NotificationPhase.START) 

5170 

5171 try: 

5172 self._power_off_instance(context, instance, clean_shutdown) 

5173 

5174 self._mount_all_shares(context, instance, share_info) 

5175 

5176 self.driver.rescue(context, instance, network_info, 

5177 rescue_image_meta, admin_password, 

5178 block_device_info, share_info) 

5179 except Exception as e: 

5180 LOG.exception("Error trying to Rescue Instance", 

5181 instance=instance) 

5182 self._set_instance_obj_error_state(instance) 

5183 raise exception.InstanceNotRescuable( 

5184 instance_id=instance.uuid, 

5185 reason=_("Driver Error: %s") % e) 

5186 

5187 compute_utils.notify_usage_exists(self.notifier, context, instance, 

5188 self.host, current_period=True) 

5189 

5190 instance.vm_state = vm_states.RESCUED 

5191 instance.task_state = None 

5192 instance.power_state = self._get_power_state(instance) 

5193 instance.launched_at = timeutils.utcnow() 

5194 instance.save(expected_task_state=task_states.RESCUING) 

5195 

5196 self._notify_about_instance_usage(context, instance, 

5197 "rescue.end", extra_usage_info=extra_usage_info, 

5198 network_info=network_info) 

5199 compute_utils.notify_about_instance_rescue_action( 

5200 context, instance, self.host, rescue_image_ref, 

5201 phase=fields.NotificationPhase.END) 

5202 

5203 @wrap_exception() 

5204 @reverts_task_state 

5205 @wrap_instance_event(prefix='compute') 

5206 @wrap_instance_fault 

5207 def unrescue_instance(self, context, instance): 

5208 orig_context = context 

5209 context = context.elevated() 

5210 LOG.info('Unrescuing', instance=instance) 

5211 

5212 network_info = self.network_api.get_instance_nw_info(context, instance) 

5213 self._notify_about_instance_usage(context, instance, 

5214 "unrescue.start", network_info=network_info) 

5215 compute_utils.notify_about_instance_action(context, instance, 

5216 self.host, action=fields.NotificationAction.UNRESCUE, 

5217 phase=fields.NotificationPhase.START) 

5218 

5219 with self._error_out_instance_on_exception(context, instance): 

5220 self.driver.unrescue(orig_context, instance) 

5221 

5222 instance.vm_state = vm_states.ACTIVE 

5223 instance.task_state = None 

5224 instance.power_state = self._get_power_state(instance) 

5225 instance.save(expected_task_state=task_states.UNRESCUING) 

5226 

5227 self._notify_about_instance_usage(context, 

5228 instance, 

5229 "unrescue.end", 

5230 network_info=network_info) 

5231 compute_utils.notify_about_instance_action(context, instance, 

5232 self.host, action=fields.NotificationAction.UNRESCUE, 

5233 phase=fields.NotificationPhase.END) 

5234 

5235 @wrap_exception() 

5236 @wrap_instance_event(prefix='compute') 

5237 @errors_out_migration 

5238 @wrap_instance_fault 

5239 def confirm_resize(self, context, instance, migration): 

5240 """Confirms a migration/resize and deletes the 'old' instance. 

5241 

5242 This is called from the API and runs on the source host. 

5243 

5244 Nothing needs to happen on the destination host at this point since 

5245 the instance is already running there. This routine just cleans up the 

5246 source host. 

5247 """ 

5248 @utils.synchronized(instance.uuid) 

5249 def do_confirm_resize(context, instance, migration): 

5250 LOG.debug("Going to confirm migration %s", migration.id, 

5251 instance=instance) 

5252 

5253 if migration.status == 'confirmed': 

5254 LOG.info("Migration %s is already confirmed", 

5255 migration.id, instance=instance) 

5256 return 

5257 

5258 if migration.status not in ('finished', 'confirming'): 

5259 LOG.warning("Unexpected confirmation status '%(status)s' " 

5260 "of migration %(id)s, exit confirmation process", 

5261 {"status": migration.status, "id": migration.id}, 

5262 instance=instance) 

5263 return 

5264 

5265 # NOTE(wangpan): Get the instance from the DB; if it has been 

5266 # deleted, we do nothing and return here 

5267 expected_attrs = ['metadata', 'system_metadata', 'flavor'] 

5268 try: 

5269 instance = objects.Instance.get_by_uuid( 

5270 context, instance.uuid, 

5271 expected_attrs=expected_attrs) 

5272 except exception.InstanceNotFound: 

5273 LOG.info("Instance is not found during confirmation", 

5274 instance=instance) 

5275 return 

5276 

5277 with self._error_out_instance_on_exception(context, instance): 

5278 try: 

5279 self._confirm_resize( 

5280 context, instance, migration=migration) 

5281 except Exception: 

5282 # Something failed when cleaning up the source host so 

5283 # log a traceback and leave a hint about hard rebooting 

5284 # the server to correct its state in the DB. 

5285 with excutils.save_and_reraise_exception(logger=LOG): 

5286 LOG.exception( 

5287 'Confirm resize failed on source host %s. ' 

5288 'Resource allocations in the placement service ' 

5289 'will be removed regardless because the instance ' 

5290 'is now on the destination host %s. You can try ' 

5291 'hard rebooting the instance to correct its ' 

5292 'state.', self.host, migration.dest_compute, 

5293 instance=instance) 

5294 finally: 

5295 # Whether an error occurred or not, at this point the 

5296 # instance is on the dest host. Avoid leaking allocations 

5297 # in placement by deleting them here... 

5298 self._delete_allocation_after_move( 

5299 context, instance, migration) 

5300 # ...inform the scheduler about the move... 

5301 self._delete_scheduler_instance_info( 

5302 context, instance.uuid) 

5303 # ...and unset the cached flavor information (this is done 

5304 # last since the resource tracker relies on it for its 

5305 # periodic tasks) 

5306 self._delete_stashed_flavor_info(instance) 

5307 

5308 do_confirm_resize(context, instance, migration) 

5309 

5310 def _get_updated_nw_info_with_pci_mapping(self, nw_info, pci_mapping): 

5311 # NOTE(adrianc): This method returns a copy of nw_info if modifications 

5312 # are made; otherwise it returns the original nw_info. 

5313 updated_nw_info = nw_info 

5314 if nw_info and pci_mapping: 

5315 updated_nw_info = copy.deepcopy(nw_info) 

5316 for vif in updated_nw_info: 

5317 if vif['vnic_type'] in network_model.VNIC_TYPES_SRIOV: 

5318 try: 

5319 vif_pci_addr = vif['profile']['pci_slot'] 

5320 new_addr = pci_mapping[vif_pci_addr].address 

5321 vif['profile']['pci_slot'] = new_addr 

5322 LOG.debug("Updating VIF's PCI address for VIF %(id)s. " 

5323 "Original value %(orig_val)s, " 

5324 "new value %(new_val)s", 

5325 {'id': vif['id'], 

5326 'orig_val': vif_pci_addr, 

5327 'new_val': new_addr}) 

5328 except (KeyError, AttributeError): 

5329 with excutils.save_and_reraise_exception(): 

5330 # NOTE(adrianc): This should never happen. If we 

5331 # get here it means there is some inconsistency 

5332 # with either 'nw_info' or 'pci_mapping'. 

5333 LOG.error("Unexpected error when updating network " 

5334 "information with PCI mapping.") 

5335 return updated_nw_info 

5336 
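# ----------------------------------------------------------------------
# Illustrative sketch (editor's addition, not part of nova/compute/
# manager.py): the VIF rewrite in _get_updated_nw_info_with_pci_mapping
# above, applied to plain dicts. Only SR-IOV style VIFs carrying a
# 'pci_slot' in their profile are rewritten. The helper name, the vnic
# type filter and the PCI addresses are hypothetical; in the real method
# pci_mapping values are PciDevice objects and the new slot comes from
# their .address attribute.
import copy

def _remap_pci_slots(vifs, pci_mapping, sriov_vnic_types=('direct',)):
    """Return a copy of vifs with pci_slot rewritten through pci_mapping."""
    updated = copy.deepcopy(vifs)
    for vif in updated:
        if vif.get('vnic_type') in sriov_vnic_types:
            old_addr = vif['profile']['pci_slot']
            vif['profile']['pci_slot'] = pci_mapping[old_addr]
    return updated

# Usage sketch: in _confirm_resize this restores the source-host PCI
# address in the VIF profile so the virt driver can unplug the right
# device, as the NOTE(adrianc) comment below describes.
vifs = [{'vnic_type': 'direct', 'profile': {'pci_slot': '0000:03:00.1'}}]
assert _remap_pci_slots(vifs, {'0000:03:00.1': '0000:81:00.1'})[0][
    'profile']['pci_slot'] == '0000:81:00.1'
# ----------------------------------------------------------------------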

5337 def _confirm_resize(self, context, instance, migration=None): 

5338 """Destroys the source instance.""" 

5339 self._notify_about_instance_usage(context, instance, 

5340 "resize.confirm.start") 

5341 compute_utils.notify_about_instance_action(context, instance, 

5342 self.host, action=fields.NotificationAction.RESIZE_CONFIRM, 

5343 phase=fields.NotificationPhase.START) 

5344 

5345 # NOTE(tr3buchet): tear down networks on source host 

5346 self.network_api.setup_networks_on_host(context, instance, 

5347 migration.source_compute, teardown=True) 

5348 

5349 # TODO(stephenfin): These next three calls should be bundled 

5350 network_info = self.network_api.get_instance_nw_info(context, 

5351 instance) 

5352 

5353 # NOTE(adrianc): Populate old PCI device in VIF profile 

5354 # to allow virt driver to properly unplug it from Hypervisor. 

5355 pci_mapping = (instance.migration_context. 

5356 get_pci_mapping_for_migration(True)) 

5357 network_info = self._get_updated_nw_info_with_pci_mapping( 

5358 network_info, pci_mapping) 

5359 

5360 self.driver.confirm_migration(context, migration, instance, 

5361 network_info) 

5362 

5363 # Free up the old_flavor usage from the resource tracker for this host. 

5364 self.rt.drop_move_claim_at_source(context, instance, migration) 

5365 

5366 # NOTE(mriedem): The old_vm_state could be STOPPED but the user 

5367 # might have manually powered up the instance to confirm the 

5368 # resize/migrate, so we need to check the current power state 

5369 # on the instance and set the vm_state appropriately. We default 

5370 # to ACTIVE because if the power state is not SHUTDOWN, we 

5371 # assume _sync_instance_power_state will clean it up. 

5372 p_state = instance.power_state 

5373 vm_state = None 

5374 if p_state == power_state.SHUTDOWN: 

5375 vm_state = vm_states.STOPPED 

5376 LOG.debug("Resized/migrated instance is powered off. " 

5377 "Setting vm_state to '%s'.", vm_state, 

5378 instance=instance) 

5379 else: 

5380 vm_state = vm_states.ACTIVE 

5381 

5382 instance.vm_state = vm_state 

5383 instance.task_state = None 

5384 instance.save(expected_task_state=[None, task_states.DELETING, 

5385 task_states.SOFT_DELETING]) 

5386 

5387 self._notify_about_instance_usage( 

5388 context, instance, "resize.confirm.end", 

5389 network_info=network_info) 

5390 compute_utils.notify_about_instance_action(context, instance, 

5391 self.host, action=fields.NotificationAction.RESIZE_CONFIRM, 

5392 phase=fields.NotificationPhase.END) 

5393 

5394 def _delete_allocation_after_move(self, context, instance, migration): 

5395 """Deletes resource allocations held by the migration record against 

5396 the source compute node resource provider after a confirmed cold / 

5397 successful live migration. 

5398 """ 

5399 try: 

5400 # NOTE(danms): We're finishing on the source node, so try 

5401 # to delete the allocation based on the migration uuid 

5402 self.reportclient.delete_allocation_for_instance( 

5403 context, migration.uuid, consumer_type='migration', 

5404 force=False) 

5405 except exception.AllocationDeleteFailed: 

5406 LOG.error('Deleting allocation in placement for migration ' 

5407 '%(migration_uuid)s failed. The instance ' 

5408 '%(instance_uuid)s will be put to ERROR state ' 

5409 'but the allocation held by the migration is ' 

5410 'leaked.', 

5411 {'instance_uuid': instance.uuid, 

5412 'migration_uuid': migration.uuid}) 

5413 raise 
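A brief illustration of what the call above removes (consumer UUIDs, resource providers and amounts are assumed for the example): after a confirmed move the migration record is its own consumer in placement, holding the source-node usage, while the instance consumer holds the destination-node usage, roughly

    allocations['<migration.uuid>'] == {'<source node RP>': {'VCPU': 2, 'MEMORY_MB': 2048}}
    allocations['<instance.uuid>']  == {'<dest node RP>':   {'VCPU': 4, 'MEMORY_MB': 4096}}

delete_allocation_for_instance(context, migration.uuid, ...) clears only the first entry, leaving the instance's allocations on the destination node untouched.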

5414 

5415 def _delete_stashed_flavor_info(self, instance): 

5416 """Remove information about the flavor change after a resize.""" 

5417 instance.old_flavor = None 

5418 instance.new_flavor = None 

5419 instance.system_metadata.pop('old_vm_state', None) 

5420 instance.save() 

5421 

5422 @wrap_exception() 

5423 @wrap_instance_event(prefix='compute') 

5424 @errors_out_migration 

5425 @wrap_instance_fault 

5426 def confirm_snapshot_based_resize_at_source( 

5427 self, ctxt, instance, migration): 

5428 """Confirms a snapshot-based resize on the source host. 

5429 

5430 Cleans the guest from the source hypervisor including disks and drops 

5431 the MoveClaim which will free up "old_flavor" usage from the 

5432 ResourceTracker. 

5433 

5434 Deletes the allocations held by the migration consumer against the 

5435 source compute node resource provider. 

5436 

5437 :param ctxt: nova auth request context targeted at the source cell 

5438 :param instance: Instance object being resized which should have the 

5439 "old_flavor" attribute set 

5440 :param migration: Migration object for the resize operation 

5441 """ 

5442 

5443 @utils.synchronized(instance.uuid) 

5444 def do_confirm(): 

5445 LOG.info('Confirming resize on source host.', instance=instance) 

5446 with self._error_out_instance_on_exception(ctxt, instance): 

5447 # TODO(mriedem): Could probably make this try/except/finally 

5448 # a context manager to share with confirm_resize(). 

5449 try: 

5450 self._confirm_snapshot_based_resize_at_source( 

5451 ctxt, instance, migration) 

5452 except Exception: 

5453 # Something failed when cleaning up the source host so 

5454 # log a traceback and leave a hint about hard rebooting 

5455 # the server to correct its state in the DB. 

5456 with excutils.save_and_reraise_exception(logger=LOG): 

5457 LOG.exception( 

5458 'Confirm resize failed on source host %s. ' 

5459 'Resource allocations in the placement service ' 

5460 'will be removed regardless because the instance ' 

5461 'is now on the destination host %s. You can try ' 

5462 'hard rebooting the instance to correct its ' 

5463 'state.', self.host, migration.dest_compute, 

5464 instance=instance) 

5465 finally: 

5466 # Whether an error occurred or not, at this point the 

5467 # instance is on the dest host so to avoid leaking 

5468 # allocations in placement, delete them here. 

5469 # TODO(mriedem): Should we catch and just log 

5470 # AllocationDeleteFailed? What is the user's recourse if 

5471 # we got this far but this fails? At this point the 

5472 # instance is on the target host and the allocations 

5473 # could just be manually cleaned up by the operator. 

5474 self._delete_allocation_after_move(ctxt, instance, 

5475 migration) 

5476 do_confirm() 
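The pattern above (define an inner function, decorate it with utils.synchronized(instance.uuid), then call it immediately) serializes concurrent confirm/revert RPCs for the same instance. A rough, self-contained sketch of the idea using a plain threading lock registry; the helper name and registry below are illustrative assumptions, not nova code:

    import collections
    import functools
    import threading

    _locks = collections.defaultdict(threading.Lock)  # one lock per lock name

    def synchronized_sketch(name):
        """Serialize every caller that shares the same lock name."""
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                with _locks[name]:
                    return func(*args, **kwargs)
            return wrapper
        return decorator

    # Usage mirroring the method above: the lock name is the instance uuid,
    # so two RPCs for the same instance run do_confirm() one after the other.
    @synchronized_sketch('6f2a4b1c-example-uuid')
    def do_confirm():
        pass  # critical section

    do_confirm()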

5477 

5478 def _confirm_snapshot_based_resize_at_source( 

5479 self, ctxt, instance, migration): 

5480 """Private version of confirm_snapshot_based_resize_at_source 

5481 

5482 This allows the main method to be decorated with error handlers. 

5483 

5484 :param ctxt: nova auth request context targeted at the source cell 

5485 :param instance: Instance object being resized which should have the 

5486 "old_flavor" attribute set 

5487 :param migration: Migration object for the resize operation 

5488 """ 

5489 # Cleanup the guest from the hypervisor including local disks. 

5490 network_info = self.network_api.get_instance_nw_info(ctxt, instance) 

5491 LOG.debug('Cleaning up guest from source hypervisor including disks.', 

5492 instance=instance) 

5493 

5494 # FIXME(mriedem): Per bug 1809095, _confirm_resize calls 

5495 # _get_updated_nw_info_with_pci_mapping here prior to unplugging 

5496 # VIFs on the source, but in our case we have already unplugged 

5497 # VIFs during prep_snapshot_based_resize_at_source, so what do we 

5498 # need to do about those kinds of ports? Do we need to wait to unplug 

5499 # VIFs until confirm like normal resize? 

5500 

5501 # Note that prep_snapshot_based_resize_at_source already destroyed the 

5502 # guest which disconnected volumes and unplugged VIFs but did not 

5503 # destroy disks in case something failed during the resize and the 

5504 # instance needed to be rebooted or rebuilt on the source host. Now 

5505 # that we are confirming the resize we want to cleanup the disks left 

5506 # on the source host. We call cleanup() instead of destroy() to avoid 

5507 # any InstanceNotFound confusion from the driver since the guest was 

5508 # already destroyed on this host. block_device_info=None and 

5509 # destroy_vifs=False means cleanup() will not try to disconnect volumes 

5510 # or unplug VIFs. 

5511 self.driver.cleanup( 

5512 ctxt, instance, network_info, block_device_info=None, 

5513 destroy_disks=True, destroy_vifs=False) 

5514 

5515 # Delete port bindings for the source host. 

5516 self._confirm_snapshot_based_resize_delete_port_bindings( 

5517 ctxt, instance) 

5518 

5519 # Delete volume attachments for the source host. 

5520 self._delete_volume_attachments(ctxt, instance.get_bdms()) 

5521 

5522 # Free up the old_flavor usage from the resource tracker for this host. 

5523 self.rt.drop_move_claim_at_source(ctxt, instance, migration) 

5524 

5525 def _confirm_snapshot_based_resize_delete_port_bindings( 

5526 self, ctxt, instance): 

5527 """Delete port bindings for the source host when confirming 

5528 snapshot-based resize on the source host. 

5529 

5530 :param ctxt: nova auth RequestContext 

5531 :param instance: Instance object that was resized/cold migrated 

5532 """ 

5533 LOG.debug('Deleting port bindings for source host.', 

5534 instance=instance) 

5535 try: 

5536 self.network_api.cleanup_instance_network_on_host( 

5537 ctxt, instance, self.host) 

5538 except exception.PortBindingDeletionFailed as e: 

5539 # Do not let this stop us from cleaning up since the guest 

5540 # is already gone. 

5541 LOG.error('Failed to delete port bindings from source host. ' 

5542 'Error: %s', str(e), instance=instance) 

5543 

5544 def _delete_volume_attachments(self, ctxt, bdms): 

5545 """Deletes volume attachment records for the given bdms. 

5546 

5547 This method will log but not re-raise any exceptions if the volume 

5548 attachment delete fails. 

5549 

5550 :param ctxt: nova auth request context used to make 

5551 DELETE /attachments/{attachment_id} requests to cinder. 

5552 :param bdms: objects.BlockDeviceMappingList representing volume 

5553 attachments to delete based on BlockDeviceMapping.attachment_id. 

5554 """ 

5555 for bdm in bdms: 

5556 if bdm.attachment_id: 

5557 try: 

5558 self.volume_api.attachment_delete(ctxt, bdm.attachment_id) 

5559 except Exception as e: 

5560 LOG.error('Failed to delete volume attachment with ID %s. ' 

5561 'Error: %s', bdm.attachment_id, str(e), 

5562 instance_uuid=bdm.instance_uuid) 

5563 

5564 def _update_bdm_for_swap_to_finish_resize( 

5565 self, context, instance, confirm=True): 

5566 """This updates bdm.swap with new swap info""" 

5567 

5568 bdms = instance.get_bdms() 

5569 if not (instance.old_flavor and instance.new_flavor): 

5570 return bdms 

5571 

5572 if instance.old_flavor.swap == instance.new_flavor.swap: 

5573 return bdms 

5574 

5575 old_swap = instance.old_flavor.swap 

5576 new_swap = instance.new_flavor.swap 

5577 if not confirm: 5577 ↛ 5579 (line 5577 didn't jump to line 5579 because the condition on line 5577 was never true) 

5578 # revert flavor on _finish_revert_resize 

5579 old_swap = instance.new_flavor.swap 

5580 new_swap = instance.old_flavor.swap 

5581 

5582 # add swap 

5583 if old_swap == 0 and new_swap: 

5584 # (auniyal) old_swap = 0 means we did not have a swap bdm 

5585 # for this instance. 

5586 # and as there is a new_swap, it's a swap addition. 

5587 new_swap_bdm = block_device.create_blank_bdm(new_swap, 'swap') 

5588 bdm_obj = objects.BlockDeviceMapping( 

5589 context, instance_uuid=instance.uuid, **new_swap_bdm) 

5590 bdm_obj.update_or_create() 

5591 return instance.get_bdms() 

5592 

5593 # update swap 

5594 for bdm in bdms: 5594 ↛ 5606 (line 5594 didn't jump to line 5606 because the loop on line 5594 didn't complete) 

5595 if bdm.guest_format == 'swap' and bdm.device_type == 'disk': 5595 ↛ 5594 (line 5595 didn't jump to line 5594 because the condition on line 5595 was always true) 

5596 if new_swap > 0: 

5597 LOG.info('Updating swap BDM.', instance=instance) 

5598 bdm.volume_size = new_swap 

5599 bdm.save() 

5600 break 

5601 elif new_swap == 0: 5601 ↛ 5594 (line 5601 didn't jump to line 5594 because the condition on line 5601 was always true) 

5602 LOG.info('Deleting swap BDM.', instance=instance) 

5603 bdm.destroy() 

5604 bdms.objects.remove(bdm) 

5605 break 

5606 return bdms 
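To make the branches above concrete, here is how the swap BDM is reconciled for a few assumed flavor values (confirm path; when confirm=False the old/new values are swapped first, so the same logic runs in the revert direction):

    old_flavor.swap   new_flavor.swap   result
    512               512               no change (early return, sizes are equal)
    0                 1024              a blank swap BDM of 1024 MB is created
    512               1024              the existing swap BDM is resized to 1024 MB
    512               0                 the existing swap BDM is destroyed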

5607 

5608 @wrap_exception() 

5609 @reverts_task_state 

5610 @wrap_instance_event(prefix='compute') 

5611 @errors_out_migration 

5612 @wrap_instance_fault 

5613 def revert_snapshot_based_resize_at_dest(self, ctxt, instance, migration): 

5614 """Reverts a snapshot-based resize at the destination host. 

5615 

5616 Cleans the guest from the destination compute service host hypervisor 

5617 and related resources (ports, volumes) and frees resource usage from 

5618 the compute service on that host. 

5619 

5620 :param ctxt: nova auth request context targeted at the target cell 

5621 :param instance: Instance object whose vm_state is "resized" and 

5622 task_state is "resize_reverting". 

5623 :param migration: Migration object whose status is "reverting". 

5624 """ 

5625 # A resize revert is essentially a resize back to the old size, so we 

5626 # need to send a usage event here. 

5627 compute_utils.notify_usage_exists( 

5628 self.notifier, ctxt, instance, self.host, current_period=True) 

5629 

5630 @utils.synchronized(instance.uuid) 

5631 def do_revert(): 

5632 LOG.info('Reverting resize on destination host.', 

5633 instance=instance) 

5634 with self._error_out_instance_on_exception(ctxt, instance): 

5635 self._revert_snapshot_based_resize_at_dest( 

5636 ctxt, instance, migration) 

5637 do_revert() 

5638 

5639 # Broadcast to all schedulers that the instance is no longer on 

5640 # this host and clear any waiting callback events. This is best effort 

5641 # so if anything fails just log it. 

5642 try: 

5643 self._delete_scheduler_instance_info(ctxt, instance.uuid) 

5644 self.instance_events.clear_events_for_instance(instance) 

5645 except Exception as e: 

5646 LOG.warning('revert_snapshot_based_resize_at_dest failed during ' 

5647 'post-processing. Error: %s', e, instance=instance) 

5648 

5649 def _revert_snapshot_based_resize_at_dest( 

5650 self, ctxt, instance, migration): 

5651 """Private version of revert_snapshot_based_resize_at_dest. 

5652 

5653 This allows the main method to be decorated with error handlers. 

5654 

5655 :param ctxt: nova auth request context targeted at the target cell 

5656 :param instance: Instance object whose vm_state is "resized" and 

5657 task_state is "resize_reverting". 

5658 :param migration: Migration object whose status is "reverting". 

5659 """ 

5660 # Cleanup the guest from the hypervisor including local disks. 

5661 network_info = self.network_api.get_instance_nw_info(ctxt, instance) 

5662 bdms = instance.get_bdms() 

5663 block_device_info = self._get_instance_block_device_info( 

5664 ctxt, instance, bdms=bdms) 

5665 LOG.debug('Destroying guest from destination hypervisor including ' 

5666 'disks.', instance=instance) 

5667 self.driver.destroy( 

5668 ctxt, instance, network_info, block_device_info=block_device_info) 

5669 

5670 # Activate source host port bindings. We need to do this before 

5671 # deleting the (active) dest host port bindings in 

5672 # setup_networks_on_host otherwise the ports will be unbound and 

5673 # finish on the source will fail. 

5674 # migrate_instance_start uses migration.dest_compute for the port 

5675 # binding host and since we want to activate the source host port 

5676 # bindings, we need to temporarily mutate the migration object. 

5677 with utils.temporary_mutation( 

5678 migration, dest_compute=migration.source_compute): 

5679 LOG.debug('Activating port bindings for source host %s.', 

5680 migration.source_compute, instance=instance) 

5681 # TODO(mriedem): https://review.opendev.org/#/c/594139/ would allow 

5682 # us to remove this and make setup_networks_on_host do it. 

5683 # TODO(mriedem): Should we try/except/log any errors but continue? 

5684 self.network_api.migrate_instance_start( 

5685 ctxt, instance, migration) 
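utils.temporary_mutation, used in the with block above (and again in the finish-revert paths below), overrides attributes on an object for the duration of the block and restores them on exit, even if an exception is raised. A minimal sketch of that behaviour; the helper and class names are illustrative assumptions, not nova's implementation:

    import contextlib

    @contextlib.contextmanager
    def temporary_mutation_sketch(obj, **attrs):
        """Temporarily set the given attributes, restoring the originals on exit."""
        saved = {name: getattr(obj, name) for name in attrs}
        for name, value in attrs.items():
            setattr(obj, name, value)
        try:
            yield obj
        finally:
            for name, value in saved.items():
                setattr(obj, name, value)

    class FakeMigration:
        source_compute = 'src-host'
        dest_compute = 'dest-host'

    migration = FakeMigration()
    with temporary_mutation_sketch(migration, dest_compute=migration.source_compute):
        # within the block the source host is seen as the binding host
        assert migration.dest_compute == 'src-host'
    assert migration.dest_compute == 'dest-host'  # original value restored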

5686 

5687 # Delete port bindings for the target host. 

5688 LOG.debug('Deleting port bindings for target host %s.', 

5689 self.host, instance=instance) 

5690 try: 

5691 # Note that deleting the destination host port bindings does 

5692 # not automatically activate the source host port bindings. 

5693 self.network_api.cleanup_instance_network_on_host( 

5694 ctxt, instance, self.host) 

5695 except exception.PortBindingDeletionFailed as e: 

5696 # Do not let this stop us from cleaning up since the guest 

5697 # is already gone. 

5698 LOG.error('Failed to delete port bindings from target host. ' 

5699 'Error: %s', str(e), instance=instance) 

5700 

5701 # Delete any volume attachments remaining for this target host. 

5702 LOG.debug('Deleting volume attachments for target host.', 

5703 instance=instance) 

5704 self._delete_volume_attachments(ctxt, bdms) 

5705 

5706 # Free up the new_flavor usage from the resource tracker for this host. 

5707 self.rt.drop_move_claim_at_dest(ctxt, instance, migration) 

5708 

5709 def _revert_instance_flavor_host_node(self, instance, migration): 

5710 """Revert host, node and flavor fields after a resize-revert.""" 

5711 self._set_instance_info(instance, instance.old_flavor) 

5712 instance.host = migration.source_compute 

5713 # NOTE(danms): This could fail if we somehow ended up on the wrong 

5714 # host without the desired node. That's a big problem, so let the 

5715 # ComputeHostNotFound raise from here. 

5716 cn = self.rt.get_node_by_name(migration.source_node) 

5717 instance.node = cn.hypervisor_hostname 

5718 instance.compute_id = cn.id 

5719 instance.save(expected_task_state=[task_states.RESIZE_REVERTING]) 

5720 

5721 @wrap_exception() 

5722 @reverts_task_state 

5723 @wrap_instance_event(prefix='compute') 

5724 @errors_out_migration 

5725 @wrap_instance_fault 

5726 def finish_revert_snapshot_based_resize_at_source( 

5727 self, ctxt, instance, migration): 

5728 """Reverts a snapshot-based resize at the source host. 

5729 

5730 Spawn the guest and re-connect volumes/VIFs on the source host and 

5731 revert the instance to use the old_flavor for resource usage reporting. 

5732 

5733 Updates allocations in the placement service to move the source node 

5734 allocations, held by the migration record, to the instance and drop 

5735 the allocations held by the instance on the destination node. 

5736 

5737 :param ctxt: nova auth request context targeted at the target cell 

5738 :param instance: Instance object whose vm_state is "resized" and 

5739 task_state is "resize_reverting". 

5740 :param migration: Migration object whose status is "reverting". 

5741 """ 

5742 

5743 @utils.synchronized(instance.uuid) 

5744 def do_revert(): 

5745 LOG.info('Reverting resize on source host.', instance=instance) 

5746 with self._error_out_instance_on_exception(ctxt, instance): 

5747 self._finish_revert_snapshot_based_resize_at_source( 

5748 ctxt, instance, migration) 

5749 

5750 try: 

5751 do_revert() 

5752 finally: 

5753 self._delete_stashed_flavor_info(instance) 

5754 

5755 # Broadcast to all schedulers that the instance is on this host. 

5756 # This is best effort so if anything fails just log it. 

5757 try: 

5758 self._update_scheduler_instance_info(ctxt, instance) 

5759 except Exception as e: 

5760 LOG.warning('finish_revert_snapshot_based_resize_at_source failed ' 

5761 'during post-processing. Error: %s', e, 

5762 instance=instance) 

5763 

5764 def _finish_revert_snapshot_based_resize_at_source( 

5765 self, ctxt, instance, migration): 

5766 """Private version of finish_revert_snapshot_based_resize_at_source. 

5767 

5768 This allows the main method to be decorated with error handlers. 

5769 

5770 :param ctxt: nova auth request context targeted at the source cell 

5771 :param instance: Instance object whose vm_state is "resized" and 

5772 task_state is "resize_reverting". 

5773 :param migration: Migration object whose status is "reverting". 

5774 """ 

5775 # Get stashed old_vm_state information to determine if guest should 

5776 # be powered on after spawn; we default to ACTIVE for backwards 

5777 # compatibility if old_vm_state is not set 

5778 old_vm_state = instance.system_metadata.get( 

5779 'old_vm_state', vm_states.ACTIVE) 

5780 

5781 # Revert the flavor and host/node fields to their previous values 

5782 self._revert_instance_flavor_host_node(instance, migration) 

5783 

5784 # Move the allocations against the source compute node resource 

5785 # provider, held by the migration, to the instance which will drop 

5786 # the destination compute node resource provider allocations held by 

5787 # the instance. This puts the allocations against the source node 

5788 # back to the old_flavor and owned by the instance. 

5789 try: 

5790 self._revert_allocation(ctxt, instance, migration) 

5791 except exception.AllocationMoveFailed: 

5792 # Log the error but do not re-raise because we want to continue to 

5793 # process ports and volumes below. 

5794 LOG.error('Reverting allocation in placement for migration ' 

5795 '%(migration_uuid)s failed. You may need to manually ' 

5796 'remove the allocations for the migration consumer ' 

5797 'against the source node resource provider ' 

5798 '%(source_provider)s and the allocations for the ' 

5799 'instance consumer against the destination node ' 

5800 'resource provider %(dest_provider)s and then run the ' 

5801 '"nova-manage placement heal_allocations" command.', 

5802 {'instance_uuid': instance.uuid, 

5803 'migration_uuid': migration.uuid, 

5804 'source_provider': migration.source_node, 

5805 'dest_provider': migration.dest_node}, 

5806 instance=instance) 

5807 

5808 bdms = instance.get_bdms() 

5809 # prep_snapshot_based_resize_at_source created empty volume attachments 

5810 # that we need to update here to get the connection_info before calling 

5811 # driver.finish_revert_migration which will connect the volumes to this 

5812 # host. 

5813 LOG.debug('Updating volume attachments for target host %s.', 

5814 self.host, instance=instance) 

5815 # TODO(mriedem): We should probably make _update_volume_attachments 

5816 # (optionally) graceful to errors so we (1) try to process all 

5817 # attachments and (2) continue to process networking below. 

5818 self._update_volume_attachments(ctxt, instance, bdms) 

5819 

5820 LOG.debug('Updating port bindings for source host %s.', 

5821 self.host, instance=instance) 

5822 # TODO(mriedem): Calculate provider mappings when we support 

5823 # cross-cell resize/migrate with ports having resource requests. 

5824 # NOTE(hanrong): we need to change migration.dest_compute to 

5825 # source host temporarily. 

5826 # "network_api.migrate_instance_finish" will setup the network 

5827 # for the instance on the destination host. For revert resize, 

5828 # the instance will go back to the source host, so the network 

5829 # for the instance should be set up on the source host. Hence set 

5830 # migration.dest_compute to the source host here. 

5831 with utils.temporary_mutation( 

5832 migration, dest_compute=migration.source_compute 

5833 ): 

5834 self.network_api.migrate_instance_finish( 

5835 ctxt, instance, migration, provider_mappings=None) 

5836 network_info = self.network_api.get_instance_nw_info(ctxt, instance) 

5837 

5838 # Remember that prep_snapshot_based_resize_at_source destroyed the 

5839 # guest but left the disks intact so we cannot call spawn() here but 

5840 # finish_revert_migration should do the job. 

5841 block_device_info = self._get_instance_block_device_info( 

5842 ctxt, instance, bdms=bdms) 

5843 power_on = old_vm_state == vm_states.ACTIVE 

5844 driver_error = None 

5845 try: 

5846 self.driver.finish_revert_migration( 

5847 ctxt, instance, network_info, migration, 

5848 block_device_info=block_device_info, power_on=power_on) 

5849 except Exception as e: 

5850 driver_error = e 

5851 # Leave a hint about hard rebooting the guest and reraise so the 

5852 # instance is put into ERROR state. 

5853 with excutils.save_and_reraise_exception(logger=LOG): 

5854 LOG.error('An error occurred during finish_revert_migration. ' 

5855 'The instance may need to be hard rebooted. Error: ' 

5856 '%s', driver_error, instance=instance) 

5857 else: 

5858 # Perform final cleanup of the instance in the database. 

5859 instance.drop_migration_context() 

5860 # If the original vm_state was STOPPED, set it back to STOPPED. 

5861 vm_state = vm_states.ACTIVE if power_on else vm_states.STOPPED 

5862 self._update_instance_after_spawn(instance, vm_state=vm_state) 

5863 instance.save(expected_task_state=[task_states.RESIZE_REVERTING]) 

5864 finally: 

5865 # Complete any volume attachments so the volumes are in-use. We 

5866 # do this regardless of finish_revert_migration failing because 

5867 # the instance is back on this host now and we do not want to leave 

5868 # the volumes in a pending state in case the instance is hard 

5869 # rebooted. 

5870 LOG.debug('Completing volume attachments for instance on source ' 

5871 'host.', instance=instance) 

5872 with excutils.save_and_reraise_exception( 

5873 reraise=driver_error is not None, logger=LOG): 

5874 self._complete_volume_attachments(ctxt, bdms) 

5875 

5876 migration.status = 'reverted' 

5877 migration.save() 

5878 

5879 @wrap_exception() 

5880 @reverts_task_state 

5881 @wrap_instance_event(prefix='compute') 

5882 @errors_out_migration 

5883 @wrap_instance_fault 

5884 def revert_resize(self, context, instance, migration, request_spec): 

5885 """Destroys the new instance on the destination machine. 

5886 

5887 Reverts the model changes, and powers on the old instance on the 

5888 source machine. 

5889 

5890 """ 

5891 # NOTE(comstud): A revert_resize is essentially a resize back to 

5892 # the old size, so we need to send a usage event here. 

5893 compute_utils.notify_usage_exists(self.notifier, context, instance, 

5894 self.host, current_period=True) 

5895 

5896 with self._error_out_instance_on_exception(context, instance): 

5897 # NOTE(tr3buchet): tear down networks on destination host 

5898 self.network_api.setup_networks_on_host(context, instance, 

5899 teardown=True) 

5900 

5901 self.network_api.migrate_instance_start(context, 

5902 instance, 

5903 migration) 

5904 

5905 network_info = self.network_api.get_instance_nw_info(context, 

5906 instance) 

5907 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( 

5908 context, instance.uuid) 

5909 block_device_info = self._get_instance_block_device_info( 

5910 context, instance, bdms=bdms) 

5911 

5912 destroy_disks = not self._is_instance_storage_shared( 

5913 context, instance, host=migration.source_compute) 

5914 self.driver.destroy(context, instance, network_info, 

5915 block_device_info, destroy_disks) 

5916 

5917 self._terminate_volume_connections(context, instance, bdms) 

5918 

5919 # Free up the new_flavor usage from the resource tracker for this 

5920 # host. 

5921 self.rt.drop_move_claim_at_dest(context, instance, migration) 

5922 

5923 # RPC cast back to the source host to finish the revert there. 

5924 self.compute_rpcapi.finish_revert_resize(context, instance, 

5925 migration, migration.source_compute, request_spec) 

5926 

5927 @wrap_exception() 

5928 @reverts_task_state 

5929 @wrap_instance_event(prefix='compute') 

5930 @errors_out_migration 

5931 @wrap_instance_fault 

5932 def finish_revert_resize(self, context, instance, migration, request_spec): 

5933 """Finishes the second half of reverting a resize on the source host. 

5934 

5935 Bring the original source instance state back (active/shutoff) and 

5936 revert the resized attributes in the database. 

5937 

5938 """ 

5939 try: 

5940 self._finish_revert_resize( 

5941 context, instance, migration, request_spec) 

5942 finally: 

5943 self._delete_stashed_flavor_info(instance) 

5944 

5945 def _finish_revert_resize( 

5946 self, context, instance, migration, request_spec=None, 

5947 ): 

5948 """Inner version of finish_revert_resize.""" 

5949 with self._error_out_instance_on_exception(context, instance): 

5950 bdms = self._update_bdm_for_swap_to_finish_resize( 

5951 context, instance, confirm=False) 

5952 

5953 self._notify_about_instance_usage( 

5954 context, instance, "resize.revert.start") 

5955 compute_utils.notify_about_instance_action(context, instance, 

5956 self.host, action=fields.NotificationAction.RESIZE_REVERT, 

5957 phase=fields.NotificationPhase.START, bdms=bdms) 

5958 

5959 # Get stashed old_vm_state information to determine if guest should 

5960 # be powered on after spawn; we default to ACTIVE for backwards 

5961 # compatibility if old_vm_state is not set 

5962 old_vm_state = instance.system_metadata.get( 

5963 'old_vm_state', vm_states.ACTIVE) 

5964 

5965 # Revert the flavor and host/node fields to their previous values 

5966 self._revert_instance_flavor_host_node(instance, migration) 

5967 

5968 try: 

5969 source_allocations = self._revert_allocation( 

5970 context, instance, migration) 

5971 except exception.AllocationMoveFailed: 

5972 LOG.error('Reverting allocation in placement for migration ' 

5973 '%(migration_uuid)s failed. The instance ' 

5974 '%(instance_uuid)s will be put into ERROR state but ' 

5975 'the allocation held by the migration is leaked.', 

5976 {'instance_uuid': instance.uuid, 

5977 'migration_uuid': migration.uuid}) 

5978 raise 

5979 

5980 provider_mappings = self._fill_provider_mapping_based_on_allocs( 

5981 context, source_allocations, request_spec) 

5982 

5983 self.network_api.setup_networks_on_host(context, instance, 

5984 migration.source_compute) 

5985 # NOTE(hanrong): we need to change migration.dest_compute to 

5986 # source host temporarily. "network_api.migrate_instance_finish" 

5987 # will setup the network for the instance on the destination host. 

5988 # For revert resize, the instance will go back to the source host, so 

5989 # the network for the instance should be set up on the source host. 

5990 # Hence set migration.dest_compute to the source host here. 

5991 with utils.temporary_mutation( 

5992 migration, dest_compute=migration.source_compute): 

5993 self.network_api.migrate_instance_finish(context, 

5994 instance, 

5995 migration, 

5996 provider_mappings) 

5997 network_info = self.network_api.get_instance_nw_info(context, 

5998 instance) 

5999 

6000 # revert_resize deleted any volume attachments for the instance 

6001 # and created new ones to be used on this host, but we 

6002 # have to update those attachments with the host connector so the 

6003 # BDM.connection_info will get set in the call to 

6004 # _get_instance_block_device_info below with refresh_conn_info=True 

6005 # and then the volumes can be re-connected via the driver on this 

6006 # host. 

6007 self._update_volume_attachments(context, instance, bdms) 

6008 

6009 block_device_info = self._get_instance_block_device_info( 

6010 context, instance, refresh_conn_info=True, bdms=bdms) 

6011 

6012 power_on = old_vm_state != vm_states.STOPPED 

6013 self.driver.finish_revert_migration( 

6014 context, instance, network_info, migration, block_device_info, 

6015 power_on) 

6016 

6017 instance.drop_migration_context() 

6018 instance.launched_at = timeutils.utcnow() 

6019 instance.save(expected_task_state=task_states.RESIZE_REVERTING) 

6020 

6021 # Complete any volume attachments so the volumes are in-use. 

6022 self._complete_volume_attachments(context, bdms) 

6023 

6024 # if the original vm state was STOPPED, set it back to STOPPED 

6025 LOG.info("Updating instance to original state: '%s'", 

6026 old_vm_state, instance=instance) 

6027 if power_on: 6027 ↛ 6032 (line 6027 didn't jump to line 6032 because the condition on line 6027 was always true) 

6028 instance.vm_state = vm_states.ACTIVE 

6029 instance.task_state = None 

6030 instance.save() 

6031 else: 

6032 instance.task_state = task_states.POWERING_OFF 

6033 instance.save() 

6034 self.stop_instance(context, instance=instance, 

6035 clean_shutdown=True) 

6036 

6037 self._notify_about_instance_usage( 

6038 context, instance, "resize.revert.end") 

6039 compute_utils.notify_about_instance_action(context, instance, 

6040 self.host, action=fields.NotificationAction.RESIZE_REVERT, 

6041 phase=fields.NotificationPhase.END, bdms=bdms) 

6042 

6043 def _fill_provider_mapping_based_on_allocs( 

6044 self, context, allocations, request_spec): 

6045 """Fills and returns the request group - resource provider mapping 

6046 based on the allocation passed in. 

6047 

6048 :param context: The security context 

6049 :param allocations: allocation dict keyed by RP UUID. 

6050 :param request_spec: The RequestSpec object associated with the 

6051 operation 

6052 :returns: None if the request_spec is None. Otherwise a mapping 

6053 between RequestGroup requester_id, currently Neutron port_id, 

6054 and a list of resource provider UUIDs providing resource for 

6055 that RequestGroup. 

6056 """ 

6057 if request_spec: 

6058 # NOTE(gibi): We need to re-calculate the resource provider - 

6059 # port mapping as we have to have the neutron ports allocate 

6060 # from the source compute after revert. 

6061 scheduler_utils.fill_provider_mapping_based_on_allocation( 

6062 context, self.reportclient, request_spec, allocations) 

6063 provider_mappings = self._get_request_group_mapping( 

6064 request_spec) 

6065 else: 

6066 # TODO(sbauza): Remove this conditional once we only support RPC 

6067 # API 6.0 

6068 # NOTE(gibi): The compute RPC is pinned to be older than 5.2 

6069 # and therefore request_spec is not sent. We cannot calculate 

6070 # the provider mappings. If the instance has ports with 

6071 # resource requests then the port update will fail in 

6072 # _update_port_binding_for_instance() called via 

6073 # migrate_instance_finish() in finish_revert_resize. 

6074 provider_mappings = None 

6075 return provider_mappings 
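Based on the docstring above, the returned mapping (when a request spec is present) associates each requester, currently a Neutron port UUID, with the resource providers that satisfy its request group. The UUIDs below are assumed purely for illustration:

    provider_mappings = {
        # Neutron port UUID -> list of resource provider UUIDs
        'c3f7e1aa-0000-4000-8000-000000000001': [
            'a1b2c3d4-0000-4000-8000-00000000abcd',
        ],
    }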

6076 

6077 def _revert_allocation(self, context, instance, migration): 

6078 """Revert an allocation that is held by migration to our instance.""" 

6079 

6080 # Fetch the original allocation that the instance had on the source 

6081 # node, which are now held by the migration 

6082 orig_alloc = self.reportclient.get_allocations_for_consumer( 

6083 context, migration.uuid) 

6084 if not orig_alloc: 

6085 LOG.error('Did not find resource allocations for migration ' 

6086 '%s on source node %s. Unable to revert source node ' 

6087 'allocations back to the instance.', 

6088 migration.uuid, migration.source_node, instance=instance) 

6089 return False 

6090 

6091 LOG.info('Swapping old allocation on %(rp_uuids)s held by migration ' 

6092 '%(mig)s for instance', 

6093 {'rp_uuids': orig_alloc.keys(), 'mig': migration.uuid}, 

6094 instance=instance) 

6095 # FIXME(gibi): This method is flawed in that it does not handle 

6096 # allocations against sharing providers in any special way. This leads 

6097 # to duplicate allocations against the sharing provider during 

6098 # migration. 

6099 # TODO(cdent): Should we be doing anything with return values here? 

6100 self.reportclient.move_allocations(context, migration.uuid, 

6101 instance.uuid) 

6102 return orig_alloc 
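An assumed before/after of the swap performed above (consumer keys, resource providers and amounts are illustrative): before the revert, the migration consumer holds the source-node allocations and the instance consumer holds the destination-node allocations; move_allocations transfers the former back to the instance, which drops the latter:

    # before move_allocations(context, migration.uuid, instance.uuid)
    allocations_before = {
        'migration-uuid': {'source-node-rp': {'VCPU': 2, 'MEMORY_MB': 2048}},
        'instance-uuid': {'dest-node-rp': {'VCPU': 4, 'MEMORY_MB': 4096}},
    }
    # after: the instance owns the source-node allocations again and the
    # destination-node allocations are gone; the migration consumer is empty.
    allocations_after = {
        'instance-uuid': {'source-node-rp': {'VCPU': 2, 'MEMORY_MB': 2048}},
    }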

6103 

6104 def _prep_resize( 

6105 self, context, image, instance, flavor, filter_properties, node, 

6106 migration, request_spec, clean_shutdown=True, 

6107 ): 

6108 

6109 if not filter_properties: 6109 ↛ 6112 (line 6109 didn't jump to line 6112 because the condition on line 6109 was always true) 

6110 filter_properties = {} 

6111 

6112 if not instance.host: 

6113 self._set_instance_obj_error_state(instance) 

6114 msg = _('Instance has no source host') 

6115 raise exception.MigrationError(reason=msg) 

6116 

6117 same_host = instance.host == self.host 

6118 # if the flavor IDs match, it's a migrate; otherwise it's a resize 

6119 if same_host and flavor.id == instance['instance_type_id']: 

6120 # check whether the driver supports migrating to the same host 

6121 if not self.driver.capabilities.get( 

6122 'supports_migrate_to_same_host', False): 

6123 # Raise InstanceFaultRollback so that the 

6124 # _error_out_instance_on_exception context manager in 

6125 # prep_resize will set the instance.vm_state properly. 

6126 raise exception.InstanceFaultRollback( 

6127 inner_exception=exception.UnableToMigrateToSelf( 

6128 instance_id=instance.uuid, host=self.host)) 

6129 

6130 # NOTE(danms): Stash the new flavor to avoid having to 

6131 # look it up in the database later 

6132 instance.new_flavor = flavor 

6133 # NOTE(mriedem): Stash the old vm_state so we can set the 

6134 # resized/reverted instance back to the same state later. 

6135 vm_state = instance.vm_state 

6136 LOG.debug('Stashing vm_state: %s', vm_state, instance=instance) 

6137 instance.system_metadata['old_vm_state'] = vm_state 

6138 instance.save() 

6139 

6140 if not isinstance(request_spec, objects.RequestSpec): 

6141 # Prior to compute RPC API 5.1 conductor would pass a legacy dict 

6142 # version of the request spec to compute and since Stein compute 

6143 # could be sending that back to conductor on reschedule. So if we 

6144 # got a dict, convert it to an object. 

6145 # TODO(mriedem): We can drop this compat code when we only support 

6146 # compute RPC API >=6.0. 

6147 request_spec = objects.RequestSpec.from_primitives( 

6148 context, request_spec, filter_properties) 

6149 # We don't have to set the new flavor on the request spec because 

6150 # if we got here it was due to a reschedule from the compute and 

6151 # the request spec would already have the new flavor in it from the 

6152 # else block below. 

6153 

6154 provider_mapping = self._get_request_group_mapping(request_spec) 

6155 

6156 if provider_mapping: 

6157 try: 

6158 compute_utils.update_pci_request_with_placement_allocations( 

6159 context, 

6160 self.reportclient, 

6161 instance.pci_requests.requests, 

6162 provider_mapping, 

6163 ) 

6164 except (exception.AmbiguousResourceProviderForPCIRequest, 

6165 exception.UnexpectedResourceProviderNameForPCIRequest 

6166 ) as e: 

6167 raise exception.BuildAbortException( 

6168 reason=str(e), instance_uuid=instance.uuid) 

6169 

6170 limits = filter_properties.get('limits', {}) 

6171 allocs = self.reportclient.get_allocations_for_consumer( 

6172 context, instance.uuid) 

6173 with self.rt.resize_claim( 

6174 context, instance, flavor, node, migration, allocs, 

6175 image_meta=image, limits=limits, 

6176 ) as claim: 

6177 LOG.info('Migrating', instance=instance) 

6178 # RPC cast to the source host to start the actual resize/migration. 

6179 self.compute_rpcapi.resize_instance( 

6180 context, instance, claim.migration, image, 

6181 flavor, request_spec, clean_shutdown) 

6182 

6183 def _send_prep_resize_notifications( 

6184 self, context, instance, phase, flavor): 

6185 """Send "resize.prep.*" notifications. 

6186 

6187 :param context: nova auth request context 

6188 :param instance: The instance being resized 

6189 :param phase: The phase of the action (NotificationPhase enum) 

6190 :param flavor: The (new) flavor for the resize (same as existing 

6191 instance.flavor for a cold migration) 

6192 """ 

6193 # Only send notify_usage_exists if it's the "start" phase. 

6194 if phase == fields.NotificationPhase.START: 

6195 compute_utils.notify_usage_exists( 

6196 self.notifier, context, instance, self.host, 

6197 current_period=True) 

6198 

6199 # Send extra usage info about the flavor if it's the "end" phase for 

6200 # the legacy unversioned notification. 

6201 extra_usage_info = None 

6202 if phase == fields.NotificationPhase.END: 

6203 extra_usage_info = dict( 

6204 new_instance_type=flavor.name, 

6205 new_instance_type_id=flavor.id) 

6206 self._notify_about_instance_usage( 

6207 context, instance, "resize.prep.%s" % phase, 

6208 extra_usage_info=extra_usage_info) 

6209 

6210 # Send the versioned notification. 

6211 compute_utils.notify_about_resize_prep_instance( 

6212 context, instance, self.host, phase, flavor) 

6213 

6214 @wrap_exception() 

6215 @reverts_task_state 

6216 @wrap_instance_event(prefix='compute') 

6217 @wrap_instance_fault 

6218 def prep_resize(self, context, image, instance, flavor, 

6219 request_spec, filter_properties, node, 

6220 clean_shutdown, migration, host_list): 

6221 """Initiates the process of moving a running instance to another host. 

6222 

6223 Possibly changes the VCPU, RAM and disk size in the process. 

6224 

6225 This is initiated from conductor and runs on the destination host. 

6226 

6227 The main purpose of this method is performing some checks on the 

6228 destination host and making a claim for resources. If the claim fails 

6229 then a reschedule to another host may be attempted which involves 

6230 calling back to conductor to start the process over again. 

6231 """ 

6232 if node is None: 

6233 node = self._get_nodename(instance, refresh=True) 

6234 

6235 # Pass instance_state=instance.vm_state because we can resize 

6236 # a STOPPED server and we don't want to set it back to ACTIVE 

6237 # in case _prep_resize fails. 

6238 instance_state = instance.vm_state 

6239 with self._error_out_instance_on_exception( 

6240 context, instance, instance_state=instance_state), \ 

6241 errors_out_migration_ctxt(migration): 

6242 

6243 self._send_prep_resize_notifications( 

6244 context, instance, fields.NotificationPhase.START, 

6245 flavor) 

6246 try: 

6247 scheduler_hints = self._get_scheduler_hints(filter_properties, 

6248 request_spec) 

6249 # Error out if this host cannot accept the new instance due 

6250 # to anti-affinity. At this point the migration is already 

6251 # in-progress, so this is the definitive moment to abort due to 

6252 # the policy violation. Also, exploding here is covered by the 

6253 # cleanup methods in the except block. 

6254 try: 

6255 self._validate_instance_group_policy(context, instance, 

6256 scheduler_hints) 

6257 except exception.RescheduledException as e: 

6258 raise exception.InstanceFaultRollback(inner_exception=e) 

6259 

6260 self._prep_resize(context, image, instance, 

6261 flavor, filter_properties, 

6262 node, migration, request_spec, 

6263 clean_shutdown) 

6264 except exception.BuildAbortException: 

6265 # NOTE(gibi): We failed 

6266 # update_pci_request_with_placement_allocations so 

6267 # there is no reason to re-schedule. Just revert the allocation 

6268 # and fail the migration. 

6269 with excutils.save_and_reraise_exception(): 

6270 self._revert_allocation(context, instance, migration) 

6271 except Exception: 

6272 # Since we hit a failure, we're either rescheduling or dead 

6273 # and either way we need to cleanup any allocations created 

6274 # by the scheduler for the destination node. 

6275 self._revert_allocation(context, instance, migration) 

6276 # try to re-schedule the resize elsewhere: 

6277 exc_info = sys.exc_info() 

6278 self._reschedule_resize_or_reraise(context, instance, 

6279 exc_info, flavor, request_spec, 

6280 filter_properties, host_list) 

6281 finally: 

6282 self._send_prep_resize_notifications( 

6283 context, instance, fields.NotificationPhase.END, 

6284 flavor) 

6285 

6286 def _reschedule_resize_or_reraise(self, context, instance, exc_info, 

6287 flavor, request_spec, filter_properties, host_list): 

6288 """Try to re-schedule the resize or re-raise the original error to 

6289 error out the instance. 

6290 """ 

6291 if not filter_properties: 

6292 filter_properties = {} 

6293 

6294 rescheduled = False 

6295 instance_uuid = instance.uuid 

6296 

6297 try: 

6298 retry = filter_properties.get('retry') 

6299 if retry: 

6300 LOG.debug('Rescheduling, attempt %d', retry['num_attempts'], 

6301 instance_uuid=instance_uuid) 

6302 

6303 # reset the task state 

6304 task_state = task_states.RESIZE_PREP 

6305 self._instance_update(context, instance, task_state=task_state) 

6306 

6307 if exc_info: 6307 ↛ 6313 (line 6307 didn't jump to line 6313 because the condition on line 6307 was always true) 

6308 # stringify to avoid circular ref problem in json 

6309 # serialization 

6310 retry['exc'] = traceback.format_exception_only( 

6311 exc_info[0], exc_info[1]) 

6312 

6313 scheduler_hint = {'filter_properties': filter_properties} 

6314 

6315 self.compute_task_api.resize_instance( 

6316 context, instance, scheduler_hint, flavor, 

6317 request_spec=request_spec, host_list=host_list) 

6318 

6319 rescheduled = True 

6320 else: 

6321 # no retry information, do not reschedule. 

6322 LOG.debug('Retry info not present, will not reschedule', 

6323 instance_uuid=instance_uuid) 

6324 rescheduled = False 

6325 except Exception as error: 

6326 rescheduled = False 

6327 LOG.exception("Error trying to reschedule", 

6328 instance_uuid=instance_uuid) 

6329 compute_utils.add_instance_fault_from_exc(context, 

6330 instance, error, 

6331 exc_info=sys.exc_info()) 

6332 self._notify_about_instance_usage(context, instance, 

6333 'resize.error', fault=error) 

6334 compute_utils.notify_about_instance_action( 

6335 context, instance, self.host, 

6336 action=fields.NotificationAction.RESIZE, 

6337 phase=fields.NotificationPhase.ERROR, 

6338 exception=error, 

6339 ) 

6340 

6341 if rescheduled: 

6342 self._log_original_error(exc_info, instance_uuid) 

6343 compute_utils.add_instance_fault_from_exc(context, 

6344 instance, exc_info[1], exc_info=exc_info) 

6345 self._notify_about_instance_usage(context, instance, 

6346 'resize.error', fault=exc_info[1]) 

6347 compute_utils.notify_about_instance_action( 

6348 context, instance, self.host, 

6349 action=fields.NotificationAction.RESIZE, 

6350 phase=fields.NotificationPhase.ERROR, 

6351 exception=exc_info[1], 

6352 ) 

6353 else: 

6354 # not re-scheduling 

6355 exc = exc_info[1] or exc_info[0]() 

6356 if exc.__traceback__ is not exc_info[2]: 6356 ↛ 6357 (line 6356 didn't jump to line 6357 because the condition on line 6356 was never true) 

6357 raise exc.with_traceback(exc_info[2]) 

6358 raise exc 

6359 

6360 @messaging.expected_exceptions(exception.MigrationPreCheckError) 

6361 @wrap_exception() 

6362 @wrap_instance_event(prefix='compute') 

6363 @wrap_instance_fault 

6364 def prep_snapshot_based_resize_at_dest( 

6365 self, ctxt, instance, flavor, nodename, migration, limits): 

6366 """Performs pre-cross-cell resize resource claim on the dest host. 

6367 

6368 This runs on the destination host in a cross-cell resize operation 

6369 before the resize is actually started. 

6370 

6371 Performs a resize_claim for resources that are not claimed in placement 

6372 like PCI devices and NUMA topology. 

6373 

6374 Note that this is different from same-cell prep_resize in that this: 

6375 

6376 * Does not RPC cast to the source compute; that is orchestrated from 

6377 conductor. 

6378 * This does not reschedule on failure; conductor handles that since 

6379 conductor is synchronously RPC calling this method. As such, the 

6380 reverts_task_state decorator is not used on this method. 

6381 

6382 :param ctxt: user auth request context 

6383 :param instance: the instance being resized 

6384 :param flavor: the flavor being resized to (unchanged for cold migrate) 

6385 :param nodename: Name of the target compute node 

6386 :param migration: nova.objects.Migration object for the operation 

6387 :param limits: nova.objects.SchedulerLimits object of resource limits 

6388 :returns: nova.objects.MigrationContext; the migration context created 

6389 on the destination host during the resize_claim. 

6390 :raises: nova.exception.MigrationPreCheckError if the pre-check 

6391 validation fails for the given host selection 

6392 """ 

6393 LOG.debug('Checking if we can cross-cell migrate instance to this ' 

6394 'host (%s).', self.host, instance=instance) 

6395 self._send_prep_resize_notifications( 

6396 ctxt, instance, fields.NotificationPhase.START, flavor) 

6397 # TODO(mriedem): update_pci_request_with_placement_allocations 

6398 # should be called here if the request spec has request group mappings, 

6399 # e.g. for things like QoS ports with resource requests. Do it outside 

6400 # the try/except so if it raises BuildAbortException we do not attempt 

6401 # to reschedule. 

6402 try: 

6403 # Get the allocations within the try/except block in case we get 

6404 # an error so MigrationPreCheckError is raised up. 

6405 allocations = self.reportclient.get_allocs_for_consumer( 

6406 ctxt, instance.uuid)['allocations'] 

6407 # Claim resources on this target host using the new flavor which 

6408 # will create the MigrationContext object. Note that in the future 

6409 # if we want to do other validation here we should do it within 

6410 # the MoveClaim context so we can drop the claim if anything fails. 

6411 self.rt.resize_claim( 

6412 ctxt, instance, flavor, nodename, migration, allocations, 

6413 image_meta=instance.image_meta, limits=limits) 

6414 except Exception as ex: 

6415 err = str(ex) 

6416 LOG.warning( 

6417 'Cross-cell resize pre-checks failed for this host (%s). ' 

6418 'Cleaning up. Failure: %s', self.host, err, 

6419 instance=instance, exc_info=True) 

6420 raise exception.MigrationPreCheckError( 

6421 reason=(_("Pre-checks failed on host '%(host)s'. " 

6422 "Error: %(error)s") % 

6423 {'host': self.host, 'error': err})) 

6424 finally: 

6425 self._send_prep_resize_notifications( 

6426 ctxt, instance, fields.NotificationPhase.END, flavor) 

6427 

6428 # ResourceTracker.resize_claim() sets instance.migration_context. 

6429 return instance.migration_context 

6430 

6431 @messaging.expected_exceptions(exception.InstancePowerOffFailure) 

6432 @wrap_exception() 

6433 @reverts_task_state 

6434 @wrap_instance_event(prefix='compute') 

6435 @errors_out_migration 

6436 @wrap_instance_fault 

6437 def prep_snapshot_based_resize_at_source( 

6438 self, ctxt, instance, migration, snapshot_id=None): 

6439 """Prepares the instance at the source host for cross-cell resize 

6440 

6441 Performs actions like powering off the guest, uploading snapshot data if 

6442 the instance is not volume-backed, disconnecting volumes, unplugging 

6443 VIFs and activating the destination host port bindings. 

6444 

6445 :param ctxt: user auth request context targeted at source cell 

6446 :param instance: nova.objects.Instance; the instance being resized. 

6447 The expected instance.task_state is "resize_migrating" when calling 

6448 this method, and the expected task_state upon successful completion 

6449 is "resize_migrated". 

6450 :param migration: nova.objects.Migration object for the operation. 

6451 The expected migration.status is "pre-migrating" when calling this 

6452 method and the expected status upon successful completion is 

6453 "post-migrating". 

6454 :param snapshot_id: ID of the image snapshot to upload if not a 

6455 volume-backed instance 

6456 :raises: nova.exception.InstancePowerOffFailure if stopping the 

6457 instance fails 

6458 """ 

6459 LOG.info('Preparing for snapshot based resize on source host %s.', 

6460 self.host, instance=instance) 

6461 # Note that if anything fails here, the migration-based allocations 

6462 # created in conductor should be reverted by conductor as well, 

6463 # see MigrationTask.rollback. 

6464 self._prep_snapshot_based_resize_at_source( 

6465 ctxt, instance, migration, snapshot_id=snapshot_id) 

6466 

6467 @delete_image_on_error 

6468 def _snapshot_for_resize(self, ctxt, image_id, instance): 

6469 """Uploads snapshot for the instance during a snapshot-based resize 

6470 

6471 If the snapshot operation fails the image will be deleted. 

6472 

6473 :param ctxt: the nova auth request context for the resize operation 

6474 :param image_id: the snapshot image ID 

6475 :param instance: the instance to snapshot/resize 

6476 """ 

6477 LOG.debug('Uploading snapshot data for image %s', image_id, 

6478 instance=instance) 

6479 # Note that we do not track the snapshot phase task states 

6480 # during resize since we do not want to reflect those into the 

6481 # actual instance.task_state. 

6482 update_task_state = lambda *args, **kwargs: None 

6483 with timeutils.StopWatch() as timer: 

6484 self.driver.snapshot(ctxt, instance, image_id, update_task_state) 

6485 LOG.debug('Took %0.2f seconds to snapshot the instance on ' 

6486 'the hypervisor.', timer.elapsed(), instance=instance) 

6487 

6488 def _prep_snapshot_based_resize_at_source( 

6489 self, ctxt, instance, migration, snapshot_id=None): 

6490 """Private method for prep_snapshot_based_resize_at_source so calling 

6491 code can handle errors and perform rollbacks as necessary. 

6492 """ 

6493 # Fetch and update the instance.info_cache. 

6494 network_info = self.network_api.get_instance_nw_info(ctxt, instance) 

6495 # Get the BDMs attached to this instance on this source host. 

6496 bdms = instance.get_bdms() 

6497 # Send the resize.start notification. 

6498 self._send_resize_instance_notifications( 

6499 ctxt, instance, bdms, network_info, fields.NotificationPhase.START) 

6500 # Update the migration status from "pre-migrating" to "migrating". 

6501 migration.status = 'migrating' 

6502 migration.save() 

6503 

6504 # Since the instance is going to be left on the source host during the 

6505 # resize, we need to power it off so we do not have the instance 

6506 # potentially running in two places. 

6507 LOG.debug('Stopping instance', instance=instance) 

6508 try: 

6509 self._power_off_instance(ctxt, instance) 

6510 except Exception as e: 

6511 LOG.exception('Failed to power off instance.', instance=instance) 

6512 raise exception.InstancePowerOffFailure(reason=str(e)) 

6513 instance.power_state = self._get_power_state(instance) 

6514 

6515 # If a snapshot image ID was provided, we need to snapshot the guest 

6516 # disk image and upload it to the image service. 

6517 if snapshot_id: 

6518 self._snapshot_for_resize(ctxt, snapshot_id, instance) 

6519 

6520 block_device_info = self._get_instance_block_device_info( 

6521 ctxt, instance, bdms=bdms) 

6522 

6523 # If something fails at this point the instance must go to ERROR 

6524 # status for operator intervention or to reboot/rebuild the instance. 

6525 with self._error_out_instance_on_exception( 

6526 ctxt, instance, instance_state=vm_states.ERROR): 

6527 

6528 # Destroy the guest on the source host which will disconnect 

6529 # volumes and unplug VIFs. Note that we DO NOT destroy disks since 

6530 # we want to leave those on the source host in case of a later 

6531 # failure and disks are needed to recover the guest or in case the 

6532 # resize is reverted. 

6533 LOG.debug('Destroying guest on source host but retaining disks.', 

6534 instance=instance) 

6535 self.driver.destroy( 

6536 ctxt, instance, network_info, 

6537 block_device_info=block_device_info, destroy_disks=False) 

6538 

6539 # At this point the volumes are disconnected from this source host. 

6540 # Delete the old volume attachment records and create new empty 

6541 # ones which will be used later if the resize is reverted. 

6542 LOG.debug('Deleting volume attachments for the source host.', 

6543 instance=instance) 

6544 self._terminate_volume_connections(ctxt, instance, bdms) 

6545 

6546 # At this point the VIFs are unplugged from this source host. 

6547 # Activate the dest host port bindings created by conductor. 

6548 self.network_api.migrate_instance_start(ctxt, instance, migration) 

6549 

6550 # Update the migration status from "migrating" to "post-migrating". 

6551 migration.status = 'post-migrating' 

6552 migration.save() 

6553 

6554 # At this point, the traditional resize_instance would update the 

6555 # instance host/node values to point at the dest host/node because 

6556 # that is where the disk is transferred during resize_instance, but 

6557 # with cross-cell resize the instance is not yet at the dest host 

6558 # so we do not make that update here. 

6559 instance.task_state = task_states.RESIZE_MIGRATED 

6560 instance.save(expected_task_state=task_states.RESIZE_MIGRATING) 

6561 

6562 self._send_resize_instance_notifications( 

6563 ctxt, instance, bdms, network_info, 

6564 fields.NotificationPhase.END) 

6565 self.instance_events.clear_events_for_instance(instance) 

6566 

6567 @wrap_exception() 

6568 @reverts_task_state 

6569 @wrap_instance_event(prefix='compute') 

6570 @wrap_instance_fault 

6571 def resize_instance(self, context, instance, image, 

6572 migration, flavor, clean_shutdown, 

6573 request_spec): 

6574 """Starts the migration of a running instance to another host. 

6575 

6576 This is initiated from the destination host's ``prep_resize`` routine 

6577 and runs on the source host. 

6578 """ 

6579 try: 

6580 self._resize_instance(context, instance, image, migration, 

6581 flavor, clean_shutdown, request_spec) 

6582 except Exception: 

6583 with excutils.save_and_reraise_exception(): 

6584 self._revert_allocation(context, instance, migration) 

6585 

6586 def _resize_instance( 

6587 self, context, instance, image, migration, flavor, 

6588 clean_shutdown, request_spec, 

6589 ): 

6590 # Pass instance_state=instance.vm_state because we can resize 

6591 # a STOPPED server and we don't want to set it back to ACTIVE 

6592 # in case migrate_disk_and_power_off raises InstanceFaultRollback. 

6593 instance_state = instance.vm_state 

6594 with self._error_out_instance_on_exception( 

6595 context, instance, instance_state=instance_state), \ 

6596 errors_out_migration_ctxt(migration): 

6597 network_info = self.network_api.get_instance_nw_info(context, 

6598 instance) 

6599 

6600 migration.status = 'migrating' 

6601 migration.save() 

6602 

6603 instance.task_state = task_states.RESIZE_MIGRATING 

6604 instance.save(expected_task_state=task_states.RESIZE_PREP) 

6605 

6606 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( 

6607 context, instance.uuid) 

6608 self._send_resize_instance_notifications( 

6609 context, instance, bdms, network_info, 

6610 fields.NotificationPhase.START) 

6611 

6612 block_device_info = self._get_instance_block_device_info( 

6613 context, instance, bdms=bdms) 

6614 

6615 timeout, retry_interval = self._get_power_off_values( 

6616 instance, clean_shutdown) 

6617 disk_info = self.driver.migrate_disk_and_power_off( 

6618 context, instance, migration.dest_host, 

6619 flavor, network_info, 

6620 block_device_info, 

6621 timeout, retry_interval) 

6622 

6623 self._terminate_volume_connections(context, instance, bdms) 

6624 

6625 self.network_api.migrate_instance_start(context, 

6626 instance, 

6627 migration) 

6628 

6629 migration.status = 'post-migrating' 

6630 migration.save() 

6631 

6632 instance.host = migration.dest_compute 

6633 instance.node = migration.dest_node 

6634 # NOTE(danms): This could fail if the dest_compute_id was not set 

6635 # by the sending host *and* migration.dest_node does not point to 

6636 # a legit node on the dest_compute. Since that would be very bad, 

6637 # let the ComputeHostNotFound raise from here. 

6638 instance.compute_id = migration.get_dest_compute_id() 

6639 # NOTE(gibi): as the instance now tracked on the destination we 

6640 # have to make sure that the source compute resource track can 

6641 # track this instance as a migration. For that the resource tracker 

6642 # needs to see the old_flavor set on the instance. The old_flavor 

6643 # setting used to be done on the destination host in finish_resize 

6644 # but that is racy with a source host update_available_resource 

6645 # periodic run 

6646 instance.old_flavor = instance.flavor 

6647 instance.task_state = task_states.RESIZE_MIGRATED 

6648 instance.save(expected_task_state=task_states.RESIZE_MIGRATING) 

6649 

6650 # RPC cast to the destination host to finish the resize/migration. 

6651 self.compute_rpcapi.finish_resize(context, instance, 

6652 migration, image, disk_info, migration.dest_compute, 

6653 request_spec) 

6654 

6655 self._send_resize_instance_notifications( 

6656 context, instance, bdms, network_info, 

6657 fields.NotificationPhase.END) 

6658 self.instance_events.clear_events_for_instance(instance) 

6659 

6660 def _send_resize_instance_notifications( 

6661 self, context, instance, bdms, network_info, phase): 

6662 """Send "resize.(start|end)" notifications. 

6663 

6664 :param context: nova auth request context 

6665 :param instance: The instance being resized 

6666 :param bdms: BlockDeviceMappingList for the BDMs associated with the 

6667 instance 

6668 :param network_info: NetworkInfo for the instance info cache of ports 

6669 :param phase: The phase of the action (NotificationPhase enum, either 

6670 ``start`` or ``end``) 

6671 """ 

6672 action = fields.NotificationAction.RESIZE 

6673 # Send the legacy unversioned notification. 

6674 self._notify_about_instance_usage( 

6675 context, instance, "%s.%s" % (action, phase), 

6676 network_info=network_info) 

6677 # Send the versioned notification. 

6678 compute_utils.notify_about_instance_action( 

6679 context, instance, self.host, action=action, phase=phase, 

6680 bdms=bdms) 

6681 
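The helper above pairs every phase with two emissions: the legacy unversioned "resize.<phase>" event and the versioned RESIZE notification. A minimal sketch of that pairing follows; emit_legacy and emit_versioned are hypothetical stand-ins for the real nova helpers, not nova APIs.

    # Hedged sketch: emit_legacy and emit_versioned stand in for
    # _notify_about_instance_usage and compute_utils.notify_about_instance_action.
    def send_action_notifications(emit_legacy, emit_versioned, action, phase):
        # Legacy event names are "<action>.<phase>", e.g. "resize.start".
        emit_legacy("%s.%s" % (action, phase))
        # The versioned notification carries action and phase as fields.
        emit_versioned(action=action, phase=phase)

    send_action_notifications(print, lambda **kw: print(kw), "resize", "start")
    send_action_notifications(print, lambda **kw: print(kw), "resize", "end")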

6682 def _terminate_volume_connections(self, context, instance, bdms): 

6683 connector = None 

6684 for bdm in bdms: 

6685 if bdm.is_volume: 

6686 if bdm.attachment_id: 

6687 # NOTE(jdg): So here's the thing, the idea behind the new 

6688 # attach APIs was to have a new code fork/path that we 

6689 # followed, we're not going to do that so we have to do 

6690 # some extra work in here to make it *behave* just like the 

6691 # old code. Cinder doesn't allow disconnecting/reconnecting 

6692 # attachments in the new attach code (you just delete the 

6693 # attachment and get a new one) so we have to do 

6694 # a delete and create without a connector (reserve), 

6695 # in other words, beware 

6696 attachment_id = self.volume_api.attachment_create( 

6697 context, bdm.volume_id, instance.uuid)['id'] 

6698 self.volume_api.attachment_delete(context, 

6699 bdm.attachment_id) 

6700 bdm.attachment_id = attachment_id 

6701 bdm.save() 

6702 

6703 else: 

6704 if connector is None: 

6705 connector = self.driver.get_volume_connector(instance) 

6706 self.volume_api.terminate_connection(context, 

6707 bdm.volume_id, 

6708 connector) 

6709 
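A self-contained sketch of the attachment swap described in the NOTE above: volumes using the new-style attachment API get a fresh attachment reserved without a connector, the old attachment is deleted, and the BDM keeps the new id, while old-style volumes simply have their connection terminated. FakeVolumeAPI and the dict-based BDMs are stand-ins for illustration only, not Cinder or nova objects.

    import uuid

    class FakeVolumeAPI:
        """Stand-in for the Cinder-backed volume_api; only the calls used here."""
        def attachment_create(self, context, volume_id, instance_uuid):
            # No connector supplied, so this is effectively a reserve.
            return {'id': str(uuid.uuid4())}
        def attachment_delete(self, context, attachment_id):
            print('deleted attachment %s' % attachment_id)
        def terminate_connection(self, context, volume_id, connector):
            print('terminated connection for %s' % volume_id)

    def terminate_volume_connections(volume_api, get_connector, context,
                                     instance_uuid, bdms):
        connector = None
        for bdm in bdms:
            if not bdm['is_volume']:
                continue
            if bdm.get('attachment_id'):
                # New-style attachments: reserve a new one, delete the old one.
                new_id = volume_api.attachment_create(
                    context, bdm['volume_id'], instance_uuid)['id']
                volume_api.attachment_delete(context, bdm['attachment_id'])
                bdm['attachment_id'] = new_id
            else:
                # Old-style flow: terminate using the host connector.
                if connector is None:
                    connector = get_connector()
                volume_api.terminate_connection(context, bdm['volume_id'],
                                                connector)

    bdms = [{'is_volume': True, 'volume_id': 'vol-1', 'attachment_id': 'old'}]
    terminate_volume_connections(FakeVolumeAPI(), dict, None, 'inst-uuid', bdms)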

6710 @staticmethod 

6711 def _set_instance_info(instance, flavor): 

6712 instance.instance_type_id = flavor.id 

6713 instance.memory_mb = flavor.memory_mb 

6714 instance.vcpus = flavor.vcpus 

6715 instance.root_gb = flavor.root_gb 

6716 instance.ephemeral_gb = flavor.ephemeral_gb 

6717 instance.flavor = flavor 

6718 

6719 def _update_volume_attachments(self, context, instance, bdms): 

6720 """Updates volume attachments using the virt driver host connector. 

6721 

6722 :param context: nova.context.RequestContext - user request context 

6723 :param instance: nova.objects.Instance 

6724 :param bdms: nova.objects.BlockDeviceMappingList - the list of block 

6725 device mappings for the given instance 

6726 """ 

6727 if bdms: 

6728 connector = None 

6729 for bdm in bdms: 

6730 if bdm.is_volume and bdm.attachment_id: 

6731 if connector is None:  [6731 ↛ 6733: line 6731 didn't jump to line 6733 because the condition on line 6731 was always true] 

6732 connector = self.driver.get_volume_connector(instance) 

6733 self.volume_api.attachment_update( 

6734 context, bdm.attachment_id, connector, bdm.device_name) 

6735 

6736 def _complete_volume_attachments(self, context, bdms): 

6737 """Completes volume attachments for the instance 

6738 

6739 :param context: nova.context.RequestContext - user request context 

6740 :param bdms: nova.objects.BlockDeviceMappingList - the list of block 

6741 device mappings for the given instance 

6742 """ 

6743 if bdms: 

6744 for bdm in bdms: 

6745 if bdm.is_volume and bdm.attachment_id: 

6746 self.volume_api.attachment_complete( 

6747 context, bdm.attachment_id) 

6748 
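Taken together, _update_volume_attachments and _complete_volume_attachments bracket the driver call on the destination host: attachments are first pointed at this host's connector, the guest is connected, and only afterwards are the attachments marked complete. Below is a rough sketch of that ordering; volume_api, connector and connect_guest are placeholders for the real collaborators, not nova objects.

    def finish_move_volumes(volume_api, connector, context, bdms, connect_guest):
        # Only volume BDMs with new-style attachments participate.
        vol_bdms = [b for b in bdms if b['is_volume'] and b.get('attachment_id')]
        # 1. Update each attachment with the destination host connector so the
        #    refreshed connection_info is usable by the virt driver.
        for bdm in vol_bdms:
            volume_api.attachment_update(
                context, bdm['attachment_id'], connector, bdm.get('device_name'))
        # 2. Let the driver connect volumes and finish the guest on this host.
        connect_guest()
        # 3. Mark the attachments complete once the guest is wired up.
        for bdm in vol_bdms:
            volume_api.attachment_complete(context, bdm['attachment_id'])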

6749 def _finish_resize(self, context, instance, migration, disk_info, 

6750 image_meta, bdms, request_spec): 

6751 resize_instance = False # indicates whether disks need to be resized 

6752 old_instance_type_id = migration.old_instance_type_id 

6753 new_instance_type_id = migration.new_instance_type_id 

6754 old_flavor = instance.flavor # the current flavor is now old 

6755 # NOTE(mriedem): Get the old_vm_state so we know if we should 

6756 # power on the instance. If old_vm_state is not set we need to default 

6757 # to ACTIVE for backwards compatibility 

6758 old_vm_state = instance.system_metadata.get('old_vm_state', 

6759 vm_states.ACTIVE) 

6760 # NOTE(gibi): this is already set by the resize_instance on the source 

6761 # node before calling finish_resize on the destination, but during an 

6762 # upgrade the source node may not have the fix for bug 1944759 

6763 # yet. This assignment can be removed in Z release. 

6764 instance.old_flavor = old_flavor 

6765 

6766 if old_instance_type_id != new_instance_type_id:  [6766 ↛ 6775: line 6766 didn't jump to line 6775 because the condition on line 6766 was always true] 

6767 new_flavor = instance.new_flavor # this is set in _prep_resize 

6768 # Set the flavor-related fields on the instance object including 

6769 # making instance.flavor = new_flavor. 

6770 self._set_instance_info(instance, new_flavor) 

6771 for key in ('root_gb', 'swap', 'ephemeral_gb'): 

6772 if old_flavor[key] != new_flavor[key]: 

6773 resize_instance = True 

6774 break 

6775 instance.apply_migration_context() 

6776 

6777 # NOTE(tr3buchet): setup networks on destination host 

6778 self.network_api.setup_networks_on_host(context, instance, 

6779 migration.dest_compute) 

6780 provider_mappings = self._get_request_group_mapping(request_spec) 

6781 

6782 # For neutron, migrate_instance_finish updates port bindings for this 

6783 # host including any PCI devices claimed for SR-IOV ports. 

6784 self.network_api.migrate_instance_finish( 

6785 context, instance, migration, provider_mappings) 

6786 

6787 network_info = self.network_api.get_instance_nw_info(context, instance) 

6788 

6789 instance.task_state = task_states.RESIZE_FINISH 

6790 instance.save(expected_task_state=task_states.RESIZE_MIGRATED) 

6791 

6792 self._send_finish_resize_notifications( 

6793 context, instance, bdms, network_info, 

6794 fields.NotificationPhase.START) 

6795 

6796 # We need to update any volume attachments using the destination 

6797 # host connector so that we can update the BDM.connection_info 

6798 # before calling driver.finish_migration otherwise the driver 

6799 # won't know how to connect the volumes to this host. 

6800 # Note that _get_instance_block_device_info with 

6801 # refresh_conn_info=True will update the BDM.connection_info value 

6802 # in the database so we must do this before calling that method. 

6803 self._update_volume_attachments(context, instance, bdms) 

6804 

6805 block_device_info = self._get_instance_block_device_info( 

6806 context, instance, refresh_conn_info=True, bdms=bdms) 

6807 

6808 # NOTE(mriedem): If the original vm_state was STOPPED, we don't 

6809 # automatically power on the instance after it's migrated 

6810 power_on = old_vm_state != vm_states.STOPPED 

6811 

6812 # NOTE(sbauza): During a migration, the original allocation is against 

6813 # the migration UUID while the target allocation (for the destination 

6814 # node) is related to the instance UUID, so here we need to pass the 

6815 # new ones. 

6816 allocations = self.reportclient.get_allocs_for_consumer( 

6817 context, instance.uuid)['allocations'] 

6818 

6819 try: 

6820 self.driver.finish_migration(context, migration, instance, 

6821 disk_info, 

6822 network_info, 

6823 image_meta, resize_instance, 

6824 allocations, 

6825 block_device_info, power_on) 

6826 except Exception: 

6827 # Note that we do not rollback port bindings to the source host 

6828 # because resize_instance (on the source host) updated the 

6829 # instance.host to point to *this* host (the destination host) 

6830 # so the port bindings pointing at this host are correct even 

6831 # though we failed to create the guest. 

6832 with excutils.save_and_reraise_exception(): 

6833 # If we failed to create the guest on this host, reset the 

6834 # instance flavor-related fields to the old flavor. An 

6835 # error handler like reverts_task_state will save the changes. 

6836 if old_instance_type_id != new_instance_type_id:  [6836 ↛ 6840: line 6836 didn't jump to line 6840] 

6837 self._set_instance_info(instance, old_flavor) 

6838 

6839 # Now complete any volume attachments that were previously updated. 

6840 self._complete_volume_attachments(context, bdms) 

6841 

6842 migration.status = 'finished' 

6843 migration.save() 

6844 

6845 instance.vm_state = vm_states.RESIZED 

6846 instance.task_state = None 

6847 instance.launched_at = timeutils.utcnow() 

6848 instance.save(expected_task_state=task_states.RESIZE_FINISH) 

6849 

6850 return network_info 

6851 
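As the NOTE(sbauza) comment in _finish_resize points out, during a resize the source-node usage is held by the migration UUID while the destination-node usage is held by the instance UUID, which is why the destination host asks placement for the instance consumer's allocations before spawning. A toy illustration of that lookup against an in-memory stand-in for placement; the UUIDs and resource provider names are hypothetical.

    # Hypothetical allocation data keyed by consumer UUID, shaped loosely like
    # a placement "allocations" response.
    placement = {
        'migration-uuid': {'allocations': {'source-rp': {'resources': {'VCPU': 2}}}},
        'instance-uuid': {'allocations': {'dest-rp': {'resources': {'VCPU': 4}}}},
    }

    def get_allocs_for_consumer(consumer_uuid):
        # Stand-in for reportclient.get_allocs_for_consumer(context, uuid).
        return placement.get(consumer_uuid, {'allocations': {}})

    dest_allocations = get_allocs_for_consumer('instance-uuid')['allocations']
    assert 'dest-rp' in dest_allocations and 'source-rp' not in dest_allocations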

6852 @wrap_exception() 

6853 @reverts_task_state 

6854 @wrap_instance_event(prefix='compute') 

6855 @errors_out_migration 

6856 @wrap_instance_fault 

6857 def finish_resize(self, context, disk_info, image, instance, 

6858 migration, request_spec): 

6859 """Completes the migration process. 

6860 

6861 Sets up the newly transferred disk and turns on the instance at its 

6862 new host machine. 

6863 

6864 """ 

6865 try: 

6866 self._finish_resize_helper(context, disk_info, image, instance, 

6867 migration, request_spec) 

6868 except Exception: 

6869 with excutils.save_and_reraise_exception(): 

6870 # At this point, resize_instance (which runs on the source) has 

6871 # already updated the instance host/node values to point to 

6872 # this (the dest) compute, so we need to leave the allocations 

6873 # against the dest node resource provider intact and drop the 

6874 # allocations against the source node resource provider. If the 

6875 # user tries to recover the server by hard rebooting it, it 

6876 # will happen on this host so that's where the allocations 

6877 # should go. Note that this is the same method called from 

6878 # confirm_resize to cleanup the source node allocations held 

6879 # by the migration record. 

6880 LOG.info('Deleting allocations for old flavor on source node ' 

6881 '%s after finish_resize failure. You may be able to ' 

6882 'recover the instance by hard rebooting it.', 

6883 migration.source_compute, instance=instance) 

6884 self._delete_allocation_after_move( 

6885 context, instance, migration) 

6886 

6887 def _finish_resize_helper(self, context, disk_info, image, instance, 

6888 migration, request_spec): 

6889 """Completes the migration process. 

6890 

6891 The caller must revert the instance's allocations if the migration 

6892 process failed. 

6893 """ 

6894 bdms = self._update_bdm_for_swap_to_finish_resize(context, instance) 

6895 

6896 with self._error_out_instance_on_exception(context, instance): 

6897 image_meta = objects.ImageMeta.from_dict(image) 

6898 network_info = self._finish_resize(context, instance, migration, 

6899 disk_info, image_meta, bdms, 

6900 request_spec) 

6901 

6902 # TODO(melwitt): We should clean up instance console tokens here. The 

6903 # instance is on a new host and will need to establish a new console 

6904 # connection. 

6905 self._update_scheduler_instance_info(context, instance) 

6906 self._send_finish_resize_notifications( 

6907 context, instance, bdms, network_info, 

6908 fields.NotificationPhase.END) 

6909 

6910 def _send_finish_resize_notifications( 

6911 self, context, instance, bdms, network_info, phase): 

6912 """Send notifications for the finish_resize flow. 

6913 

6914 :param context: nova auth request context 

6915 :param instance: The instance being resized 

6916 :param bdms: BlockDeviceMappingList for the BDMs associated with the 

6917 instance 

6918 :param network_info: NetworkInfo for the instance info cache of ports 

6919 :param phase: The phase of the action (NotificationPhase enum, either 

6920 ``start`` or ``end``) 

6921 """ 

6922 # Send the legacy unversioned notification. 

6923 self._notify_about_instance_usage( 

6924 context, instance, "finish_resize.%s" % phase, 

6925 network_info=network_info) 

6926 # Send the versioned notification. 

6927 compute_utils.notify_about_instance_action( 

6928 context, instance, self.host, 

6929 action=fields.NotificationAction.RESIZE_FINISH, phase=phase, 

6930 bdms=bdms) 

6931 

6932 @wrap_exception() 

6933 @reverts_task_state 

6934 @wrap_instance_event(prefix='compute') 

6935 @errors_out_migration 

6936 @wrap_instance_fault 

6937 def finish_snapshot_based_resize_at_dest( 

6938 self, ctxt, instance, migration, snapshot_id): 

6939 """Finishes the snapshot-based resize at the destination compute. 

6940 

6941 Sets up block devices and networking on the destination compute and 

6942 spawns the guest. 

6943 

6944 :param ctxt: nova auth request context targeted at the target cell DB 

6945 :param instance: The Instance object being resized with the 

6946 ``migration_context`` field set. Upon successful completion of this 

6947 method the vm_state should be "resized", the task_state should be 

6948 None, and migration context, host/node and flavor-related fields 

6949 should be set on the instance. 

6950 :param migration: The Migration object for this resize operation. Upon 

6951 successful completion of this method the migration status should 

6952 be "finished". 

6953 :param snapshot_id: ID of the image snapshot created for a 

6954 non-volume-backed instance, else None. 

6955 """ 

6956 LOG.info('Finishing snapshot based resize on destination host %s.', 

6957 self.host, instance=instance) 

6958 with self._error_out_instance_on_exception(ctxt, instance): 

6959 # Note that if anything fails here, the migration-based allocations 

6960 # created in conductor should be reverted by conductor as well, 

6961 # see MigrationTask.rollback. 

6962 self._finish_snapshot_based_resize_at_dest( 

6963 ctxt, instance, migration, snapshot_id) 

6964 

6965 def _finish_snapshot_based_resize_at_dest( 

6966 self, ctxt, instance, migration, snapshot_id): 

6967 """Private variant of finish_snapshot_based_resize_at_dest so the 

6968 caller can handle reverting resource allocations on failure and perform 

6969 other generic error handling. 

6970 """ 

6971 # Figure out the image metadata to use when spawning the guest. 

6972 origin_image_ref = instance.image_ref 

6973 if snapshot_id: 

6974 instance.image_ref = snapshot_id 

6975 image_meta = objects.ImageMeta.from_image_ref( 

6976 ctxt, self.image_api, snapshot_id) 

6977 else: 

6978 # Just use what is already on the volume-backed instance. 

6979 image_meta = instance.image_meta 

6980 

6981 resize = migration.migration_type == 'resize' 

6982 instance.old_flavor = instance.flavor 

6983 if resize:  [6983 ↛ 6991: line 6983 didn't jump to line 6991 because the condition on line 6983 was always true] 

6984 flavor = instance.new_flavor 

6985 # If we are resizing to a new flavor we need to set the 

6986 # flavor-related fields on the instance. 

6987 # NOTE(mriedem): This is likely where storing old/new_flavor on 

6988 # the MigrationContext would make this cleaner. 

6989 self._set_instance_info(instance, flavor) 

6990 

6991 instance.apply_migration_context() 

6992 instance.task_state = task_states.RESIZE_FINISH 

6993 instance.save(expected_task_state=task_states.RESIZE_MIGRATED) 

6994 

6995 # This seems a bit late to be sending the start notification but 

6996 # it is what traditional resize has always done as well and it does 

6997 # contain the changes to the instance with the new_flavor and 

6998 # task_state. 

6999 bdms = instance.get_bdms() 

7000 network_info = instance.get_network_info() 

7001 self._send_finish_resize_notifications( 

7002 ctxt, instance, bdms, network_info, 

7003 fields.NotificationPhase.START) 

7004 

7005 # Setup volumes and networking and spawn the guest in the hypervisor. 

7006 self._finish_snapshot_based_resize_at_dest_spawn( 

7007 ctxt, instance, migration, image_meta, bdms) 

7008 

7009 # If we spawned from a temporary snapshot image we can delete that now, 

7010 # similar to how unshelve works. 

7011 if snapshot_id: 

7012 instance.image_ref = origin_image_ref 

7013 compute_utils.delete_image( 

7014 ctxt, instance, self.image_api, snapshot_id) 

7015 

7016 migration.status = 'finished' 

7017 migration.save() 

7018 

7019 self._update_instance_after_spawn(instance, vm_state=vm_states.RESIZED) 

7020 # Setting the host/node values will make the ResourceTracker continue 

7021 # to track usage for this instance on this host. 

7022 instance.host = migration.dest_compute 

7023 # NOTE(danms): This could fail if we somehow ended up on the wrong 

7024 # host without the desired node. That's a big problem, so let the 

7025 # ComputeHostNotFound raise from here. 

7026 cn = self.rt.get_node_by_name(migration.dest_node) 

7027 instance.compute_id = cn.id 

7028 instance.node = cn.hypervisor_hostname 

7029 instance.save(expected_task_state=task_states.RESIZE_FINISH) 

7030 

7031 # Broadcast to all schedulers that the instance is on this host. 

7032 self._update_scheduler_instance_info(ctxt, instance) 

7033 self._send_finish_resize_notifications( 

7034 ctxt, instance, bdms, network_info, 

7035 fields.NotificationPhase.END) 

7036 

7037 def _finish_snapshot_based_resize_at_dest_spawn( 

7038 self, ctxt, instance, migration, image_meta, bdms): 

7039 """Sets up volumes and networking and spawns the guest on the dest host 

7040 

7041 If the instance was stopped when the resize was initiated the guest 

7042 will be created but remain in a shutdown power state. 

7043 

7044 If the spawn fails, port bindings are rolled back to the source host 

7045 and volume connections are terminated for this dest host. 

7046 

7047 :param ctxt: nova auth request context 

7048 :param instance: Instance object being migrated 

7049 :param migration: Migration object for the operation 

7050 :param image_meta: ImageMeta object used during driver.spawn 

7051 :param bdms: BlockDeviceMappingList of BDMs for the instance 

7052 """ 

7053 # Update the volume attachments using this host's connector. 

7054 # That will update the BlockDeviceMapping.connection_info which 

7055 # will be used to connect the volumes on this host during spawn(). 

7056 block_device_info = self._prep_block_device(ctxt, instance, bdms) 

7057 

7058 allocations = self.reportclient.get_allocations_for_consumer( 

7059 ctxt, instance.uuid) 

7060 

7061 # We do not call self.network_api.setup_networks_on_host here because 

7062 # for neutron that sets up the port migration profile which is only 

7063 # used during live migration with DVR. Yes it is gross knowing what 

7064 # that method does internally. We could change this when bug 1814837 

7065 # is fixed if setup_networks_on_host is made smarter by passing the 

7066 # migration record and the method checks the migration_type. 

7067 

7068 # Activate the port bindings for this host. 

7069 # FIXME(mriedem): We're going to have the same issue as bug 1813789 

7070 # here because this will update the port bindings and send the 

7071 # network-vif-plugged event and that means when driver.spawn waits for 

7072 # it we might have already gotten the event and neutron won't send 

7073 # another one so we could timeout. 

7074 # TODO(mriedem): Calculate provider mappings when we support cross-cell 

7075 # resize/migrate with ports having resource requests. 

7076 self.network_api.migrate_instance_finish( 

7077 ctxt, instance, migration, provider_mappings=None) 

7078 network_info = self.network_api.get_instance_nw_info(ctxt, instance) 

7079 

7080 # If the original vm_state was STOPPED, we do not automatically 

7081 # power on the instance after it is migrated. 

7082 power_on = instance.system_metadata['old_vm_state'] == vm_states.ACTIVE 

7083 try: 

7084 # NOTE(mriedem): If this instance uses a config drive, it will get 

7085 # rebuilt here which means any personality files will be lost, 

7086 # similar to unshelve. If the instance is not using a config drive 

7087 # and getting metadata from the metadata API service, personality 

7088 # files would be lost regardless of the move operation. 

7089 self.driver.spawn( 

7090 ctxt, instance, image_meta, injected_files=[], 

7091 admin_password=None, allocations=allocations, 

7092 network_info=network_info, block_device_info=block_device_info, 

7093 power_on=power_on) 

7094 except Exception: 

7095 with excutils.save_and_reraise_exception(logger=LOG): 

7096 # Rollback port bindings to the source host. 

7097 try: 

7098 # This is gross but migrate_instance_start looks at the 

7099 # migration.dest_compute to determine where to activate the 

7100 # port bindings and we want the source compute port 

7101 # bindings to be re-activated. Remember at this point the 

7102 # instance.host is still pointing at the source compute. 

7103 # TODO(mriedem): Maybe we should be calling 

7104 # setup_instance_network_on_host here to deal with pci 

7105 # devices? 

7106 with utils.temporary_mutation( 

7107 migration, dest_compute=migration.source_compute): 

7108 self.network_api.migrate_instance_start( 

7109 ctxt, instance, migration) 

7110 except Exception: 

7111 LOG.exception( 

7112 'Failed to activate port bindings on the source ' 

7113 'host: %s', migration.source_compute, 

7114 instance=instance) 

7115 

7116 # Rollback volume connections on this host. 

7117 for bdm in bdms: 

7118 if bdm.is_volume: 

7119 try: 

7120 self._remove_volume_connection( 

7121 ctxt, bdm, instance, delete_attachment=True) 

7122 except Exception: 

7123 LOG.exception('Failed to remove volume connection ' 

7124 'on this host %s for volume %s.', 

7125 self.host, bdm.volume_id, 

7126 instance=instance) 

7127 
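The rollback path above leans on utils.temporary_mutation to momentarily point migration.dest_compute back at the source compute so that migrate_instance_start re-activates the source port bindings. A rough sketch of how such a helper can be written; this illustrates the pattern only and is not nova's actual implementation.

    import contextlib

    @contextlib.contextmanager
    def temporary_mutation(obj, **updates):
        # Swap the named attributes, yield, then always restore the originals.
        saved = {name: getattr(obj, name) for name in updates}
        for name, value in updates.items():
            setattr(obj, name, value)
        try:
            yield obj
        finally:
            for name, value in saved.items():
                setattr(obj, name, value)

    class Migration:
        source_compute = 'src-host'
        dest_compute = 'dst-host'

    migration = Migration()
    with temporary_mutation(migration, dest_compute=migration.source_compute):
        # Inside the block the "destination" is the source host, so binding
        # activation targets the right compute.
        assert migration.dest_compute == 'src-host'
    assert migration.dest_compute == 'dst-host'  # restored afterwards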

7128 @wrap_exception() 

7129 @wrap_instance_fault 

7130 def add_fixed_ip_to_instance(self, context, network_id, instance): 

7131 """Calls network_api to add new fixed_ip to instance 

7132 then injects the new network info and resets instance networking. 

7133 

7134 """ 

7135 self._notify_about_instance_usage( 

7136 context, instance, "create_ip.start") 

7137 

7138 network_info = self.network_api.add_fixed_ip_to_instance(context, 

7139 instance, 

7140 network_id) 

7141 self._inject_network_info(instance, network_info) 

7142 

7143 # NOTE(russellb) We just want to bump updated_at. See bug 1143466. 

7144 instance.updated_at = timeutils.utcnow() 

7145 instance.save() 

7146 

7147 self._notify_about_instance_usage( 

7148 context, instance, "create_ip.end", network_info=network_info) 

7149 

7150 @wrap_exception() 

7151 @wrap_instance_fault 

7152 def remove_fixed_ip_from_instance(self, context, address, instance): 

7153 """Calls network_api to remove existing fixed_ip from instance 

7154 by injecting the altered network info and resetting 

7155 instance networking. 

7156 """ 

7157 self._notify_about_instance_usage( 

7158 context, instance, "delete_ip.start") 

7159 

7160 network_info = self.network_api.remove_fixed_ip_from_instance(context, 

7161 instance, 

7162 address) 

7163 self._inject_network_info(instance, network_info) 

7164 

7165 # NOTE(russellb) We just want to bump updated_at. See bug 1143466. 

7166 instance.updated_at = timeutils.utcnow() 

7167 instance.save() 

7168 

7169 self._notify_about_instance_usage( 

7170 context, instance, "delete_ip.end", network_info=network_info) 

7171 

7172 @wrap_exception() 

7173 @reverts_task_state 

7174 @wrap_instance_event(prefix='compute') 

7175 @wrap_instance_fault 

7176 def pause_instance(self, context, instance): 

7177 """Pause an instance on this host.""" 

7178 context = context.elevated() 

7179 LOG.info('Pausing', instance=instance) 

7180 self._notify_about_instance_usage(context, instance, 'pause.start') 

7181 compute_utils.notify_about_instance_action(context, instance, 

7182 self.host, action=fields.NotificationAction.PAUSE, 

7183 phase=fields.NotificationPhase.START) 

7184 self.driver.pause(instance) 

7185 instance.power_state = self._get_power_state(instance) 

7186 instance.vm_state = vm_states.PAUSED 

7187 instance.task_state = None 

7188 instance.save(expected_task_state=task_states.PAUSING) 

7189 self._notify_about_instance_usage(context, instance, 'pause.end') 

7190 compute_utils.notify_about_instance_action(context, instance, 

7191 self.host, action=fields.NotificationAction.PAUSE, 

7192 phase=fields.NotificationPhase.END) 

7193 

7194 @wrap_exception() 

7195 @reverts_task_state 

7196 @wrap_instance_event(prefix='compute') 

7197 @wrap_instance_fault 

7198 def unpause_instance(self, context, instance): 

7199 """Unpause a paused instance on this host.""" 

7200 context = context.elevated() 

7201 LOG.info('Unpausing', instance=instance) 

7202 self._notify_about_instance_usage(context, instance, 'unpause.start') 

7203 compute_utils.notify_about_instance_action(context, instance, 

7204 self.host, action=fields.NotificationAction.UNPAUSE, 

7205 phase=fields.NotificationPhase.START) 

7206 self.driver.unpause(instance) 

7207 instance.power_state = self._get_power_state(instance) 

7208 instance.vm_state = vm_states.ACTIVE 

7209 instance.task_state = None 

7210 instance.save(expected_task_state=task_states.UNPAUSING) 

7211 self._notify_about_instance_usage(context, instance, 'unpause.end') 

7212 compute_utils.notify_about_instance_action(context, instance, 

7213 self.host, action=fields.NotificationAction.UNPAUSE, 

7214 phase=fields.NotificationPhase.END) 

7215 

7216 @wrap_exception() 

7217 def host_power_action(self, context, action): 

7218 """Reboots, shuts down or powers up the host.""" 

7219 return self.driver.host_power_action(action) 

7220 

7221 @wrap_exception() 

7222 def host_maintenance_mode(self, context, host, mode): 

7223 """Start/Stop host maintenance window. On start, it triggers 

7224 evacuation of guest VMs. 

7225 """ 

7226 return self.driver.host_maintenance_mode(host, mode) 

7227 

7228 def _update_compute_provider_status(self, context, enabled): 

7229 """Adds or removes the COMPUTE_STATUS_DISABLED trait for this host. 

7230 

7231 For each ComputeNode managed by this service, adds or removes the 

7232 COMPUTE_STATUS_DISABLED trait to/from the associated resource provider 

7233 in Placement. 

7234 

7235 :param context: nova auth RequestContext 

7236 :param enabled: True if the node is enabled in which case the trait 

7237 would be removed, False if the node is disabled in which case 

7238 the trait would be added. 

7239 :raises: ComputeHostNotFound if there are no compute nodes found in 

7240 the ResourceTracker for this service. 

7241 """ 

7242 # Get the compute node(s) on this host. Remember that ironic can be 

7243 # managing more than one compute node. 

7244 nodes = self.rt.compute_nodes.values() 

7245 if not nodes: 

7246 raise exception.ComputeHostNotFound(host=self.host) 

7247 # For each node, we want to add (or remove) the COMPUTE_STATUS_DISABLED 

7248 # trait on the related resource provider in placement so the scheduler 

7249 # (pre-)filters the provider based on its status. 

7250 for node in nodes: 

7251 try: 

7252 self.virtapi.update_compute_provider_status( 

7253 context, node.uuid, enabled) 

7254 except (exception.ResourceProviderTraitRetrievalFailed, 

7255 exception.ResourceProviderUpdateConflict, 

7256 exception.ResourceProviderUpdateFailed, 

7257 exception.TraitRetrievalFailed) as e: 

7258 # This is best effort so just log a warning and continue. 

7259 LOG.warning('An error occurred while updating ' 

7260 'COMPUTE_STATUS_DISABLED trait on compute node ' 

7261 'resource provider %s. The trait will be ' 

7262 'synchronized when the update_available_resource ' 

7263 'periodic task runs. Error: %s', 

7264 node.uuid, e.format_message()) 

7265 except Exception: 

7266 LOG.exception('An error occurred while updating ' 

7267 'COMPUTE_STATUS_DISABLED trait on compute node ' 

7268 'resource provider %s. The trait will be ' 

7269 'synchronized when the ' 

7270 'update_available_resource periodic task runs.', 

7271 node.uuid) 

7272 
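The enabled flag maps onto a single trait operation per compute node: enabled removes the COMPUTE_STATUS_DISABLED trait from the node's resource provider and disabled adds it, so the scheduler's provider pre-filter can skip disabled hosts. A small sketch of that mapping against an in-memory set of provider traits; the placement client itself is stubbed out entirely.

    COMPUTE_STATUS_DISABLED = 'COMPUTE_STATUS_DISABLED'  # mirrors the os_traits constant

    def sync_provider_status(provider_traits, enabled):
        # enabled=True -> trait removed; enabled=False -> trait added.
        if enabled:
            provider_traits.discard(COMPUTE_STATUS_DISABLED)
        else:
            provider_traits.add(COMPUTE_STATUS_DISABLED)
        return provider_traits

    traits = set()
    sync_provider_status(traits, enabled=False)
    assert COMPUTE_STATUS_DISABLED in traits
    sync_provider_status(traits, enabled=True)
    assert COMPUTE_STATUS_DISABLED not in traits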

7273 @wrap_exception() 

7274 def set_host_enabled(self, context, enabled): 

7275 """Sets the specified host's ability to accept new instances. 

7276 

7277 This method will add or remove the COMPUTE_STATUS_DISABLED trait 

7278 to/from the associated compute node resource provider(s) for this 

7279 compute service. 

7280 """ 

7281 try: 

7282 self._update_compute_provider_status(context, enabled) 

7283 except exception.ComputeHostNotFound: 

7284 LOG.warning('Unable to add/remove trait COMPUTE_STATUS_DISABLED. ' 

7285 'No ComputeNode(s) found for host: %s', self.host) 

7286 

7287 try: 

7288 return self.driver.set_host_enabled(enabled) 

7289 except NotImplementedError: 

7290 # Only the xenapi driver implements set_host_enabled but we don't 

7291 # want NotImplementedError to get raised back to the API. We still 

7292 # need to honor the compute RPC API contract and return 'enabled' 

7293 # or 'disabled' though. 

7294 return 'enabled' if enabled else 'disabled' 

7295 

7296 @wrap_exception() 

7297 def get_host_uptime(self, context): 

7298 """Returns the result of calling "uptime" on the target host.""" 

7299 return self.driver.get_host_uptime() 

7300 

7301 @wrap_exception() 

7302 @wrap_instance_fault 

7303 def get_diagnostics(self, context, instance): 

7304 """Retrieve diagnostics for an instance on this host.""" 

7305 current_power_state = self._get_power_state(instance) 

7306 if current_power_state == power_state.RUNNING:  [7306 ↛ 7310: line 7306 didn't jump to line 7310 because the condition on line 7306 was always true] 

7307 LOG.info("Retrieving diagnostics", instance=instance) 

7308 return self.driver.get_diagnostics(instance) 

7309 else: 

7310 raise exception.InstanceInvalidState( 

7311 attr='power state', 

7312 instance_uuid=instance.uuid, 

7313 state=power_state.STATE_MAP[instance.power_state], 

7314 method='get_diagnostics') 

7315 

7316 @wrap_exception() 

7317 @wrap_instance_fault 

7318 def get_instance_diagnostics(self, context, instance): 

7319 """Retrieve diagnostics for an instance on this host.""" 

7320 current_power_state = self._get_power_state(instance) 

7321 if current_power_state == power_state.RUNNING:  [7321 ↛ 7325: line 7321 didn't jump to line 7325 because the condition on line 7321 was always true] 

7322 LOG.info("Retrieving diagnostics", instance=instance) 

7323 return self.driver.get_instance_diagnostics(instance) 

7324 else: 

7325 raise exception.InstanceInvalidState( 

7326 attr='power state', 

7327 instance_uuid=instance.uuid, 

7328 state=power_state.STATE_MAP[instance.power_state], 

7329 method='get_instance_diagnostics') 

7330 

7331 @wrap_exception() 

7332 @reverts_task_state 

7333 @wrap_instance_event(prefix='compute') 

7334 @wrap_instance_fault 

7335 def suspend_instance(self, context, instance): 

7336 """Suspend the given instance.""" 

7337 context = context.elevated() 

7338 

7339 # Store the old state 

7340 instance.system_metadata['old_vm_state'] = instance.vm_state 

7341 self._notify_about_instance_usage(context, instance, 'suspend.start') 

7342 compute_utils.notify_about_instance_action(context, instance, 

7343 self.host, action=fields.NotificationAction.SUSPEND, 

7344 phase=fields.NotificationPhase.START) 

7345 with self._error_out_instance_on_exception(context, instance, 

7346 instance_state=instance.vm_state): 

7347 self.driver.suspend(context, instance) 

7348 instance.power_state = self._get_power_state(instance) 

7349 instance.vm_state = vm_states.SUSPENDED 

7350 instance.task_state = None 

7351 instance.save(expected_task_state=task_states.SUSPENDING) 

7352 self._notify_about_instance_usage(context, instance, 'suspend.end') 

7353 compute_utils.notify_about_instance_action(context, instance, 

7354 self.host, action=fields.NotificationAction.SUSPEND, 

7355 phase=fields.NotificationPhase.END) 

7356 

7357 @wrap_exception() 

7358 @reverts_task_state 

7359 @wrap_instance_event(prefix='compute') 

7360 @wrap_instance_fault 

7361 def resume_instance(self, context, instance): 

7362 """Resume the given suspended instance.""" 

7363 context = context.elevated() 

7364 LOG.info('Resuming', instance=instance) 

7365 

7366 self._notify_about_instance_usage(context, instance, 'resume.start') 

7367 

7368 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( 

7369 context, instance.uuid) 

7370 block_device_info = self._get_instance_block_device_info( 

7371 context, instance, bdms=bdms) 

7372 

7373 # This allows passing share_info to the resume operation for 

7374 # future usage. However, this scenario is currently not possible 

7375 # because suspending an instance with a share is not permitted 

7376 # by libvirt. As a result, the suspend action involving a share 

7377 # is blocked by the API. 

7378 share_info = self._get_share_info(context, instance) 

7379 

7380 compute_utils.notify_about_instance_action(context, instance, 

7381 self.host, action=fields.NotificationAction.RESUME, 

7382 phase=fields.NotificationPhase.START, bdms=bdms) 

7383 

7384 network_info = self.network_api.get_instance_nw_info(context, instance) 

7385 

7386 with self._error_out_instance_on_exception(context, instance, 

7387 instance_state=instance.vm_state): 

7388 self.driver.resume(context, instance, network_info, 

7389 block_device_info, share_info) 

7390 

7391 instance.power_state = self._get_power_state(instance) 

7392 

7393 # We default to the ACTIVE state for backwards compatibility 

7394 instance.vm_state = instance.system_metadata.pop('old_vm_state', 

7395 vm_states.ACTIVE) 

7396 

7397 instance.task_state = None 

7398 instance.save(expected_task_state=task_states.RESUMING) 

7399 self._notify_about_instance_usage(context, instance, 'resume.end') 

7400 compute_utils.notify_about_instance_action(context, instance, 

7401 self.host, action=fields.NotificationAction.RESUME, 

7402 phase=fields.NotificationPhase.END, bdms=bdms) 

7403 
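suspend_instance stashes the pre-suspend vm_state in system_metadata and resume_instance pops it back out, falling back to ACTIVE for records written before that key existed. A compact sketch of the round trip, using plain dicts in place of the Instance object:

    ACTIVE, SUSPENDED = 'active', 'suspended'

    def suspend(instance):
        # Remember what the server looked like before suspending it.
        instance['system_metadata']['old_vm_state'] = instance['vm_state']
        instance['vm_state'] = SUSPENDED

    def resume(instance):
        # Default to ACTIVE for backwards compatibility with old records.
        instance['vm_state'] = instance['system_metadata'].pop(
            'old_vm_state', ACTIVE)

    server = {'vm_state': ACTIVE, 'system_metadata': {}}
    suspend(server)
    assert server['vm_state'] == SUSPENDED
    resume(server)
    assert server['vm_state'] == ACTIVE

    # Records without old_vm_state fall back to ACTIVE on resume.
    legacy = {'vm_state': SUSPENDED, 'system_metadata': {}}
    resume(legacy)
    assert legacy['vm_state'] == ACTIVE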

7404 @wrap_exception() 

7405 @reverts_task_state 

7406 @wrap_instance_event(prefix='compute') 

7407 @wrap_instance_fault 

7408 def shelve_instance(self, context, instance, image_id, 

7409 clean_shutdown, accel_uuids): 

7410 """Shelve an instance. 

7411 

7412 This should be used when you want to take a snapshot of the instance. 

7413 It also adds system_metadata that can be used by a periodic task to 

7414 offload the shelved instance after a period of time. 

7415 

7416 :param context: request context 

7417 :param instance: an Instance object 

7418 :param image_id: an image id to snapshot to. 

7419 :param clean_shutdown: give the GuestOS a chance to stop 

7420 :param accel_uuids: the accelerator UUIDs for the instance 

7421 """ 

7422 

7423 @utils.synchronized(instance.uuid) 

7424 def do_shelve_instance(): 

7425 self._shelve_instance(context, instance, image_id, clean_shutdown, 

7426 accel_uuids) 

7427 do_shelve_instance() 

7428 

7429 def _shelve_instance(self, context, instance, image_id, 

7430 clean_shutdown, accel_uuids=None): 

7431 LOG.info('Shelving', instance=instance) 

7432 offload = CONF.shelved_offload_time == 0 

7433 if offload: 

7434 # Get the BDMs early so we can pass them into versioned 

7435 # notifications since _shelve_offload_instance needs the 

7436 # BDMs anyway. 

7437 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( 

7438 context, instance.uuid) 

7439 else: 

7440 bdms = None 

7441 compute_utils.notify_usage_exists(self.notifier, context, instance, 

7442 self.host, current_period=True) 

7443 self._notify_about_instance_usage(context, instance, 'shelve.start') 

7444 compute_utils.notify_about_instance_action(context, instance, 

7445 self.host, action=fields.NotificationAction.SHELVE, 

7446 phase=fields.NotificationPhase.START, bdms=bdms) 

7447 

7448 def update_task_state(task_state, expected_state=task_states.SHELVING): 

7449 shelving_state_map = { 

7450 task_states.IMAGE_PENDING_UPLOAD: 

7451 task_states.SHELVING_IMAGE_PENDING_UPLOAD, 

7452 task_states.IMAGE_UPLOADING: 

7453 task_states.SHELVING_IMAGE_UPLOADING, 

7454 task_states.SHELVING: task_states.SHELVING} 

7455 task_state = shelving_state_map[task_state] 

7456 expected_state = shelving_state_map[expected_state] 

7457 instance.task_state = task_state 

7458 instance.save(expected_task_state=expected_state) 

7459 # Do not attempt a clean shutdown of a paused guest since some 

7460 # hypervisors will fail the clean shutdown if the guest is not 

7461 # running. 

7462 if instance.power_state == power_state.PAUSED: 

7463 clean_shutdown = False 

7464 self._power_off_instance(context, instance, clean_shutdown) 

7465 self.driver.snapshot(context, instance, image_id, update_task_state) 

7466 

7467 instance.system_metadata['shelved_at'] = timeutils.utcnow().isoformat() 

7468 instance.system_metadata['shelved_image_id'] = image_id 

7469 instance.system_metadata['shelved_host'] = self.host 

7470 instance.vm_state = vm_states.SHELVED 

7471 instance.task_state = None 

7472 if offload: 

7473 instance.task_state = task_states.SHELVING_OFFLOADING 

7474 instance.power_state = self._get_power_state(instance) 

7475 instance.save(expected_task_state=[ 

7476 task_states.SHELVING, 

7477 task_states.SHELVING_IMAGE_UPLOADING]) 

7478 

7479 self._notify_about_instance_usage(context, instance, 'shelve.end') 

7480 compute_utils.notify_about_instance_action(context, instance, 

7481 self.host, action=fields.NotificationAction.SHELVE, 

7482 phase=fields.NotificationPhase.END, bdms=bdms) 

7483 

7484 if offload: 

7485 self._shelve_offload_instance( 

7486 context, instance, clean_shutdown=False, bdms=bdms, 

7487 accel_uuids=accel_uuids) 

7488 
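The nested update_task_state callback above exists because driver.snapshot reports the generic image task states (IMAGE_PENDING_UPLOAD, IMAGE_UPLOADING) and shelve needs them remapped onto their SHELVING_* counterparts before saving the instance. A standalone sketch of that remapping, with plain strings standing in for the nova task_states constants:

    # String stand-ins for the relevant nova task_states constants.
    SHELVING = 'shelving'
    IMAGE_PENDING_UPLOAD = 'image_pending_upload'
    IMAGE_UPLOADING = 'image_uploading'
    SHELVING_IMAGE_PENDING_UPLOAD = 'shelving_image_pending_upload'
    SHELVING_IMAGE_UPLOADING = 'shelving_image_uploading'

    SHELVING_STATE_MAP = {
        IMAGE_PENDING_UPLOAD: SHELVING_IMAGE_PENDING_UPLOAD,
        IMAGE_UPLOADING: SHELVING_IMAGE_UPLOADING,
        SHELVING: SHELVING,
    }

    def remap_for_shelve(task_state, expected_state=SHELVING):
        # Translate the generic snapshot states into their shelving variants.
        return SHELVING_STATE_MAP[task_state], SHELVING_STATE_MAP[expected_state]

    assert remap_for_shelve(IMAGE_PENDING_UPLOAD) == (
        SHELVING_IMAGE_PENDING_UPLOAD, SHELVING)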

7489 @wrap_exception() 

7490 @reverts_task_state 

7491 @wrap_instance_event(prefix='compute') 

7492 @wrap_instance_fault 

7493 def shelve_offload_instance(self, context, instance, clean_shutdown, 

7494 accel_uuids): 

7495 """Remove a shelved instance from the hypervisor. 

7496 

7497 This frees up those resources for use by other instances, but may lead 

7498 to slower unshelve times for this instance. This method is used by 

7499 volume backed instances since restoring them doesn't involve the 

7500 potentially large download of an image. 

7501 

7502 :param context: request context 

7503 :param instance: nova.objects.instance.Instance 

7504 :param clean_shutdown: give the GuestOS a chance to stop 

7505 :param accel_uuids: the accelerator UUIDs for the instance 

7506 """ 

7507 

7508 @utils.synchronized(instance.uuid) 

7509 def do_shelve_offload_instance(): 

7510 self._shelve_offload_instance(context, instance, clean_shutdown, 

7511 accel_uuids=accel_uuids) 

7512 do_shelve_offload_instance() 

7513 

7514 def _shelve_offload_instance(self, context, instance, clean_shutdown, 

7515 bdms=None, accel_uuids=None): 

7516 LOG.info('Shelve offloading', instance=instance) 

7517 if bdms is None: 

7518 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( 

7519 context, instance.uuid) 

7520 self._notify_about_instance_usage(context, instance, 

7521 'shelve_offload.start') 

7522 compute_utils.notify_about_instance_action(context, instance, 

7523 self.host, action=fields.NotificationAction.SHELVE_OFFLOAD, 

7524 phase=fields.NotificationPhase.START, bdms=bdms) 

7525 

7526 self._power_off_instance(context, instance, clean_shutdown) 

7527 current_power_state = self._get_power_state(instance) 

7528 network_info = self.network_api.get_instance_nw_info(context, instance) 

7529 

7530 ports_id = [vif['id'] for vif in network_info] 

7531 self.network_api.unbind_ports(context, ports_id, detach=False) 

7532 

7533 block_device_info = self._get_instance_block_device_info(context, 

7534 instance, 

7535 bdms=bdms) 

7536 self.driver.destroy(context, instance, network_info, 

7537 block_device_info) 

7538 

7539 # the instance is going to be removed from the host so we want to 

7540 # terminate all the connections with the volume server and the host 

7541 self._terminate_volume_connections(context, instance, bdms) 

7542 

7543 # NOTE(brinzhang): Free up the accelerator resource occupied 

7544 # in the cyborg service. 

7545 if accel_uuids: 

7546 cyclient = cyborg.get_client(context) 

7547 cyclient.delete_arqs_for_instance(instance.uuid) 

7548 

7549 # Free up the resource allocations in the placement service. 

7550 # This should happen *before* the vm_state is changed to 

7551 # SHELVED_OFFLOADED in case client-side code is polling the API to 

7552 # schedule more instances (or unshelve) once this server is offloaded. 

7553 self.rt.delete_allocation_for_shelve_offloaded_instance(context, 

7554 instance) 

7555 

7556 instance.power_state = current_power_state 

7557 # NOTE(mriedem): The vm_state has to be set before updating the 

7558 # resource tracker, see vm_states.allow_resource_removal(). The 

7559 # host/node values cannot be nulled out until after updating the 

7560 # resource tracker though. 

7561 instance.vm_state = vm_states.SHELVED_OFFLOADED 

7562 instance.task_state = None 

7563 instance.save(expected_task_state=[task_states.SHELVING, 

7564 task_states.SHELVING_OFFLOADING]) 

7565 

7566 # NOTE(ndipanov): Free resources from the resource tracker 

7567 self._update_resource_tracker(context, instance) 

7568 

7569 # NOTE(sfinucan): RPC calls should no longer be attempted against this 

7570 # instance, so ensure any calls result in errors 

7571 self._nil_out_instance_obj_host_and_node(instance) 

7572 instance.save(expected_task_state=None) 

7573 

7574 # TODO(melwitt): We should clean up instance console tokens here. The 

7575 # instance has no host at this point and will need to establish a new 

7576 # console connection in the future after it is unshelved. 

7577 self._delete_scheduler_instance_info(context, instance.uuid) 

7578 self._notify_about_instance_usage(context, instance, 

7579 'shelve_offload.end') 

7580 compute_utils.notify_about_instance_action(context, instance, 

7581 self.host, action=fields.NotificationAction.SHELVE_OFFLOAD, 

7582 phase=fields.NotificationPhase.END, bdms=bdms) 

7583 

7584 @wrap_exception() 

7585 @reverts_task_state 

7586 @wrap_instance_event(prefix='compute') 

7587 @wrap_instance_fault 

7588 def unshelve_instance(self, context, instance, image, 

7589 filter_properties, node, request_spec, accel_uuids): 

7590 """Unshelve the instance. 

7591 

7592 :param context: request context 

7593 :param instance: a nova.objects.instance.Instance object 

7594 :param image: an image to build from. If None we assume a 

7595 volume backed instance. 

7596 :param filter_properties: dict containing limits, retry info etc. 

7597 :param node: target compute node 

7598 :param request_spec: the RequestSpec object used to schedule the 

7599 instance 

7600 :param accel_uuids: the accelerator UUIDs for the instance 

7601 """ 

7602 if filter_properties is None:  [7602 ↛ 7603: line 7602 didn't jump to line 7603 because the condition on line 7602 was never true] 

7603 filter_properties = {} 

7604 

7605 @utils.synchronized(instance.uuid) 

7606 def do_unshelve_instance(): 

7607 self._unshelve_instance( 

7608 context, instance, image, filter_properties, node, 

7609 request_spec, accel_uuids) 

7610 do_unshelve_instance() 

7611 

7612 def _unshelve_instance_key_scrub(self, instance): 

7613 """Remove data from the instance that may cause side effects.""" 

7614 cleaned_keys = dict( 

7615 key_data=instance.key_data, 

7616 auto_disk_config=instance.auto_disk_config) 

7617 instance.key_data = None 

7618 instance.auto_disk_config = False 

7619 return cleaned_keys 

7620 

7621 def _unshelve_instance_key_restore(self, instance, keys): 

7622 """Restore previously scrubbed keys before saving the instance.""" 

7623 instance.update(keys) 

7624 

7625 def _unshelve_instance(self, context, instance, image, filter_properties, 

7626 node, request_spec, accel_uuids): 

7627 LOG.info('Unshelving', instance=instance) 

7628 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( 

7629 context, instance.uuid) 

7630 

7631 self._notify_about_instance_usage(context, instance, 'unshelve.start') 

7632 compute_utils.notify_about_instance_action(context, instance, 

7633 self.host, action=fields.NotificationAction.UNSHELVE, 

7634 phase=fields.NotificationPhase.START, bdms=bdms) 

7635 

7636 instance.task_state = task_states.SPAWNING 

7637 instance.save() 

7638 

7639 block_device_info = self._prep_block_device(context, instance, bdms) 

7640 scrubbed_keys = self._unshelve_instance_key_scrub(instance) 

7641 

7642 if node is None:  [7642 ↛ 7643: line 7642 didn't jump to line 7643 because the condition on line 7642 was never true] 

7643 node = self._get_nodename(instance) 

7644 

7645 limits = filter_properties.get('limits', {}) 

7646 

7647 allocations = self.reportclient.get_allocations_for_consumer( 

7648 context, instance.uuid) 

7649 

7650 shelved_image_ref = instance.image_ref 

7651 if image: 

7652 instance.image_ref = image['id'] 

7653 image_meta = objects.ImageMeta.from_dict(image) 

7654 else: 

7655 image_meta = objects.ImageMeta.from_dict( 

7656 utils.get_image_from_system_metadata( 

7657 instance.system_metadata)) 

7658 

7659 provider_mappings = self._get_request_group_mapping(request_spec) 

7660 

7661 try: 

7662 if provider_mappings: 

7663 compute_utils.update_pci_request_with_placement_allocations( 

7664 context, 

7665 self.reportclient, 

7666 instance.pci_requests.requests, 

7667 provider_mappings, 

7668 ) 

7669 

7670 accel_info = [] 

7671 if accel_uuids: 

7672 try: 

7673 accel_info = self._get_bound_arq_resources( 

7674 context, instance, accel_uuids) 

7675 except (Exception, eventlet.timeout.Timeout) as exc: 

7676 LOG.exception('Failure getting accelerator requests ' 

7677 'with the exception: %s', exc, 

7678 instance=instance) 

7679 self._build_resources_cleanup(instance, None) 

7680 raise 

7681 

7682 with self.rt.instance_claim(context, instance, node, allocations, 

7683 limits): 

7684 self.network_api.setup_instance_network_on_host( 

7685 context, instance, self.host, 

7686 provider_mappings=provider_mappings) 

7687 network_info = self.network_api.get_instance_nw_info( 

7688 context, instance) 

7689 

7690 self.driver.spawn(context, instance, image_meta, 

7691 injected_files=[], 

7692 admin_password=None, 

7693 allocations=allocations, 

7694 network_info=network_info, 

7695 block_device_info=block_device_info, 

7696 accel_info=accel_info) 

7697 except Exception: 

7698 with excutils.save_and_reraise_exception(logger=LOG): 

7699 LOG.exception('Instance failed to spawn', 

7700 instance=instance) 

7701 # Set the image_ref back to initial image_ref because instance 

7702 # object might have been saved with image['id'] 

7703 # https://bugs.launchpad.net/nova/+bug/1934094 

7704 instance.image_ref = shelved_image_ref 

7705 # Cleanup allocations created by the scheduler on this host 

7706 # since we failed to spawn the instance. We do this both if 

7707 # the instance claim failed with ComputeResourcesUnavailable 

7708 # or if we did claim but the spawn failed, because aborting the 

7709 # instance claim will not remove the allocations. 

7710 self.reportclient.delete_allocation_for_instance( 

7711 context, instance.uuid, force=True) 

7712 # FIXME: Umm, shouldn't we be rolling back port bindings too? 

7713 self._terminate_volume_connections(context, instance, bdms) 

7714 # The reverts_task_state decorator on unshelve_instance will 

7715 # eventually save these updates. 

7716 self._nil_out_instance_obj_host_and_node(instance) 

7717 

7718 if image: 

7719 instance.image_ref = shelved_image_ref 

7720 self._delete_snapshot_of_shelved_instance(context, instance, 

7721 image['id']) 

7722 

7723 self._unshelve_instance_key_restore(instance, scrubbed_keys) 

7724 self._update_instance_after_spawn(instance) 

7725 # Delete system_metadata for a shelved instance 

7726 compute_utils.remove_shelved_keys_from_system_metadata(instance) 

7727 

7728 instance.save(expected_task_state=task_states.SPAWNING) 

7729 self._update_scheduler_instance_info(context, instance) 

7730 self._notify_about_instance_usage(context, instance, 'unshelve.end') 

7731 compute_utils.notify_about_instance_action(context, instance, 

7732 self.host, action=fields.NotificationAction.UNSHELVE, 

7733 phase=fields.NotificationPhase.END, bdms=bdms) 

7734 

7735 def _inject_network_info(self, instance, network_info): 

7736 """Inject network info for the given instance.""" 

7737 LOG.debug('Inject network info', instance=instance) 

7738 LOG.debug('network_info to inject: |%s|', network_info, 

7739 instance=instance) 

7740 

7741 self.driver.inject_network_info(instance, network_info) 

7742 

7743 @wrap_instance_fault 

7744 def inject_network_info(self, context, instance): 

7745 """Inject network info, but don't return the info.""" 

7746 network_info = self.network_api.get_instance_nw_info(context, instance) 

7747 self._inject_network_info(instance, network_info) 

7748 

7749 @messaging.expected_exceptions(NotImplementedError, 

7750 exception.ConsoleNotAvailable, 

7751 exception.InstanceNotFound) 

7752 @wrap_exception() 

7753 @wrap_instance_fault 

7754 def get_console_output(self, context, instance, tail_length): 

7755 """Send the console output for the given instance.""" 

7756 context = context.elevated() 

7757 LOG.info("Get console output", instance=instance) 

7758 output = self.driver.get_console_output(context, instance) 

7759 

7760 if type(output) is str:  [7760 ↛ 7763: line 7760 didn't jump to line 7763 because the condition on line 7760 was always true] 

7761 output = output.encode("latin-1") 

7762 

7763 if tail_length is not None: 

7764 output = self._tail_log(output, tail_length) 

7765 

7766 return output.decode('ascii', 'replace') 

7767 

7768 def _tail_log(self, log, length): 

7769 try: 

7770 length = int(length) 

7771 except ValueError: 

7772 length = 0 

7773 

7774 if length == 0:  [7774 ↛ 7775: line 7774 didn't jump to line 7775 because the condition on line 7774 was never true] 

7775 return b'' 

7776 else: 

7777 return b'\n'.join(log.split(b'\n')[-int(length):]) 

7778 
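get_console_output encodes the driver output to bytes before handing it to _tail_log, which keeps only the last N newline-separated lines. A standalone sketch of the same tailing logic:

    def tail_log(log: bytes, length) -> bytes:
        # Invalid or zero lengths yield an empty tail, as in _tail_log above.
        try:
            length = int(length)
        except ValueError:
            length = 0
        if length == 0:
            return b''
        return b'\n'.join(log.split(b'\n')[-length:])

    assert tail_log(b'a\nb\nc', 2) == b'b\nc'
    assert tail_log(b'a\nb\nc', 'bogus') == b''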

7779 @messaging.expected_exceptions(exception.ConsoleTypeInvalid, 

7780 exception.InstanceNotReady, 

7781 exception.InstanceNotFound, 

7782 exception.ConsoleTypeUnavailable, 

7783 NotImplementedError) 

7784 @wrap_exception() 

7785 @wrap_instance_fault 

7786 def get_vnc_console(self, context, console_type, instance): 

7787 """Return connection information for a vnc console.""" 

7788 context = context.elevated() 

7789 LOG.debug("Getting vnc console", instance=instance) 

7790 

7791 if not CONF.vnc.enabled: 

7792 raise exception.ConsoleTypeUnavailable(console_type=console_type) 

7793 

7794 if console_type == 'novnc': 

7795 # For essex, novncproxy_base_url must include the full path 

7796 # including the html file (like http://myhost/vnc_auto.html) 

7797 access_url_base = CONF.vnc.novncproxy_base_url 

7798 else: 

7799 raise exception.ConsoleTypeInvalid(console_type=console_type) 

7800 

7801 try: 

7802 # Retrieve connect info from driver, and then decorate with our 

7803 # access info token 

7804 console = self.driver.get_vnc_console(context, instance) 

7805 console_auth = objects.ConsoleAuthToken( 

7806 context=context, 

7807 console_type=console_type, 

7808 host=console.host, 

7809 port=console.port, 

7810 internal_access_path=console.internal_access_path, 

7811 instance_uuid=instance.uuid, 

7812 access_url_base=access_url_base, 

7813 ) 

7814 console_auth.authorize(CONF.consoleauth.token_ttl) 

7815 connect_info = console.get_connection_info( 

7816 console_auth.token, console_auth.access_url) 

7817 

7818 except exception.InstanceNotFound: 

7819 if instance.vm_state != vm_states.BUILDING:  [7819 ↛ 7820: line 7819 didn't jump to line 7820 because the condition on line 7819 was never true] 

7820 raise 

7821 raise exception.InstanceNotReady(instance_id=instance.uuid) 

7822 

7823 return connect_info 

7824 

7825 @messaging.expected_exceptions(exception.ConsoleTypeInvalid, 

7826 exception.InstanceNotReady, 

7827 exception.InstanceNotFound, 

7828 exception.ConsoleTypeUnavailable, 

7829 NotImplementedError) 

7830 @wrap_exception() 

7831 @wrap_instance_fault 

7832 def get_spice_console(self, context, console_type, instance): 

7833 """Return connection information for a spice console.""" 

7834 context = context.elevated() 

7835 LOG.debug("Getting spice console", instance=instance) 

7836 

7837 if not CONF.spice.enabled: 

7838 raise exception.ConsoleTypeUnavailable(console_type=console_type) 

7839 

7840 if console_type not in ['spice-html5', 'spice-direct']: 

7841 raise exception.ConsoleTypeInvalid(console_type=console_type) 

7842 

7843 try: 

7844 # Retrieve connect info from driver, and then decorate with our 

7845 # access info token 

7846 console = self.driver.get_spice_console(context, instance) 

7847 fields = { 

7848 'context': context, 

7849 'console_type': console_type, 

7850 'host': console.host, 

7851 'port': console.port, 

7852 'tls_port': console.tlsPort, 

7853 'instance_uuid': instance.uuid 

7854 } 

7855 if console_type == 'spice-html5':  [7855 ↛ 7858: line 7855 didn't jump to line 7858 because the condition on line 7855 was always true] 

7856 fields['internal_access_path'] = console.internal_access_path 

7857 fields['access_url_base'] = CONF.spice.html5proxy_base_url 

7858 if console_type == 'spice-direct':  [7858 ↛ 7859: line 7858 didn't jump to line 7859 because the condition on line 7858 was never true] 

7859 fields['internal_access_path'] = None 

7860 fields['access_url_base'] = \ 

7861 CONF.spice.spice_direct_proxy_base_url 

7862 

7863 console_auth = objects.ConsoleAuthToken(**fields) 

7864 console_auth.authorize(CONF.consoleauth.token_ttl) 

7865 connect_info = console.get_connection_info( 

7866 console_auth.token, console_auth.access_url) 

7867 

7868 except exception.InstanceNotFound: 

7869 if instance.vm_state != vm_states.BUILDING:  [7869 ↛ 7870: line 7869 didn't jump to line 7870 because the condition on line 7869 was never true] 

7870 raise 

7871 raise exception.InstanceNotReady(instance_id=instance.uuid) 

7872 

7873 return connect_info 

7874 

7875 # TODO(gmann): HyperV virt driver has been removed in Nova 29.0.0 

7876 # but we need to keep this method to avoid an RPC error when an old 

7877 # controller is used with a new compute. This can be removed in RPC API 7.0 

7878 @messaging.expected_exceptions(exception.ConsoleTypeInvalid, 

7879 exception.InstanceNotReady, 

7880 exception.InstanceNotFound, 

7881 exception.ConsoleTypeUnavailable, 

7882 NotImplementedError) 

7883 @wrap_exception() 

7884 @wrap_instance_fault 

7885 def get_rdp_console(self, context, console_type, instance): 

7886 """Return connection information for a RDP console.""" 

7887 

7888 msg = ("RDP console is applicable for HyperV virt driver only which " 

7889 "has been removed in Nova 29.0.0") 

7890 raise NotImplementedError(msg) 

7891 

7892 @messaging.expected_exceptions(exception.ConsoleTypeInvalid, 

7893 exception.InstanceNotReady, 

7894 exception.InstanceNotFound, 

7895 exception.ConsoleTypeUnavailable, 

7896 NotImplementedError) 

7897 @wrap_exception() 

7898 @wrap_instance_fault 

7899 def get_mks_console(self, context, console_type, instance): 

7900 """Return connection information for a MKS console.""" 

7901 context = context.elevated() 

7902 LOG.debug("Getting MKS console", instance=instance) 

7903 

7904 if not CONF.mks.enabled:  [7904 ↛ 7905: line 7904 didn't jump to line 7905 because the condition on line 7904 was never true] 

7905 raise exception.ConsoleTypeUnavailable(console_type=console_type) 

7906 

7907 if console_type != 'webmks':  [7907 ↛ 7908: line 7907 didn't jump to line 7908 because the condition on line 7907 was never true] 

7908 raise exception.ConsoleTypeInvalid(console_type=console_type) 

7909 

7910 try: 

7911 # Retrieve connect info from driver, and then decorate with our 

7912 # access info token 

7913 console = self.driver.get_mks_console(context, instance) 

7914 console_auth = objects.ConsoleAuthToken( 

7915 context=context, 

7916 console_type=console_type, 

7917 host=console.host, 

7918 port=console.port, 

7919 internal_access_path=console.internal_access_path, 

7920 instance_uuid=instance.uuid, 

7921 access_url_base=CONF.mks.mksproxy_base_url, 

7922 ) 

7923 console_auth.authorize(CONF.consoleauth.token_ttl) 

7924 connect_info = console.get_connection_info( 

7925 console_auth.token, console_auth.access_url) 

7926 

7927 except exception.InstanceNotFound: 

7928 if instance.vm_state != vm_states.BUILDING: 

7929 raise 

7930 raise exception.InstanceNotReady(instance_id=instance.uuid) 

7931 

7932 return connect_info 

7933 

7934 @messaging.expected_exceptions( 

7935 exception.ConsoleTypeInvalid, 

7936 exception.InstanceNotReady, 

7937 exception.InstanceNotFound, 

7938 exception.ConsoleTypeUnavailable, 

7939 exception.SocketPortRangeExhaustedException, 

7940 exception.ImageSerialPortNumberInvalid, 

7941 exception.ImageSerialPortNumberExceedFlavorValue, 

7942 NotImplementedError) 

7943 @wrap_exception() 

7944 @wrap_instance_fault 

7945 def get_serial_console(self, context, console_type, instance): 

7946 """Returns connection information for a serial console.""" 

7947 

7948 LOG.debug("Getting serial console", instance=instance) 

7949 

7950 if not CONF.serial_console.enabled: 7950 ↛ 7951 (line 7950 didn't jump to line 7951 because the condition on line 7950 was never true)

7951 raise exception.ConsoleTypeUnavailable(console_type=console_type) 

7952 

7953 context = context.elevated() 

7954 

7955 try: 

7956 # Retrieve connect info from driver, and then decorate with our 

7957 # access info token 

7958 console = self.driver.get_serial_console(context, instance) 

7959 console_auth = objects.ConsoleAuthToken( 

7960 context=context, 

7961 console_type=console_type, 

7962 host=console.host, 

7963 port=console.port, 

7964 internal_access_path=console.internal_access_path, 

7965 instance_uuid=instance.uuid, 

7966 access_url_base=CONF.serial_console.base_url, 

7967 ) 

7968 console_auth.authorize(CONF.consoleauth.token_ttl) 

7969 connect_info = console.get_connection_info( 

7970 console_auth.token, console_auth.access_url) 

7971 

7972 except exception.InstanceNotFound: 

7973 if instance.vm_state != vm_states.BUILDING: 

7974 raise 

7975 raise exception.InstanceNotReady(instance_id=instance.uuid) 

7976 

7977 return connect_info 

7978 

7979 @messaging.expected_exceptions(exception.ConsoleTypeInvalid, 

7980 exception.InstanceNotReady, 

7981 exception.InstanceNotFound) 

7982 @wrap_exception() 

7983 @wrap_instance_fault 

7984 def validate_console_port(self, ctxt, instance, port, console_type): 

7985 if console_type in ["spice-html5", "spice-direct"]: 

7986 console_info = self.driver.get_spice_console(ctxt, instance) 

7987 elif console_type == "serial": 

7988 console_info = self.driver.get_serial_console(ctxt, instance) 

7989 elif console_type == "webmks": 

7990 console_info = self.driver.get_mks_console(ctxt, instance) 

7991 else: 

7992 console_info = self.driver.get_vnc_console(ctxt, instance) 

7993 

7994 # Some drivers may return an int on console_info.port but the port 

7995 # variable in this method is a string, so cast to be sure we are 

7996 # comparing the correct types. 

7997 return str(console_info.port) == port 
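The str() cast above matters because a virt driver may report the console port as an int while the value delivered over RPC is a string; comparing the two directly would always be False. A minimal standalone illustration, where ConsoleInfo is just a stand-in for whatever the driver returns:

    from dataclasses import dataclass

    @dataclass
    class ConsoleInfo:          # stand-in for a driver console object
        port: int

    def port_matches(console_info, requested_port: str) -> bool:
        # Cast so an int from the driver compares equal to the RPC string.
        return str(console_info.port) == requested_port

    assert port_matches(ConsoleInfo(port=5900), "5900")
    assert (ConsoleInfo(port=5900).port == "5900") is False  # why the cast is needed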

7998 

7999 @wrap_exception() 

8000 @reverts_task_state 

8001 @wrap_instance_fault 

8002 def reserve_block_device_name(self, context, instance, device, 

8003 volume_id, disk_bus, device_type, tag, 

8004 multiattach): 

8005 if (tag and not 

8006 self.driver.capabilities.get('supports_tagged_attach_volume', 

8007 False)): 

8008 raise exception.VolumeTaggedAttachNotSupported() 

8009 

8010 if (multiattach and not 

8011 self.driver.capabilities.get('supports_multiattach', False)): 

8012 raise exception.MultiattachNotSupportedByVirtDriver( 

8013 volume_id=volume_id) 

8014 

8015 @utils.synchronized(instance.uuid) 

8016 def do_reserve(): 

8017 bdms = ( 

8018 objects.BlockDeviceMappingList.get_by_instance_uuid( 

8019 context, instance.uuid)) 

8020 

8021 # Now that we have the lock, check that we haven't raced another 

8022 # request and ensure there is no existing attachment 

8023 if any(b for b in bdms if b.volume_id == volume_id): 

8024 msg = _("volume %s already attached") % volume_id 

8025 raise exception.InvalidVolume(reason=msg) 

8026 

8027 # NOTE(ndipanov): We need to explicitly set all the fields on the 

8028 # object so that obj_load_attr does not fail 

8029 new_bdm = objects.BlockDeviceMapping( 

8030 context=context, 

8031 source_type='volume', destination_type='volume', 

8032 instance_uuid=instance.uuid, boot_index=None, 

8033 volume_id=volume_id, 

8034 device_name=device, guest_format=None, 

8035 disk_bus=disk_bus, device_type=device_type, tag=tag) 

8036 

8037 new_bdm.device_name = self._get_device_name_for_instance( 

8038 instance, bdms, new_bdm) 

8039 

8040 # NOTE(vish): create bdm here to avoid race condition 

8041 new_bdm.create() 

8042 return new_bdm 

8043 

8044 return do_reserve() 
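do_reserve() above runs under utils.synchronized(instance.uuid) so that two concurrent reservations for the same instance cannot both pass the "volume already attached" check before either creates its BDM. The sketch below shows the general shape of such a named-lock decorator using only the standard library; it illustrates the pattern, it is not nova's utils.synchronized implementation, and the uuid string is made up.

    import functools
    import threading
    from collections import defaultdict

    _locks = defaultdict(threading.Lock)   # one lock per name (e.g. an instance uuid)

    def synchronized(name):
        """Serialize callers that share the same lock name."""
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                with _locks[name]:
                    return func(*args, **kwargs)
            return wrapper
        return decorator

    @synchronized("11111111-2222-3333-4444-555555555555")  # hypothetical instance uuid
    def do_reserve():
        # critical section: check existing attachments, then create the BDM
        ...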

8045 

8046 @wrap_exception() 

8047 @wrap_instance_event(prefix='compute') 

8048 @wrap_instance_fault 

8049 def attach_volume(self, context, instance, bdm): 

8050 """Attach a volume to an instance.""" 

8051 driver_bdm = driver_block_device.convert_volume(bdm) 

8052 

8053 @utils.synchronized(instance.uuid) 

8054 def do_attach_volume(context, instance, driver_bdm): 

8055 try: 

8056 return self._attach_volume(context, instance, driver_bdm) 

8057 except Exception: 

8058 with excutils.save_and_reraise_exception(): 

8059 bdm.destroy() 

8060 

8061 do_attach_volume(context, instance, driver_bdm) 

8062 

8063 def _attach_volume(self, context, instance, bdm): 

8064 context = context.elevated() 

8065 LOG.info('Attaching volume %(volume_id)s to %(mountpoint)s', 

8066 {'volume_id': bdm.volume_id, 

8067 'mountpoint': bdm['mount_device']}, 

8068 instance=instance) 

8069 compute_utils.notify_about_volume_attach_detach( 

8070 context, instance, self.host, 

8071 action=fields.NotificationAction.VOLUME_ATTACH, 

8072 phase=fields.NotificationPhase.START, 

8073 volume_id=bdm.volume_id) 

8074 try: 

8075 bdm.attach(context, instance, self.volume_api, self.driver, 

8076 do_driver_attach=True) 

8077 except Exception as e: 

8078 with excutils.save_and_reraise_exception(): 

8079 LOG.exception("Failed to attach %(volume_id)s " 

8080 "at %(mountpoint)s", 

8081 {'volume_id': bdm.volume_id, 

8082 'mountpoint': bdm['mount_device']}, 

8083 instance=instance) 

8084 if bdm['attachment_id']: 

8085 # Try to delete the attachment to make the volume 

8086 # available again. Note that DriverVolumeBlockDevice 

8087 # may have already deleted the attachment so ignore 

8088 # VolumeAttachmentNotFound. 

8089 try: 

8090 self.volume_api.attachment_delete( 

8091 context, bdm['attachment_id']) 

8092 except exception.VolumeAttachmentNotFound as exc: 

8093 LOG.debug('Ignoring VolumeAttachmentNotFound: %s', 

8094 exc, instance=instance) 

8095 else: 

8096 self.volume_api.unreserve_volume(context, bdm.volume_id) 

8097 compute_utils.notify_about_volume_attach_detach( 

8098 context, instance, self.host, 

8099 action=fields.NotificationAction.VOLUME_ATTACH, 

8100 phase=fields.NotificationPhase.ERROR, 

8101 exception=e, 

8102 volume_id=bdm.volume_id) 

8103 

8104 info = {'volume_id': bdm.volume_id} 

8105 self._notify_about_instance_usage( 

8106 context, instance, "volume.attach", extra_usage_info=info) 

8107 compute_utils.notify_about_volume_attach_detach( 

8108 context, instance, self.host, 

8109 action=fields.NotificationAction.VOLUME_ATTACH, 

8110 phase=fields.NotificationPhase.END, 

8111 volume_id=bdm.volume_id) 

8112 

8113 def _notify_volume_usage_detach(self, context, instance, bdm): 

8114 if CONF.volume_usage_poll_interval <= 0: 

8115 return 

8116 

8117 mp = bdm.device_name 

8118 # Handle bootable volumes which will not contain /dev/ 

8119 if '/dev/' in mp: 8119 ↛ 8121 (line 8119 didn't jump to line 8121 because the condition on line 8119 was always true)

8120 mp = mp[5:] 

8121 try: 

8122 vol_stats = self.driver.block_stats(instance, mp) 

8123 if vol_stats is None: 

8124 return 

8125 except NotImplementedError: 

8126 return 

8127 

8128 LOG.debug("Updating volume usage cache with totals", instance=instance) 

8129 rd_req, rd_bytes, wr_req, wr_bytes, flush_ops = vol_stats 

8130 vol_usage = objects.VolumeUsage(context) 

8131 vol_usage.volume_id = bdm.volume_id 

8132 vol_usage.instance_uuid = instance.uuid 

8133 vol_usage.project_id = instance.project_id 

8134 vol_usage.user_id = instance.user_id 

8135 vol_usage.availability_zone = instance.availability_zone 

8136 vol_usage.curr_reads = rd_req 

8137 vol_usage.curr_read_bytes = rd_bytes 

8138 vol_usage.curr_writes = wr_req 

8139 vol_usage.curr_write_bytes = wr_bytes 

8140 vol_usage.save(update_totals=True) 

8141 self.notifier.info(context, 'volume.usage', vol_usage.to_dict()) 

8142 compute_utils.notify_about_volume_usage(context, vol_usage, self.host) 
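The mountpoint handling above strips a leading '/dev/' before asking the driver for block stats, because the stats are keyed by the short device name while bootable volumes may already come without the prefix. The [5:] slice drops exactly the five characters of '/dev/'. A tiny standalone illustration:

    def short_device_name(mountpoint: str) -> str:
        # '/dev/vdb' -> 'vdb'; names without the prefix are left alone
        return mountpoint[5:] if '/dev/' in mountpoint else mountpoint

    assert short_device_name('/dev/vdb') == 'vdb'
    assert short_device_name('vda') == 'vda'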

8143 

8144 def _detach_volume(self, context, bdm, instance, destroy_bdm=True, 

8145 attachment_id=None): 

8146 """Detach a volume from an instance. 

8147 

8148 :param context: security context 

8149 :param bdm: nova.objects.BlockDeviceMapping volume bdm to detach 

8150 :param instance: the Instance object to detach the volume from 

8151 :param destroy_bdm: if True, the corresponding BDM entry will be marked 

8152 as deleted. Disabling this is useful for operations 

8153 like rebuild, when we don't want to destroy BDM 

8154 :param attachment_id: The volume attachment_id for the given instance 

8155 and volume. 

8156 """ 

8157 volume_id = bdm.volume_id 

8158 compute_utils.notify_about_volume_attach_detach( 

8159 context, instance, self.host, 

8160 action=fields.NotificationAction.VOLUME_DETACH, 

8161 phase=fields.NotificationPhase.START, 

8162 volume_id=volume_id) 

8163 

8164 self._notify_volume_usage_detach(context, instance, bdm) 

8165 

8166 LOG.info('Detaching volume %(volume_id)s', 

8167 {'volume_id': volume_id}, instance=instance) 

8168 

8169 driver_bdm = driver_block_device.convert_volume(bdm) 

8170 driver_bdm.detach(context, instance, self.volume_api, self.driver, 

8171 attachment_id=attachment_id, destroy_bdm=destroy_bdm) 

8172 

8173 info = dict(volume_id=volume_id) 

8174 self._notify_about_instance_usage( 

8175 context, instance, "volume.detach", extra_usage_info=info) 

8176 compute_utils.notify_about_volume_attach_detach( 

8177 context, instance, self.host, 

8178 action=fields.NotificationAction.VOLUME_DETACH, 

8179 phase=fields.NotificationPhase.END, 

8180 volume_id=volume_id) 

8181 

8182 if 'tag' in bdm and bdm.tag: 

8183 self._delete_disk_metadata(instance, bdm) 

8184 if destroy_bdm: 

8185 bdm.destroy() 

8186 

8187 def _delete_disk_metadata(self, instance, bdm): 

8188 for device in instance.device_metadata.devices: 

8189 if isinstance(device, objects.DiskMetadata): 8189 ↛ 8188 (line 8189 didn't jump to line 8188 because the condition on line 8189 was always true)

8190 if 'serial' in device: 

8191 if device.serial == bdm.volume_id: 8191 ↛ 8188 (line 8191 didn't jump to line 8188 because the condition on line 8191 was always true)

8192 instance.device_metadata.devices.remove(device) 

8193 instance.save() 

8194 break 

8195 else: 

8196 # NOTE(artom) We log the entire device object because all 

8197 # fields are nullable and may not be set 

8198 LOG.warning('Unable to determine whether to clean up ' 

8199 'device metadata for disk %s', device, 

8200 instance=instance) 

8201 

8202 @wrap_exception() 

8203 @wrap_instance_event(prefix='compute') 

8204 @wrap_instance_fault 

8205 def detach_volume(self, context, volume_id, instance, attachment_id): 

8206 """Detach a volume from an instance. 

8207 

8208 :param context: security context 

8209 :param volume_id: the volume id 

8210 :param instance: the Instance object to detach the volume from 

8211 :param attachment_id: The volume attachment_id for the given instance 

8212 and volume. 

8213 

8214 """ 

8215 @utils.synchronized(instance.uuid) 

8216 def do_detach_volume(context, volume_id, instance, attachment_id): 

8217 bdm = objects.BlockDeviceMapping.get_by_volume_and_instance( 

8218 context, volume_id, instance.uuid) 

8219 self._detach_volume(context, bdm, instance, 

8220 attachment_id=attachment_id) 

8221 

8222 do_detach_volume(context, volume_id, instance, attachment_id) 

8223 

8224 def _init_volume_connection(self, context, new_volume, 

8225 old_volume_id, connector, bdm, 

8226 new_attachment_id, mountpoint): 

8227 new_volume_id = new_volume['id'] 

8228 if new_attachment_id is None: 

8229 # We're dealing with an old-style attachment so initialize the 

8230 # connection so we can get the connection_info. 

8231 new_cinfo = self.volume_api.initialize_connection(context, 

8232 new_volume_id, 

8233 connector) 

8234 else: 

8235 # Check for multiattach on the new volume and if True, check to 

8236 # see if the virt driver supports multiattach. 

8237 # TODO(mriedem): This is copied from DriverVolumeBlockDevice 

8238 # and should be consolidated into some common code at some point. 

8239 vol_multiattach = new_volume.get('multiattach', False) 

8240 virt_multiattach = self.driver.capabilities.get( 

8241 'supports_multiattach', False) 

8242 if vol_multiattach and not virt_multiattach: 

8243 raise exception.MultiattachNotSupportedByVirtDriver( 

8244 volume_id=new_volume_id) 

8245 

8246 # This is a new style attachment and the API created the new 

8247 # volume attachment and passed the id to the compute over RPC. 

8248 # At this point we need to update the new volume attachment with 

8249 # the host connector, which will give us back the new attachment 

8250 # connection_info. 

8251 new_cinfo = self.volume_api.attachment_update( 

8252 context, new_attachment_id, connector, 

8253 mountpoint)['connection_info'] 

8254 

8255 if vol_multiattach: 

8256 # This will be used by the volume driver to determine the 

8257 # proper disk configuration. 

8258 new_cinfo['multiattach'] = True 

8259 

8260 old_cinfo = jsonutils.loads(bdm['connection_info']) 

8261 if old_cinfo and 'serial' not in old_cinfo: 8261 ↛ 8266 (line 8261 didn't jump to line 8266 because the condition on line 8261 was always true)

8262 old_cinfo['serial'] = old_volume_id 

8263 # NOTE(lyarwood): serial is not always present in the returned 

8264 # connection_info so set it if it is missing as we do in 

8265 # DriverVolumeBlockDevice.attach(). 

8266 if 'serial' not in new_cinfo: 8266 ↛ 8268 (line 8266 didn't jump to line 8268 because the condition on line 8266 was always true)

8267 new_cinfo['serial'] = new_volume_id 

8268 return (old_cinfo, new_cinfo) 
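As the notes above point out, Cinder does not always include a 'serial' key in the connection_info it returns, so both the old and the new info are backfilled with the matching volume id before the swap. A small standalone sketch of that backfill, with plain dicts standing in for connection_info:

    def ensure_serial(connection_info: dict, volume_id: str) -> dict:
        # Mirror of the backfill above: only set 'serial' when it is missing.
        connection_info.setdefault('serial', volume_id)
        return connection_info

    old = ensure_serial({'driver_volume_type': 'iscsi'}, 'old-vol-uuid')
    new = ensure_serial({'serial': 'already-set'}, 'new-vol-uuid')
    assert old['serial'] == 'old-vol-uuid' and new['serial'] == 'already-set'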

8269 

8270 def _swap_volume(self, context, instance, bdm, connector, 

8271 old_volume_id, new_volume, resize_to, 

8272 new_attachment_id, is_cinder_migration): 

8273 new_volume_id = new_volume['id'] 

8274 mountpoint = bdm['device_name'] 

8275 failed = False 

8276 new_cinfo = None 

8277 try: 

8278 old_cinfo, new_cinfo = self._init_volume_connection( 

8279 context, new_volume, old_volume_id, connector, 

8280 bdm, new_attachment_id, mountpoint) 

8281 # NOTE(lyarwood): The Libvirt driver, the only virt driver 

8282 # currently implementing swap_volume, will modify the contents of 

8283 # new_cinfo when connect_volume is called. This is then saved to 

8284 # the BDM in swap_volume for future use outside of this flow. 

8285 msg = ("swap_volume: Calling driver volume swap with " 

8286 "connection infos: new: %(new_cinfo)s; " 

8287 "old: %(old_cinfo)s" % 

8288 {'new_cinfo': new_cinfo, 'old_cinfo': old_cinfo}) 

8289 # Both new and old info might contain password 

8290 LOG.debug(strutils.mask_password(msg), instance=instance) 

8291 

8292 self.driver.swap_volume(context, old_cinfo, new_cinfo, instance, 

8293 mountpoint, resize_to) 

8294 if new_attachment_id: 

8295 self.volume_api.attachment_complete(context, new_attachment_id) 

8296 msg = ("swap_volume: Driver volume swap returned, new " 

8297 "connection_info is now : %(new_cinfo)s" % 

8298 {'new_cinfo': new_cinfo}) 

8299 LOG.debug(strutils.mask_password(msg)) 

8300 except Exception as ex: 

8301 failed = True 

8302 with excutils.save_and_reraise_exception(): 

8303 compute_utils.notify_about_volume_swap( 

8304 context, instance, self.host, 

8305 fields.NotificationPhase.ERROR, 

8306 old_volume_id, new_volume_id, ex) 

8307 if new_cinfo: 

8308 msg = ("Failed to swap volume %(old_volume_id)s " 

8309 "for %(new_volume_id)s") 

8310 LOG.exception(msg, {'old_volume_id': old_volume_id, 

8311 'new_volume_id': new_volume_id}, 

8312 instance=instance) 

8313 else: 

8314 msg = ("Failed to connect to volume %(volume_id)s " 

8315 "with volume at %(mountpoint)s") 

8316 LOG.exception(msg, {'volume_id': new_volume_id, 

8317 'mountpoint': bdm['device_name']}, 

8318 instance=instance) 

8319 

8320 # The API marked the volume as 'detaching' for the old volume 

8321 # so we need to roll that back so the volume goes back to 

8322 # 'in-use' state. 

8323 self.volume_api.roll_detaching(context, old_volume_id) 

8324 

8325 if new_attachment_id is None: 

8326 # The API reserved the new volume so it would be in 

8327 # 'attaching' status, so we need to unreserve it so it 

8328 # goes back to 'available' status. 

8329 self.volume_api.unreserve_volume(context, new_volume_id) 

8330 else: 

8331 # This is a new style attachment for the new volume, which 

8332 # was created in the API. We just need to delete it here 

8333 # to put the new volume back into 'available' status. 

8334 self.volume_api.attachment_delete( 

8335 context, new_attachment_id) 

8336 finally: 

8337 # TODO(mriedem): This finally block is terribly confusing and is 

8338 # trying to do too much. We should consider removing the finally 

8339 # block and move whatever needs to happen on success and failure 

8340 # into the blocks above for clarity, even if it means a bit of 

8341 # redundant code. 

8342 conn_volume = new_volume_id if failed else old_volume_id 

8343 if new_cinfo: 

8344 LOG.debug("swap_volume: removing Cinder connection " 

8345 "for volume %(volume)s", {'volume': conn_volume}, 

8346 instance=instance) 

8347 if bdm.attachment_id is None: 

8348 # This is the pre-3.44 flow for new-style volume 

8349 # attachments so just terminate the connection. 

8350 self.volume_api.terminate_connection(context, 

8351 conn_volume, 

8352 connector) 

8353 else: 

8354 # This is a new style volume attachment. If we failed, then 

8355 # the new attachment was already deleted above in the 

8356 # exception block and we have nothing more to do here. If 

8357 # swap_volume was successful in the driver, then we need to 

8358 # "detach" the original attachment by deleting it. 

8359 if not failed: 

8360 self.volume_api.attachment_delete( 

8361 context, bdm.attachment_id) 

8362 

8363 # Need to make some decisions based on whether this was 

8364 # a Cinder initiated migration or not. The callback to 

8365 # migration completion isn't needed in the case of a 

8366 # nova initiated simple swap of two volume 

8367 # "volume-update" call so skip that. The new attachment 

8368 # scenarios will give us a new attachment record and 

8369 # that's what we want. 

8370 if bdm.attachment_id and not is_cinder_migration: 

8371 # we don't callback to cinder 

8372 comp_ret = {'save_volume_id': new_volume_id} 

8373 else: 

8374 # NOTE(lyarwood): The following call to 

8375 # os-migrate-volume-completion returns a dict containing 

8376 # save_volume_id, this volume id has two possible values : 

8377 # 1. old_volume_id if we are migrating (retyping) volumes 

8378 # 2. new_volume_id if we are swapping between two existing 

8379 # volumes 

8380 # This volume id is later used to update the volume_id and 

8381 # connection_info['serial'] of the BDM. 

8382 comp_ret = self.volume_api.migrate_volume_completion( 

8383 context, 

8384 old_volume_id, 

8385 new_volume_id, 

8386 error=failed) 

8387 LOG.debug("swap_volume: Cinder migrate_volume_completion " 

8388 "returned: %(comp_ret)s", {'comp_ret': comp_ret}, 

8389 instance=instance) 

8390 

8391 return (comp_ret, new_cinfo) 

8392 

8393 @wrap_exception() 

8394 @wrap_instance_event(prefix='compute') 

8395 @wrap_instance_fault 

8396 def swap_volume(self, context, old_volume_id, new_volume_id, instance, 

8397 new_attachment_id): 

8398 """Replace the old volume with the new volume within the active server 

8399 

8400 :param context: User request context 

8401 :param old_volume_id: Original volume id 

8402 :param new_volume_id: New volume id being swapped to 

8403 :param instance: Instance with original_volume_id attached 

8404 :param new_attachment_id: ID of the new attachment for new_volume_id 

8405 """ 

8406 @utils.synchronized(instance.uuid) 

8407 def _do_locked_swap_volume(context, old_volume_id, new_volume_id, 

8408 instance, new_attachment_id): 

8409 self._do_swap_volume(context, old_volume_id, new_volume_id, 

8410 instance, new_attachment_id) 

8411 _do_locked_swap_volume(context, old_volume_id, new_volume_id, instance, 

8412 new_attachment_id) 

8413 

8414 def _do_swap_volume(self, context, old_volume_id, new_volume_id, 

8415 instance, new_attachment_id): 

8416 """Replace the old volume with the new volume within the active server 

8417 

8418 :param context: User request context 

8419 :param old_volume_id: Original volume id 

8420 :param new_volume_id: New volume id being swapped to 

8421 :param instance: Instance with original_volume_id attached 

8422 :param new_attachment_id: ID of the new attachment for new_volume_id 

8423 """ 

8424 context = context.elevated() 

8425 compute_utils.notify_about_volume_swap( 

8426 context, instance, self.host, 

8427 fields.NotificationPhase.START, 

8428 old_volume_id, new_volume_id) 

8429 

8430 bdm = objects.BlockDeviceMapping.get_by_volume_and_instance( 

8431 context, old_volume_id, instance.uuid) 

8432 connector = self.driver.get_volume_connector(instance) 

8433 

8434 resize_to = 0 

8435 old_volume = self.volume_api.get(context, old_volume_id) 

8436 # Yes this is a tightly-coupled state check of what's going on inside 

8437 # cinder, but we need this while we still support old (v1/v2) and 

8438 # new style attachments (v3.44). Once we drop support for old style 

8439 # attachments we could think about cleaning up the cinder-initiated 

8440 # swap volume API flows. 

8441 is_cinder_migration = False 

8442 if 'migration_status' in old_volume: 

8443 is_cinder_migration = old_volume['migration_status'] == 'migrating' 

8444 old_vol_size = old_volume['size'] 

8445 new_volume = self.volume_api.get(context, new_volume_id) 

8446 new_vol_size = new_volume['size'] 

8447 if new_vol_size > old_vol_size: 

8448 resize_to = new_vol_size 

8449 

8450 LOG.info('Swapping volume %(old_volume)s for %(new_volume)s', 

8451 {'old_volume': old_volume_id, 'new_volume': new_volume_id}, 

8452 instance=instance) 

8453 comp_ret, new_cinfo = self._swap_volume(context, 

8454 instance, 

8455 bdm, 

8456 connector, 

8457 old_volume_id, 

8458 new_volume, 

8459 resize_to, 

8460 new_attachment_id, 

8461 is_cinder_migration) 

8462 

8463 # NOTE(lyarwood): Update the BDM with the modified new_cinfo and 

8464 # correct volume_id returned by Cinder. 

8465 save_volume_id = comp_ret['save_volume_id'] 

8466 

8467 # NOTE(lyarwood): Overwrite the possibly stale serial and volume_id in 

8468 # the connection_info with the volume_id returned from Cinder. This 

8469 # could be the case during a volume migration where the new_cinfo here 

8470 # refers to the temporary volume *before* Cinder renames it to the 

8471 # original volume UUID at the end of the migration. 

8472 new_cinfo['serial'] = save_volume_id 

8473 new_cinfo['volume_id'] = save_volume_id 

8474 if 'data' in new_cinfo: 

8475 new_cinfo['data']['volume_id'] = save_volume_id 

8476 values = { 

8477 'connection_info': jsonutils.dumps(new_cinfo), 

8478 'source_type': 'volume', 

8479 'destination_type': 'volume', 

8480 'snapshot_id': None, 

8481 'volume_id': save_volume_id, 

8482 'no_device': None} 

8483 

8484 if resize_to: 

8485 values['volume_size'] = resize_to 

8486 

8487 if new_attachment_id is not None: 

8488 # This was a volume swap for a new-style attachment so we 

8489 # need to update the BDM attachment_id for the new attachment. 

8490 values['attachment_id'] = new_attachment_id 

8491 

8492 LOG.debug("swap_volume: Updating volume %(volume_id)s BDM record with " 

8493 "%(updates)s", {'volume_id': bdm.volume_id, 

8494 'updates': values}, 

8495 instance=instance) 

8496 bdm.update(values) 

8497 bdm.save() 

8498 

8499 compute_utils.notify_about_volume_swap( 

8500 context, instance, self.host, 

8501 fields.NotificationPhase.END, 

8502 old_volume_id, new_volume_id) 
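One detail of the flow above worth spelling out: resize_to stays 0 unless the new volume is strictly larger than the old one, in which case the driver is asked to grow the guest disk to the new size as part of the swap. A tiny standalone illustration of that decision:

    def resize_target(old_size_gb: int, new_size_gb: int) -> int:
        # 0 means "do not resize"; only grow when the replacement is larger.
        return new_size_gb if new_size_gb > old_size_gb else 0

    assert resize_target(10, 20) == 20   # swapping to a bigger volume -> grow
    assert resize_target(10, 10) == 0    # equal sizes -> leave the disk alone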

8503 

8504 @wrap_exception() 

8505 def remove_volume_connection( 

8506 self, context, volume_id, instance, 

8507 delete_attachment=False): 

8508 """Remove the volume connection on this host 

8509 

8510 Detach the volume from this instance on this host, and if this is 

8511 the cinder v2 flow, call cinder to terminate the connection. 

8512 """ 

8513 try: 

8514 # NOTE(mriedem): If the BDM was just passed directly we would not 

8515 # need to do this DB query, but this is an RPC interface so 

8516 # changing that requires some care. 

8517 bdm = objects.BlockDeviceMapping.get_by_volume_and_instance( 

8518 context, volume_id, instance.uuid) 

8519 # NOTE(mriedem): Normally we would pass delete_attachment=True to 

8520 # _remove_volume_connection to delete a v3 style volume attachment, 

8521 # but this method is RPC called from _rollback_live_migration which 

8522 # already deletes the attachment, so because of that tight coupling 

8523 # we cannot simply delete a v3 style attachment here without 

8524 # modifying the behavior of that 

8525 # _rollback_live_migration flow, which gets messy. 

8526 self._remove_volume_connection( 

8527 context, bdm, instance, delete_attachment) 

8528 except exception.NotFound: 

8529 pass 

8530 

8531 def _remove_volume_connection(self, context, bdm, instance, 

8532 delete_attachment=False): 

8533 """Remove the volume connection on this host 

8534 

8535 Detach the volume from this instance on this host. 

8536 

8537 :param context: nova auth request context 

8538 :param bdm: BlockDeviceMapping object for a volume attached to the 

8539 instance 

8540 :param instance: Instance object with a volume attached represented 

8541 by ``bdm`` 

8542 :param delete_attachment: If ``bdm.attachment_id`` is not None the 

8543 attachment was made as a cinder v3 style attachment and if True, 

8544 then deletes the volume attachment, otherwise just terminates 

8545 the connection for a cinder legacy style connection. 

8546 """ 

8547 driver_bdm = driver_block_device.convert_volume(bdm) 

8548 driver_bdm.driver_detach(context, instance, 

8549 self.volume_api, self.driver) 

8550 if bdm.attachment_id is None: 

8551 # cinder v2 api flow 

8552 connector = self.driver.get_volume_connector(instance) 

8553 self.volume_api.terminate_connection(context, bdm.volume_id, 

8554 connector) 

8555 elif delete_attachment: 

8556 # cinder v3 api flow 

8557 self.volume_api.attachment_delete(context, bdm.attachment_id) 

8558 

8559 def _deallocate_port_resource_for_instance( 

8560 self, 

8561 context: nova.context.RequestContext, 

8562 instance: 'objects.Instance', 

8563 port_id: str, 

8564 port_allocation: ty.Dict[str, ty.Dict[str, ty.Dict[str, int]]], 

8565 ) -> None: 

8566 

8567 if not port_allocation: 

8568 return 

8569 

8570 try: 

8571 client = self.reportclient 

8572 client.remove_resources_from_instance_allocation( 

8573 context, instance.uuid, port_allocation) 

8574 except Exception as ex: 

8575 # We always raise here as it is not a race condition where 

8576 # somebody has already deleted the port we want to clean up. 

8577 # Here we see that the port exists, the allocation exists, 

8578 # but we cannot clean it up so we will actually leak 

8579 # allocations. 

8580 with excutils.save_and_reraise_exception(): 

8581 LOG.warning( 

8582 'Failed to remove resource allocation of port %(port_id)s ' 

8583 'for instance. Error: %(error)s', 

8584 {'port_id': port_id, 'error': ex}, 

8585 instance=instance) 

8586 

8587 def _deallocate_port_for_instance( 

8588 self, context, instance, port_id, raise_on_failure=False, 

8589 pci_device=None): 

8590 try: 

8591 result = self.network_api.deallocate_port_for_instance( 

8592 context, instance, port_id) 

8593 __, port_allocation = result 

8594 except Exception as ex: 

8595 with excutils.save_and_reraise_exception( 

8596 reraise=raise_on_failure): 

8597 LOG.warning('Failed to deallocate port %(port_id)s ' 

8598 'for instance. Error: %(error)s', 

8599 {'port_id': port_id, 'error': ex}, 

8600 instance=instance) 

8601 else: 

8602 if pci_device: 8602 ↛ 8608 (line 8602 didn't jump to line 8608 because the condition on line 8602 was always true)

8603 self.rt.unclaim_pci_devices(context, pci_device, instance) 

8604 instance.remove_pci_device_and_request(pci_device) 

8605 

8606 # Deallocate the resources in placement that were used by the 

8607 # detached port. 

8608 self._deallocate_port_resource_for_instance( 

8609 context, instance, port_id, port_allocation) 
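The method above leans on oslo.utils' save_and_reraise_exception(reraise=raise_on_failure) so a failed deallocation is always logged but only propagated when the caller asked for it. The sketch below is a simplified, standard-library-only approximation of that conditional re-raise pattern; it is not the oslo.utils implementation, and deallocate_port is a made-up stand-in.

    import contextlib
    import logging

    LOG = logging.getLogger(__name__)

    @contextlib.contextmanager
    def save_and_maybe_reraise(reraise=True):
        """Log the in-flight exception; swallow it when reraise is False."""
        try:
            yield
        except Exception:
            LOG.exception("operation failed")
            if reraise:
                raise

    def deallocate_port(port_id, raise_on_failure=False):
        with save_and_maybe_reraise(reraise=raise_on_failure):
            raise RuntimeError("simulated neutron failure for %s" % port_id)

    deallocate_port("port-1")                          # logged and swallowed
    # deallocate_port("port-1", raise_on_failure=True)   # would propagate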

8610 

8611 def _claim_pci_device_for_interface_attach( 

8612 self, 

8613 context: nova.context.RequestContext, 

8614 instance: 'objects.Instance', 

8615 pci_reqs: 'objects.InstancePCIRequests', 

8616 ) -> ty.Optional['objects.PciDevice']: 

8617 """Claim PCI devices if there are PCI requests 

8618 

8619 :param context: nova.context.RequestContext 

8620 :param instance: the objects.Instance to where the interface is being 

8621 attached 

8622 :param pci_reqs: A InstancePCIRequests object describing the 

8623 needed PCI devices 

8624 :raises InterfaceAttachPciClaimFailed: if the PCI device claim fails 

8625 :returns: An objects.PciDevice describing the claimed PCI device for 

8626 the interface or None if no device is requested 

8627 """ 

8628 

8629 if not pci_reqs.requests: 8629 ↛ 8630 (line 8629 didn't jump to line 8630 because the condition on line 8629 was never true)

8630 return None 

8631 

8632 try: 

8633 devices = self.rt.claim_pci_devices( 

8634 context, pci_reqs, instance.numa_topology) 

8635 except exception.PciDeviceRequestFailed: 

8636 LOG.info('Failed to claim PCI devices during interface attach ' 

8637 'for PCI request %s', pci_reqs, instance=instance) 

8638 raise exception.InterfaceAttachPciClaimFailed( 

8639 instance_uuid=instance.uuid) 

8640 

8641 # NOTE(gibi): We assume that at most one PCI device is attached per 

8642 # interface attach request. 

8643 device = devices[0] 

8644 instance.pci_devices.objects.append(device) 

8645 

8646 return device 

8647 

8648 def _allocate_port_resource_for_instance( 

8649 self, 

8650 context: nova.context.RequestContext, 

8651 instance: 'objects.Instance', 

8652 pci_reqs: 'objects.InstancePCIRequests', 

8653 request_groups: ty.List['objects.RequestGroup'], 

8654 request_level_params: 'objects.RequestLevelParams', 

8655 ) -> ty.Tuple[ty.Optional[ty.Dict[str, ty.List[str]]], 

8656 ty.Optional[ty.Dict[str, ty.Dict[str, ty.Dict[str, int]]]]]: 

8657 """Allocate resources for the request in placement 

8658 

8659 :param context: nova.context.RequestContext 

8660 :param instance: the objects.Instance to where the interface is being 

8661 attached 

8662 :param pci_reqs: A list of InstancePCIRequest objects describing the 

8663 needed PCI devices 

8664 :param request_groups: A list of RequestGroup objects describing the 

8665 resources the port requests from placement 

8666 :param request_level_params: A RequestLevelParams object describing the 

8667 non-group-specific request of the port. 

8668 :raises InterfaceAttachResourceAllocationFailed: if we failed to 

8669 allocate resource in placement for the request 

8670 :returns: A tuple of provider mappings and allocated resources or 

8671 (None, None) if no resource allocation was needed for the request 

8672 """ 

8673 

8674 if not request_groups: 

8675 return None, None 

8676 

8677 # restrict the resource request to the current compute node. The 

8678 # compute node uuid is the uuid of the root provider of the node in 

8679 # placement 

8680 compute_node_uuid = objects.ComputeNode.get_by_nodename( 

8681 context, instance.node).uuid 

8682 # we can have multiple request groups, it would be enough to restrict 

8683 # only one of them to the compute tree but for symmetry we restrict 

8684 # all of them 

8685 for request_group in request_groups: 

8686 request_group.in_tree = compute_node_uuid 

8687 

8688 # NOTE(gibi): group policy is mandatory in a resource request if there 

8689 # are multiple groups. The policy can only come from the flavor today 

8690 # and a new flavor is not provided with an interface attach request and 

8691 # the instance's current flavor might not have a policy. Still we are 

8692 # attaching a single port where currently the two possible groups 

8693 # (one for bandwidth and one for packet rate) will always be allocated 

8694 # from different providers. So both possible policies (none, isolated) 

8695 # are always fulfilled for this single port. We still have to specify 

8696 # one, so we pick the least restrictive (none) here. 

8697 rr = scheduler_utils.ResourceRequest.from_request_groups( 

8698 request_groups, request_level_params, group_policy='none') 

8699 res = self.reportclient.get_allocation_candidates(context, rr) 

8700 alloc_reqs, provider_sums, version = res 

8701 

8702 if not alloc_reqs: 

8703 # no allocation candidates available, we run out of free resources 

8704 raise exception.InterfaceAttachResourceAllocationFailed( 

8705 instance_uuid=instance.uuid) 

8706 

8707 # select one of the candidates and update the instance 

8708 # allocation 

8709 # TODO(gibi): We could loop over all possible candidates 

8710 # if the first one selected here does not work due to race or due 

8711 # to not having free PCI devices. However the latter is only 

8712 # detected later in the interface attach code path. 

8713 alloc_req = alloc_reqs[0] 

8714 resources = alloc_req['allocations'] 

8715 provider_mappings = alloc_req['mappings'] 

8716 try: 

8717 self.reportclient.add_resources_to_instance_allocation( 

8718 context, instance.uuid, resources) 

8719 except exception.AllocationUpdateFailed as e: 

8720 # We lost a race. We could retry another candidate 

8721 raise exception.InterfaceAttachResourceAllocationFailed( 

8722 instance_uuid=instance.uuid) from e 

8723 except ( 

8724 exception.ConsumerAllocationRetrievalFailed, 

8725 keystone_exception.ClientException, 

8726 ) as e: 

8727 # These are non-recoverable errors so we should not retry 

8728 raise exception.InterfaceAttachResourceAllocationFailed( 

8729 instance_uuid=instance.uuid) from e 

8730 

8731 try: 

8732 compute_utils.update_pci_request_with_placement_allocations( 

8733 context, 

8734 self.reportclient, 

8735 pci_reqs.requests, 

8736 provider_mappings, 

8737 ) 

8738 except ( 

8739 exception.AmbiguousResourceProviderForPCIRequest, 

8740 exception.UnexpectedResourceProviderNameForPCIRequest 

8741 ): 

8742 # These are programming errors, so we clean up and re-raise to let 

8743 # the request fail. 

8744 with excutils.save_and_reraise_exception(): 

8745 self.reportclient.remove_resources_from_instance_allocation( 

8746 context, instance.uuid, resources) 

8747 

8748 return provider_mappings, resources 

8749 

8750 # TODO(mriedem): There are likely race failures which can result in 

8751 # NotFound and QuotaError exceptions getting traced as well. 

8752 @messaging.expected_exceptions( 

8753 # Do not log a traceback for user errors. We use Invalid generically 

8754 # since this method can raise lots of different exceptions: 

8755 # AttachInterfaceNotSupported 

8756 # NetworkInterfaceTaggedAttachNotSupported 

8757 # NetworkAmbiguous 

8758 # PortNotUsable 

8759 # PortInUse 

8760 # PortNotUsableDNS 

8761 # AttachSRIOVPortNotSupported 

8762 # NetworksWithQoSPolicyNotSupported 

8763 # InterfaceAttachResourceAllocationFailed 

8764 exception.Invalid) 

8765 @wrap_exception() 

8766 @wrap_instance_event(prefix='compute') 

8767 @wrap_instance_fault 

8768 def attach_interface(self, context, instance, network_id, port_id, 

8769 requested_ip, tag): 

8770 """Use hotplug to add an network adapter to an instance.""" 

8771 lockname = 'interface-%s-%s' % (instance.uuid, port_id) 

8772 

8773 @utils.synchronized(lockname) 

8774 def do_attach_interface(context, instance, network_id, port_id, 

8775 requested_ip, tag): 

8776 return self._attach_interface(context, instance, network_id, 

8777 port_id, requested_ip, tag) 

8778 

8779 return do_attach_interface(context, instance, network_id, port_id, 

8780 requested_ip, tag) 

8781 

8782 def _attach_interface(self, context, instance, network_id, port_id, 

8783 requested_ip, tag): 

8784 if not self.driver.capabilities.get('supports_attach_interface', 8784 ↛ 8786 (line 8784 didn't jump to line 8786 because the condition on line 8784 was never true)

8785 False): 

8786 raise exception.AttachInterfaceNotSupported( 

8787 instance_uuid=instance.uuid) 

8788 if (tag and not 

8789 self.driver.capabilities.get('supports_tagged_attach_interface', 

8790 False)): 

8791 raise exception.NetworkInterfaceTaggedAttachNotSupported() 

8792 

8793 compute_utils.notify_about_instance_action( 

8794 context, instance, self.host, 

8795 action=fields.NotificationAction.INTERFACE_ATTACH, 

8796 phase=fields.NotificationPhase.START) 

8797 

8798 bind_host_id = self.driver.network_binding_host_id(context, instance) 

8799 

8800 requested_networks = objects.NetworkRequestList( 

8801 objects=[ 

8802 objects.NetworkRequest( 

8803 network_id=network_id, 

8804 port_id=port_id, 

8805 address=requested_ip, 

8806 tag=tag, 

8807 ) 

8808 ] 

8809 ) 

8810 

8811 if len(requested_networks) != 1: 8811 ↛ 8812 (line 8811 didn't jump to line 8812 because the condition on line 8811 was never true)

8812 LOG.warning( 

8813 "Interface attach only supports one interface per attach " 

8814 "request", instance=instance) 

8815 raise exception.InterfaceAttachFailed(instance_uuid=instance.uuid) 

8816 

8817 pci_numa_affinity_policy = hardware.get_pci_numa_policy_constraint( 

8818 instance.flavor, instance.image_meta) 

8819 pci_reqs = objects.InstancePCIRequests( 

8820 requests=[], instance_uuid=instance.uuid) 

8821 _, request_groups, req_lvl_params = ( 

8822 self.network_api.create_resource_requests( 

8823 context, 

8824 requested_networks, 

8825 pci_reqs, 

8826 affinity_policy=pci_numa_affinity_policy 

8827 ) 

8828 ) 

8829 

8830 result = self._allocate_port_resource_for_instance( 

8831 context, instance, pci_reqs, request_groups, req_lvl_params) 

8832 provider_mappings, resources = result 

8833 

8834 try: 

8835 pci_device = self._claim_pci_device_for_interface_attach( 

8836 context, instance, pci_reqs) 

8837 except exception.InterfaceAttachPciClaimFailed: 

8838 with excutils.save_and_reraise_exception(): 

8839 if resources: 8839 ↛ 8845 (line 8839 didn't jump to line 8845)

8840 # TODO(gibi): Instead of giving up we could try another 

8841 # allocation candidate from _allocate_resources() if any 

8842 self._deallocate_port_resource_for_instance( 

8843 context, instance, port_id, resources) 

8844 

8845 instance.pci_requests.requests.extend(pci_reqs.requests) 

8846 

8847 network_info = self.network_api.allocate_for_instance( 

8848 context, 

8849 instance, 

8850 requested_networks, 

8851 bind_host_id=bind_host_id, 

8852 resource_provider_mapping=provider_mappings, 

8853 ) 

8854 

8855 if len(network_info) != 1: 8855 ↛ 8856 (line 8855 didn't jump to line 8856 because the condition on line 8855 was never true)

8856 LOG.error('allocate_for_instance returned %(ports)s ' 

8857 'ports', {'ports': len(network_info)}) 

8858 # TODO(elod.illes): an instance.interface_attach.error notification 

8859 # should be sent here 

8860 raise exception.InterfaceAttachFailed( 

8861 instance_uuid=instance.uuid) 

8862 image_meta = objects.ImageMeta.from_instance(instance) 

8863 

8864 try: 

8865 self.driver.attach_interface(context, instance, image_meta, 

8866 network_info[0]) 

8867 except exception.NovaException as ex: 

8868 port_id = network_info[0].get('id') 

8869 LOG.warning("attach interface failed , try to deallocate " 

8870 "port %(port_id)s, reason: %(msg)s", 

8871 {'port_id': port_id, 'msg': ex}, 

8872 instance=instance) 

8873 self._deallocate_port_for_instance( 

8874 context, instance, port_id, pci_device=pci_device) 

8875 

8876 compute_utils.notify_about_instance_action( 

8877 context, instance, self.host, 

8878 action=fields.NotificationAction.INTERFACE_ATTACH, 

8879 phase=fields.NotificationPhase.ERROR, 

8880 exception=ex) 

8881 

8882 raise exception.InterfaceAttachFailed( 

8883 instance_uuid=instance.uuid) 

8884 

8885 if pci_device: 

8886 # NOTE(gibi): The _claim_pci_device_for_interface_attach() call 

8887 # found a pci device but it only marked the device as claimed. The 

8888 # periodic update_available_resource would move the device to 

8889 # allocated state. But as driver.attach_interface() has 

8890 # succeeded, we now know that the interface is also allocated 

8891 # to (used by) the instance. So make sure the pci tracker also 

8892 # tracks this device as allocated. This way we can avoid a possible 

8893 # race condition when a detach arrives for a device that is only 

8894 # in claimed state. 

8895 self.rt.allocate_pci_devices_for_instance(context, instance) 

8896 

8897 instance.save() 

8898 

8899 compute_utils.notify_about_instance_action( 

8900 context, instance, self.host, 

8901 action=fields.NotificationAction.INTERFACE_ATTACH, 

8902 phase=fields.NotificationPhase.END) 

8903 

8904 return network_info[0] 
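_attach_interface() above brackets the whole operation with versioned notifications: START before any allocation, ERROR (followed by a re-raise as InterfaceAttachFailed) when the driver attach fails, and END on success. The standalone sketch below captures that start/error/end shape as a context manager; it is an illustration of the pattern, not nova's notification code, and the notify() helper is made up.

    import contextlib

    def notify(action, phase, error=None):
        # stand-in for the versioned-notification emitter
        print(f"{action}.{phase}" + (f" ({error})" if error else ""))

    @contextlib.contextmanager
    def notify_action(action):
        notify(action, "start")
        try:
            yield
        except Exception as exc:
            notify(action, "error", error=exc)
            raise
        notify(action, "end")

    with notify_action("interface_attach"):
        pass   # the actual attach work would go here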

8905 

8906 @wrap_exception() 

8907 @wrap_instance_event(prefix='compute') 

8908 @wrap_instance_fault 

8909 def detach_interface(self, context, instance, port_id): 

8910 """Detach a network adapter from an instance.""" 

8911 lockname = 'interface-%s-%s' % (instance.uuid, port_id) 

8912 

8913 @utils.synchronized(lockname) 

8914 def do_detach_interface(context, instance, port_id): 

8915 self._detach_interface(context, instance, port_id) 

8916 

8917 do_detach_interface(context, instance, port_id) 

8918 

8919 def _detach_interface(self, context, instance, port_id): 

8920 # NOTE(aarents): we need to refresh info cache from DB here, 

8921 # as previous detach/attach lock holder just updated it. 

8922 compute_utils.refresh_info_cache_for_instance(context, instance) 

8923 network_info = instance.info_cache.network_info 

8924 condemned = None 

8925 for vif in network_info: 

8926 if vif['id'] == port_id: 8926 ↛ 8925 (line 8926 didn't jump to line 8925 because the condition on line 8926 was always true)

8927 condemned = vif 

8928 break 

8929 if condemned is None: 

8930 raise exception.PortNotFound(_("Port %s is not " 

8931 "attached") % port_id) 

8932 

8933 pci_req = pci_req_module.get_instance_pci_request_from_vif( 

8934 context, instance, condemned) 

8935 

8936 pci_device = None 

8937 if pci_req: 

8938 pci_devices = [pci_device 

8939 for pci_device in instance.pci_devices.objects 

8940 if pci_device.request_id == pci_req.request_id] 

8941 

8942 if not pci_devices: 

8943 LOG.warning( 

8944 "Detach interface failed, port_id=%(port_id)s, " 

8945 "reason: PCI device not found for PCI request %(pci_req)s", 

8946 {'port_id': port_id, 'pci_req': pci_req}) 

8947 raise exception.InterfaceDetachFailed( 

8948 instance_uuid=instance.uuid) 

8949 

8950 pci_device = pci_devices[0] 

8951 

8952 compute_utils.notify_about_instance_action( 

8953 context, instance, self.host, 

8954 action=fields.NotificationAction.INTERFACE_DETACH, 

8955 phase=fields.NotificationPhase.START) 

8956 

8957 try: 

8958 self.driver.detach_interface(context, instance, condemned) 

8959 except exception.NovaException as ex: 

8960 # If the instance was deleted before the interface was detached, 

8961 # just log it at debug. 

8962 log_level = (logging.DEBUG 

8963 if isinstance(ex, exception.InstanceNotFound) 

8964 else logging.WARNING) 

8965 LOG.log(log_level, 

8966 "Detach interface failed, port_id=%(port_id)s, reason: " 

8967 "%(msg)s", {'port_id': port_id, 'msg': ex}, 

8968 instance=instance) 

8969 raise exception.InterfaceDetachFailed(instance_uuid=instance.uuid) 

8970 else: 

8971 self._deallocate_port_for_instance( 

8972 context, instance, port_id, raise_on_failure=True, 

8973 pci_device=pci_device) 

8974 

8975 instance.save() 

8976 

8977 compute_utils.notify_about_instance_action( 

8978 context, instance, self.host, 

8979 action=fields.NotificationAction.INTERFACE_DETACH, 

8980 phase=fields.NotificationPhase.END) 

8981 

8982 def _get_compute_info(self, context, host): 

8983 return objects.ComputeNode.get_first_node_by_host_for_old_compat( 

8984 context, host) 

8985 

8986 @wrap_exception() 

8987 def check_instance_shared_storage(self, ctxt, data): 

8988 """Check if the instance files are shared 

8989 

8990 :param ctxt: security context 

8991 :param data: result of driver.check_instance_shared_storage_local 

8992 

8993 Returns True if instance disks located on shared storage and 

8994 False otherwise. 

8995 """ 

8996 return self.driver.check_instance_shared_storage_remote(ctxt, data) 

8997 

8998 def _dest_can_numa_live_migrate(self, dest_check_data, migration): 

8999 # TODO(artom) If we have a libvirt driver we expect it to set 

9000 # dst_supports_numa_live_migration, but we have to remove it if we 

9001 # did not get a migration from the conductor, indicating that it 

9002 # cannot send RPC 5.3. This check can be removed in RPC 6.0. 

9003 if ('dst_supports_numa_live_migration' in dest_check_data and 

9004 dest_check_data.dst_supports_numa_live_migration and 

9005 not migration): 

9006 delattr(dest_check_data, 'dst_supports_numa_live_migration') 

9007 return dest_check_data 

9008 

9009 @wrap_exception() 

9010 @wrap_instance_event(prefix='compute') 

9011 @wrap_instance_fault 

9012 def check_can_live_migrate_destination(self, ctxt, instance, 

9013 block_migration, disk_over_commit, 

9014 migration, limits): 

9015 """Check if it is possible to execute live migration. 

9016 

9017 This runs checks on the destination host, and then calls 

9018 back to the source host to check the results. 

9019 

9020 :param context: security context 

9021 :param instance: dict of instance data 

9022 :param block_migration: if true, prepare for block migration 

9023 if None, calculate it in driver 

9024 :param disk_over_commit: if true, allow disk over commit 

9025 if None, ignore disk usage checking 

9026 :param migration: objects.Migration object for this live migration. 

9027 :param limits: objects.SchedulerLimits object for this live migration. 

9028 :returns: a LiveMigrateData object (hypervisor-dependent) 

9029 """ 

9030 

9031 # Error out if this host cannot accept the new instance due 

9032 # to anti-affinity. This check at this moment is not very accurate, as 

9033 # multiple requests may be happening concurrently and miss the lock, 

9034 # but when it works it provides a better user experience by failing 

9035 # earlier. Also, it should be safe to explode here: the error becomes 

9036 # NoValidHost and the instance status remains ACTIVE. 

9037 try: 

9038 self._validate_instance_group_policy(ctxt, instance) 

9039 except exception.RescheduledException as e: 

9040 msg = ("Failed to validate instance group policy " 

9041 "due to: {}".format(e)) 

9042 raise exception.MigrationPreCheckError(reason=msg) 

9043 

9044 src_compute_info = obj_base.obj_to_primitive( 

9045 self._get_compute_info(ctxt, instance.host)) 

9046 dst_compute_info = obj_base.obj_to_primitive( 

9047 self._get_compute_info(ctxt, self.host)) 

9048 dest_check_data = self.driver.check_can_live_migrate_destination(ctxt, 

9049 instance, src_compute_info, dst_compute_info, 

9050 block_migration, disk_over_commit) 

9051 dest_check_data = self._dest_can_numa_live_migrate(dest_check_data, 

9052 migration) 

9053 LOG.debug('destination check data is %s', dest_check_data) 

9054 try: 

9055 allocs = self.reportclient.get_allocations_for_consumer( 

9056 ctxt, instance.uuid) 

9057 migrate_data = self.compute_rpcapi.check_can_live_migrate_source( 

9058 ctxt, instance, dest_check_data) 

9059 if ('src_supports_numa_live_migration' in migrate_data and 

9060 migrate_data.src_supports_numa_live_migration): 

9061 migrate_data = self._live_migration_claim( 

9062 ctxt, instance, migrate_data, migration, limits, allocs) 

9063 elif 'dst_supports_numa_live_migration' in dest_check_data: 9063 ↛ 9073 (line 9063 didn't jump to line 9073 because the condition on line 9063 was always true)

9064 LOG.info('Destination was ready for NUMA live migration, ' 

9065 'but source is either too old, or is set to an ' 

9066 'older upgrade level.', instance=instance) 

9067 

9068 # At this point, we know that this compute node (destination) 

9069 # potentially has enough live-migratable PCI devices, based on the 

9070 # fact that the scheduler selected this host as the destination. 

9071 # The claim code below is the decisive step to move from a 

9072 # potentially correct to a known to be correct destination. 

9073 flavored_pci_reqs = [ 

9074 pci_req 

9075 for pci_req in instance.pci_requests.requests 

9076 if pci_req.source == objects.InstancePCIRequest.FLAVOR_ALIAS 

9077 ] 

9078 

9079 migrate_data.pci_dev_map_src_dst = self._flavor_based_pci_claim( 

9080 ctxt, instance, flavored_pci_reqs 

9081 ) 

9082 

9083 if self.network_api.has_port_binding_extension(ctxt): 

9084 # Create migrate_data vifs if not provided by driver. 

9085 if 'vifs' not in migrate_data: 9085 ↛ 9091line 9085 didn't jump to line 9091 because the condition on line 9085 was always true

9086 migrate_data.vifs = ( 

9087 migrate_data_obj. 

9088 VIFMigrateData.create_skeleton_migrate_vifs( 

9089 instance.get_network_info())) 

9090 # Claim PCI devices for VIFs on destination (if needed) 

9091 port_id_to_pci = self._claim_pci_for_instance_vifs( 

9092 ctxt, instance) 

9093 # Update migrate VIFs with the newly claimed PCI devices 

9094 self._update_migrate_vifs_profile_with_pci( 

9095 migrate_data.vifs, port_id_to_pci) 

9096 # This should always be the last check as we persist some internal 

9097 # dictionary value in the libvirt driver. 

9098 migrate_data = self.driver.check_source_migrate_data_at_dest( 

9099 ctxt, instance, migrate_data, migration, limits, allocs) 

9100 finally: 

9101 self.driver.cleanup_live_migration_destination_check(ctxt, 

9102 dest_check_data) 

9103 return migrate_data 

9104 

9105 def _flavor_based_pci_claim(self, ctxt, instance, pci_requests): 

9106 if not pci_requests: 

9107 # Return an empty dict as migrate_data.pci_dev_map_src_dst 

9108 # cannot be None 

9109 return {} 

9110 

9111 # Create an InstancePCIRequests object and claim against the PCI 

9112 # resource tracker for our dest instance 

9113 pci_requests = objects.InstancePCIRequests( 

9114 requests=pci_requests, 

9115 instance_uuid=instance.uuid) 

9116 

9117 src_devs = objects.PciDeviceList.get_by_instance_uuid( 

9118 ctxt, instance.uuid 

9119 ) 

9120 

9121 claimed_dst_devs = self._claim_from_pci_reqs( 

9122 ctxt, instance, pci_requests 

9123 ) 

9124 

9125 pci_dev_map_src_dst = {} 

9126 for req_id in ( 

9127 pci_req.request_id for pci_req in pci_requests.requests 

9128 ): 

9129 req_src_addr = [ 

9130 dev.address 

9131 for dev in src_devs 

9132 if dev.request_id == req_id 

9133 ] 

9134 req_claimed_dst__addr = [ 

9135 dev.address 

9136 for dev in claimed_dst_devs 

9137 if dev.request_id == req_id 

9138 ] 

9139 

9140 # This still depends on ordering, but only within the scope 

9141 # of the same InstancePCIRequest. The only case where multiple 

9142 # devices per compute node are allocated for a single request 

9143 # is when req.count > 1. In that case, the allocated devices are 

9144 # interchangeable since they match the same specification from 

9145 # the same InstancePCIRequest. 

9146 # 

9147 # Therefore, this ordering dependency is acceptable. 

9148 pci_dev_map_src_dst.update(dict( 

9149 zip(req_src_addr, req_claimed_dst__addr) 

9150 )) 

9151 

9152 return pci_dev_map_src_dst 
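The mapping built above pairs each source PCI address with a claimed destination address one request_id at a time, relying on the fact that devices allocated for the same InstancePCIRequest are interchangeable. A standalone sketch of that pairing, with (request_id, address) tuples standing in for PciDevice objects and made-up addresses:

    src_devs = [("req-1", "0000:04:00.1"), ("req-1", "0000:04:00.2"),
                ("req-2", "0000:05:00.0")]
    dst_devs = [("req-1", "0000:81:00.1"), ("req-1", "0000:81:00.2"),
                ("req-2", "0000:82:00.0")]

    pci_dev_map_src_dst = {}
    for req_id in {req for req, _ in src_devs}:
        src_addrs = [addr for req, addr in src_devs if req == req_id]
        dst_addrs = [addr for req, addr in dst_devs if req == req_id]
        # Devices answering one request are interchangeable, so pairing
        # by position within that request is a valid mapping.
        pci_dev_map_src_dst.update(dict(zip(src_addrs, dst_addrs)))

    assert pci_dev_map_src_dst["0000:05:00.0"] == "0000:82:00.0"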

9153 

9154 def _claim_from_pci_reqs(self, ctxt, instance, pci_requests): 

9155 # if we are called during the live migration with NUMA topology 

9156 # support the PCI claim needs to consider the destination NUMA 

9157 # topology that is then stored in the migration_context 

9158 dest_topo = None 

9159 if instance.migration_context: 9159 ↛ 9162 (line 9159 didn't jump to line 9162 because the condition on line 9159 was always true)

9160 dest_topo = instance.migration_context.new_numa_topology 

9161 

9162 claimed_pci_devices_objs = self.rt.claim_pci_devices( 

9163 ctxt, pci_requests, dest_topo) 

9164 

9165 for pci_dev in claimed_pci_devices_objs: 

9166 LOG.debug("PCI device: %s Claimed on destination node", 

9167 pci_dev.address) 

9168 return claimed_pci_devices_objs 

9169 

9170 def _live_migration_claim(self, ctxt, instance, migrate_data, 

9171 migration, limits, allocs): 

9172 """Runs on the destination and does a resources claim, if necessary. 

9173 Currently, only NUMA live migrations require it. 

9174 

9175 :param ctxt: Request context 

9176 :param instance: The Instance being live migrated 

9177 :param migrate_data: The MigrateData object for this live migration 

9178 :param migration: The Migration object for this live migration 

9179 :param limits: The SchedulerLimits object for this live migration 

9180 :returns: migrate_data with dst_numa_info set if necessary 

9181 """ 

9182 try: 

9183 # NOTE(artom) We might have gotten here from _find_destination() in 

9184 # the conductor live migrate task. At that point, 

9185 # migration.dest_node is not set yet (nor should it be, we're still 

9186 # looking for a destination, after all). Therefore, we cannot use 

9187 # migration.dest_node here and must use self._get_nodename(). 

9188 claim = self.rt.live_migration_claim( 

9189 ctxt, instance, self._get_nodename(instance), migration, 

9190 limits, allocs) 

9191 LOG.debug('Created live migration claim.', instance=instance) 

9192 except exception.ComputeResourcesUnavailable as e: 

9193 raise exception.MigrationPreCheckError( 

9194 reason=e.format_message()) 

9195 return self.driver.post_claim_migrate_data(ctxt, instance, 

9196 migrate_data, claim) 

9197 

9198 def _source_can_numa_live_migrate(self, ctxt, dest_check_data, 

9199 source_check_data): 

9200 # TODO(artom) Our virt driver may have told us that it supports NUMA 

9201 # live migration. However, the following other conditions must be met 

9202 # for a NUMA live migration to happen: 

9203 # 1. We got a True dst_supports_numa_live_migration in 

9204 # dest_check_data, indicating that the dest virt driver supports 

9205 # NUMA live migration and that the conductor can send RPC 5.3 and 

9206 # that the destination compute manager can receive it. 

9207 # 2. Ourselves, the source, can send RPC 5.3. There's no 

9208 # sentinel/parameter for this, so we just ask our rpcapi directly. 

9209 # If any of these are not met, we need to remove the 

9210 # src_supports_numa_live_migration flag from source_check_data to avoid 

9211 # incorrectly initiating a NUMA live migration. 

9212 # All of this can be removed in RPC 6.0/objects 2.0. 

9213 can_numa_live_migrate = ( 

9214 'dst_supports_numa_live_migration' in dest_check_data and 

9215 dest_check_data.dst_supports_numa_live_migration and 

9216 self.compute_rpcapi.supports_numa_live_migration(ctxt)) 

9217 if ('src_supports_numa_live_migration' in source_check_data and 

9218 source_check_data.src_supports_numa_live_migration and 

9219 not can_numa_live_migrate): 

9220 delattr(source_check_data, 'src_supports_numa_live_migration') 

9221 return source_check_data 

9222 

9223 @wrap_exception() 

9224 @wrap_instance_event(prefix='compute') 

9225 @wrap_instance_fault 

9226 def check_can_live_migrate_source(self, ctxt, instance, dest_check_data): 

9227 """Check if it is possible to execute live migration. 

9228 

9229 This checks if the live migration can succeed, based on the 

9230 results from check_can_live_migrate_destination. 

9231 

9232 :param ctxt: security context 

9233 :param instance: dict of instance data 

9234 :param dest_check_data: result of check_can_live_migrate_destination 

9235 :returns: a LiveMigrateData object 

9236 """ 

9237 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( 

9238 ctxt, instance.uuid) 

9239 is_volume_backed = compute_utils.is_volume_backed_instance( 

9240 ctxt, instance, bdms) 

9241 dest_check_data.is_volume_backed = is_volume_backed 

9242 block_device_info = self._get_instance_block_device_info( 

9243 ctxt, instance, refresh_conn_info=False, bdms=bdms) 

9244 result = self.driver.check_can_live_migrate_source(ctxt, instance, 

9245 dest_check_data, 

9246 block_device_info) 

9247 result = self._source_can_numa_live_migrate(ctxt, dest_check_data, 

9248 result) 

9249 LOG.debug('source check data is %s', result) 

9250 return result 

9251 

9252 @wrap_exception() 

9253 @wrap_instance_event(prefix='compute') 

9254 @wrap_instance_fault 

9255 def pre_live_migration(self, context, instance, disk, migrate_data): 

9256 """Preparations for live migration at dest host. 

9257 

9258 :param context: security context 

9259 :param instance: dict of instance data 

9260 :param disk: disk info of instance 

9261 :param migrate_data: A dict or LiveMigrateData object holding data 

9262 required for live migration without shared 

9263 storage. 

9264 :returns: migrate_data containing additional migration info 

9265 """ 

9266 LOG.debug('pre_live_migration data is %s', migrate_data) 

9267 

9268 # Error out if this host cannot accept the new instance due 

9269 # to anti-affinity. At this point the migration is already in-progress, 

9270 # so this is the definitive moment to abort due to the policy 

9271 # violation. Also, it should be safe to explode here. The instance 

9272 # status remains ACTIVE, migration status failed. 

9273 self._validate_instance_group_policy(context, instance) 

9274 

9275 migrate_data.old_vol_attachment_ids = {} 

9276 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( 

9277 context, instance.uuid) 

9278 network_info = self.network_api.get_instance_nw_info(context, instance) 

9279 self._notify_about_instance_usage( 

9280 context, instance, "live_migration.pre.start", 

9281 network_info=network_info) 

9282 compute_utils.notify_about_instance_action( 

9283 context, instance, self.host, 

9284 action=fields.NotificationAction.LIVE_MIGRATION_PRE, 

9285 phase=fields.NotificationPhase.START, bdms=bdms) 

9286 

9287 connector = None 

9288 try: 

9289 for bdm in bdms: 

9290 if bdm.is_volume and bdm.attachment_id is not None: 

9291 # This bdm uses the new cinder v3.44 API. 

9292 # We will create a new attachment for this 

9293 # volume on this migration destination host. The old 

9294 # attachment will be deleted on the source host 

9295 # when the migration succeeds. The old attachment_id 

9296 # is stored in dict with the key being the bdm.volume_id 

9297 # so it can be restored on rollback. 

9298 # 

9299 # Also note that attachment_update is not needed as we 

9300 # are providing the connector in the create call. 

9301 if connector is None: 

9302 connector = self.driver.get_volume_connector(instance) 

9303 attach_ref = self.volume_api.attachment_create( 

9304 context, bdm.volume_id, bdm.instance_uuid, 

9305 connector=connector, mountpoint=bdm.device_name) 

9306 

9307 # save current attachment so we can detach it on success, 

9308 # or restore it on a rollback. 

9309 # NOTE(mdbooth): This data is no longer used by the source 

9310 # host since change Ibe9215c0. We can't remove it until we 

9311 # are sure the source host has been upgraded. 

9312 migrate_data.old_vol_attachment_ids[bdm.volume_id] = \ 

9313 bdm.attachment_id 

9314 

9315 # update the bdm with the new attachment_id. 

9316 bdm.attachment_id = attach_ref['id'] 

9317 bdm.save() 

9318 

9319 block_device_info = self._get_instance_block_device_info( 

9320 context, instance, refresh_conn_info=True, 

9321 bdms=bdms) 

9322 

9323 # The driver pre_live_migration will plug vifs on the host 

9324 migrate_data = self.driver.pre_live_migration(context, 

9325 instance, 

9326 block_device_info, 

9327 network_info, 

9328 disk, 

9329 migrate_data) 

9330 LOG.debug('driver pre_live_migration data is %s', migrate_data) 

9331 # driver.pre_live_migration is what plugs vifs on the destination 

9332 # host so now we can set the wait_for_vif_plugged flag in the 

9333 # migrate_data object which the source compute will use to 

9334 # determine if it should wait for a 'network-vif-plugged' event 

9335 # from neutron before starting the actual guest transfer in the 

9336 # hypervisor 

9337 using_multiple_port_bindings = ( 

9338 'vifs' in migrate_data and migrate_data.vifs) 

9339 migrate_data.wait_for_vif_plugged = ( 

9340 CONF.compute.live_migration_wait_for_vif_plug and 

9341 using_multiple_port_bindings 

9342 ) 

9343 

9344 # NOTE(tr3buchet): setup networks on destination host 

9345 self.network_api.setup_networks_on_host(context, instance, 

9346 self.host) 

9347 

9348 # NOTE(lyarwood): The above call to driver.pre_live_migration 

9349 # can result in the virt drivers attempting to stash additional 

9350 # metadata into the connection_info of the underlying bdm. 

9351 # Ensure this is saved to the database by calling .save() against 

9352 # the driver BDMs we passed down via block_device_info. 

9353 for driver_bdm in block_device_info['block_device_mapping']: 

9354 driver_bdm.save() 

9355 

9356 except Exception: 

9357 # If we raise, migrate_data with the updated attachment ids 

9358 # will not be returned to the source host for rollback. 

9359 # So we need to rollback new attachments here. 

9360 with excutils.save_and_reraise_exception(): 

9361 old_attachments = migrate_data.old_vol_attachment_ids 

9362 for bdm in bdms: 

9363 if (bdm.is_volume and bdm.attachment_id is not None and 

9364 bdm.volume_id in old_attachments): 

9365 self.volume_api.attachment_delete(context, 

9366 bdm.attachment_id) 

9367 bdm.attachment_id = old_attachments[bdm.volume_id] 

9368 bdm.save() 

9369 

9370 # Volume connections are complete, tell cinder that all the 

9371 # attachments have completed. 

9372 for bdm in bdms: 

9373 if bdm.is_volume and bdm.attachment_id is not None: 

9374 self.volume_api.attachment_complete(context, 

9375 bdm.attachment_id) 

9376 

9377 self._notify_about_instance_usage( 

9378 context, instance, "live_migration.pre.end", 

9379 network_info=network_info) 

9380 compute_utils.notify_about_instance_action( 

9381 context, instance, self.host, 

9382 action=fields.NotificationAction.LIVE_MIGRATION_PRE, 

9383 phase=fields.NotificationPhase.END, bdms=bdms) 

9384 

9385 LOG.debug('pre_live_migration result data is %s', migrate_data) 

9386 return migrate_data 

9387 
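# Illustrative sketch, not part of this module: the volume-attachment swap
# that pre_live_migration() performs for cinder v3.44-style BDMs, keeping a
# rollback map keyed by volume id. FakeVolumeAPI and the bdm dicts are
# invented stand-ins for the real cinder client and BlockDeviceMapping
# objects.
import uuid


class FakeVolumeAPI(object):
    def attachment_create(self, volume_id, instance_uuid):
        return {'id': str(uuid.uuid4())}

    def attachment_delete(self, attachment_id):
        pass


def swap_attachments(volume_api, bdms, instance_uuid):
    old_vol_attachment_ids = {}
    try:
        for bdm in bdms:
            # Create the destination attachment, remember the source one so
            # it can be restored on rollback, then point the BDM at the new
            # attachment.
            attach_ref = volume_api.attachment_create(
                bdm['volume_id'], instance_uuid)
            old_vol_attachment_ids[bdm['volume_id']] = bdm['attachment_id']
            bdm['attachment_id'] = attach_ref['id']
    except Exception:
        # Roll back any attachments created above before re-raising.
        for bdm in bdms:
            if bdm['volume_id'] in old_vol_attachment_ids:
                volume_api.attachment_delete(bdm['attachment_id'])
                bdm['attachment_id'] = old_vol_attachment_ids[
                    bdm['volume_id']]
        raise
    return old_vol_attachment_ids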

9388 @staticmethod 

9389 def _neutron_failed_live_migration_callback(event_name, instance): 

9390 msg = ('Neutron reported failure during live migration ' 

9391 'with %(event)s for instance %(uuid)s') 

9392 msg_args = {'event': event_name, 'uuid': instance.uuid} 

9393 if CONF.vif_plugging_is_fatal: 

9394 raise exception.VirtualInterfacePlugException(msg % msg_args) 

9395 LOG.error(msg, msg_args) 

9396 

9397 @staticmethod 

9398 def _get_neutron_events_for_live_migration(instance): 

9399 # We don't generate events if CONF.vif_plugging_timeout=0 

9400 # meaning that the operator disabled using them. 

9401 if CONF.vif_plugging_timeout: 

9402 return (instance.get_network_info() 

9403 .get_live_migration_plug_time_events()) 

9404 else: 

9405 return [] 

9406 

9407 def _cleanup_pre_live_migration(self, context, dest, instance, 

9408 migration, migrate_data, source_bdms): 

9409 """Helper method for when pre_live_migration fails 

9410 

9411 Sets the migration status to "error" and rolls back the live migration 

9412 setup on the destination host. 

9413 

9414 :param context: The user request context. 

9415 :type context: nova.context.RequestContext 

9416 :param dest: The live migration destination hostname. 

9417 :type dest: str 

9418 :param instance: The instance being live migrated. 

9419 :type instance: nova.objects.Instance 

9420 :param migration: The migration record tracking this live migration. 

9421 :type migration: nova.objects.Migration 

9422 :param migrate_data: Data about the live migration, populated from 

9423 the destination host. 

9424 :type migrate_data: Subclass of nova.objects.LiveMigrateData 

9425 :param source_bdms: BDMs prior to modification by the destination 

9426 compute host. Set by _do_live_migration and not 

9427 part of the callback interface, so this is never 

9428 None 

9429 """ 

9430 self._set_migration_status(migration, 'error') 

9431 # Make sure we set this for _rollback_live_migration() 

9432 # so it can find it, as expected if it was called later 

9433 migrate_data.migration = migration 

9434 self._rollback_live_migration(context, instance, dest, 

9435 migrate_data=migrate_data, 

9436 source_bdms=source_bdms, 

9437 pre_live_migration=True) 

9438 

9439 def _do_pre_live_migration_from_source(self, context, dest, instance, 

9440 block_migration, migration, 

9441 migrate_data, source_bdms): 

9442 """Prepares for pre-live-migration on the source host and calls dest 

9443 

9444 Will setup a callback networking event handler (if configured) and 

9445 then call the dest host's pre_live_migration method to prepare the 

9446 dest host for live migration (plugs vifs, connect volumes, etc). 

9447 

9448 _rollback_live_migration (on the source) will be called if 

9449 pre_live_migration (on the dest) fails. 

9450 

9451 :param context: nova auth request context for this operation 

9452 :param dest: name of the destination compute service host 

9453 :param instance: Instance object being live migrated 

9454 :param block_migration: If true, prepare for block migration. 

9455 :param migration: Migration object tracking this operation 

9456 :param migrate_data: MigrateData object for this operation populated 

9457 by the destination host compute driver as part of the 

9458 check_can_live_migrate_destination call. 

9459 :param source_bdms: BlockDeviceMappingList of BDMs currently attached 

9460 to the instance from the source host. 

9461 :returns: MigrateData object which is a modified version of the 

9462 ``migrate_data`` argument from the compute driver on the dest 

9463 host during the ``pre_live_migration`` call. 

9464 :raises: MigrationError if waiting for the network-vif-plugged event 

9465 timed out and is fatal. 

9466 """ 

9467 class _BreakWaitForInstanceEvent(Exception): 

9468 """Used as a signal to stop waiting for the network-vif-plugged 

9469 event when we discover that 

9470 [compute]/live_migration_wait_for_vif_plug is not set on the 

9471 destination. 

9472 """ 

9473 pass 

9474 

9475 events = self._get_neutron_events_for_live_migration(instance) 

9476 try: 

9477 if ('block_migration' in migrate_data and 

9478 migrate_data.block_migration): 

9479 block_device_info = self._get_instance_block_device_info( 

9480 context, instance, bdms=source_bdms) 

9481 disk = self.driver.get_instance_disk_info( 

9482 instance, block_device_info=block_device_info) 

9483 else: 

9484 disk = None 

9485 

9486 deadline = CONF.vif_plugging_timeout 

9487 error_cb = self._neutron_failed_live_migration_callback 

9488 # In order to avoid a race with the vif plugging that the virt 

9489 # driver does on the destination host, we register our events 

9490 # to wait for before calling pre_live_migration. Then if the 

9491 # dest host reports back that we shouldn't wait, we can break 

9492 # out of the context manager using _BreakWaitForInstanceEvent. 

9493 with self.virtapi.wait_for_instance_event( 

9494 instance, events, deadline=deadline, 

9495 error_callback=error_cb): 

9496 with timeutils.StopWatch() as timer: 

9497 # TODO(mriedem): The "block_migration" parameter passed 

9498 # here is not actually used in pre_live_migration but it 

9499 # is not optional in the RPC interface either. 

9500 migrate_data = self.compute_rpcapi.pre_live_migration( 

9501 context, instance, 

9502 block_migration, disk, dest, migrate_data) 

9503 LOG.info('Took %0.2f seconds for pre_live_migration on ' 

9504 'destination host %s.', 

9505 timer.elapsed(), dest, instance=instance) 

9506 wait_for_vif_plugged = ( 

9507 'wait_for_vif_plugged' in migrate_data and 

9508 migrate_data.wait_for_vif_plugged) 

9509 if events and not wait_for_vif_plugged: 

9510 raise _BreakWaitForInstanceEvent 

9511 except _BreakWaitForInstanceEvent: 

9512 if events: 9512 ↛ 9548 (line 9512 didn't jump to line 9548 because the condition on line 9512 was always true)

9513 LOG.debug('Not waiting for events after pre_live_migration: ' 

9514 '%s. ', events, instance=instance) 

9515 except exception.VirtualInterfacePlugException: 

9516 with excutils.save_and_reraise_exception(): 

9517 LOG.exception('Failed waiting for network virtual interfaces ' 

9518 'to be plugged on the destination host %s.', 

9519 dest, instance=instance) 

9520 self._cleanup_pre_live_migration( 

9521 context, dest, instance, migration, migrate_data, 

9522 source_bdms) 

9523 except eventlet.timeout.Timeout: 

9524 # We only get here if wait_for_vif_plugged is True which means 

9525 # live_migration_wait_for_vif_plug=True on the destination host. 

9526 msg = ( 

9527 'Timed out waiting for events: %(events)s. If these timeouts ' 

9528 'are a persistent issue it could mean the networking backend ' 

9529 'on host %(dest)s does not support sending these events ' 

9530 'unless there are port binding host changes which does not ' 

9531 'happen at this point in the live migration process. You may ' 

9532 'need to disable the live_migration_wait_for_vif_plug option ' 

9533 'on host %(dest)s.') 

9534 subs = {'events': events, 'dest': dest} 

9535 LOG.warning(msg, subs, instance=instance) 

9536 if CONF.vif_plugging_is_fatal: 

9537 self._cleanup_pre_live_migration( 

9538 context, dest, instance, migration, migrate_data, 

9539 source_bdms) 

9540 raise exception.MigrationError(reason=msg % subs) 

9541 except Exception: 

9542 with excutils.save_and_reraise_exception(): 

9543 LOG.exception('Pre live migration failed at %s', 

9544 dest, instance=instance) 

9545 self._cleanup_pre_live_migration( 

9546 context, dest, instance, migration, migrate_data, 

9547 source_bdms) 

9548 return migrate_data 

9549 
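# Illustrative sketch, not part of this module: breaking out of an
# event-waiting context manager with a private sentinel exception, the same
# pattern _do_pre_live_migration_from_source() uses with
# _BreakWaitForInstanceEvent. wait_for_events() is an invented stand-in for
# virtapi.wait_for_instance_event().
import contextlib


class _BreakWait(Exception):
    """Raised once we learn that waiting for the events is unnecessary."""


@contextlib.contextmanager
def wait_for_events(events):
    yield
    # On a clean exit this is where the real helper blocks until every
    # registered event has arrived.
    print('blocking until %s arrive' % (events,))


def prepare(events, dest_says_wait):
    try:
        with wait_for_events(events):
            # ... call the destination host's pre-migration step here ...
            if events and not dest_says_wait:
                # Skip the blocking exit path of the context manager.
                raise _BreakWait
    except _BreakWait:
        pass


prepare(['network-vif-plugged'], dest_says_wait=False)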

9550 def _do_live_migration(self, context, dest, instance, block_migration, 

9551 migration, migrate_data): 

9552 # NOTE(danms): We should enhance the RT to account for migrations 

9553 # and use the status field to denote when the accounting has been 

9554 # done on source/destination. For now, this is just here for status 

9555 # reporting 

9556 self._set_migration_status(migration, 'preparing') 

9557 source_bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( 

9558 context, instance.uuid) 

9559 

9560 migrate_data = self._do_pre_live_migration_from_source( 

9561 context, dest, instance, block_migration, migration, migrate_data, 

9562 source_bdms) 

9563 

9564 # Set migrate_data.migration because that is how _post_live_migration 

9565 # and _rollback_live_migration get the migration object for cleanup. 

9566 # Yes this is gross but changing the _post_live_migration and 

9567 # _rollback_live_migration interfaces would also mean changing how the 

9568 # virt drivers call them from the driver.live_migration method, i.e. 

9569 # we would have to pass the migration object through the driver (or 

9570 # consider using a partial but some do not like that pattern). 

9571 migrate_data.migration = migration 

9572 

9573 # NOTE(Kevin_Zheng): Pop the migration from the waiting queue 

9574 # if it exists in the queue, then we are good to move on; if 

9575 # not, some other process must have aborted it and we should 

9576 # roll back. 

9577 try: 

9578 self._waiting_live_migrations.pop(instance.uuid) 

9579 except KeyError: 

9580 LOG.debug('Migration %s aborted by another process, rollback.', 

9581 migration.uuid, instance=instance) 

9582 self._rollback_live_migration(context, instance, dest, 

9583 migrate_data, 'cancelled', 

9584 source_bdms=source_bdms) 

9585 self._notify_live_migrate_abort_end(context, instance) 

9586 return 

9587 

9588 self._set_migration_status(migration, 'running') 

9589 

9590 # NOTE(mdbooth): pre_live_migration will update connection_info and 

9591 # attachment_id on all volume BDMS to reflect the new destination 

9592 # host attachment. We fetch BDMs before that to retain connection_info 

9593 # and attachment_id relating to the source host for post migration 

9594 # cleanup. 

9595 post_live_migration = functools.partial( 

9596 self._post_live_migration_update_host, source_bdms=source_bdms 

9597 ) 

9598 rollback_live_migration = functools.partial( 

9599 self._rollback_live_migration, source_bdms=source_bdms) 

9600 

9601 LOG.debug('live_migration data is %s', migrate_data) 

9602 try: 

9603 self.driver.live_migration(context, instance, dest, 

9604 post_live_migration, 

9605 rollback_live_migration, 

9606 block_migration, migrate_data) 

9607 except Exception: 

9608 LOG.exception('Live migration failed.', instance=instance) 

9609 with excutils.save_and_reraise_exception(): 

9610 # Put instance and migration into error state, 

9611 # as it's almost certainly too late to roll back 

9612 self._set_migration_status(migration, 'error') 

9613 # first refresh instance as it may have got updated by 

9614 # post_live_migration_at_destination 

9615 instance.refresh() 

9616 self._set_instance_obj_error_state(instance, 

9617 clean_task_state=True) 

9618 

9619 @wrap_exception() 

9620 @wrap_instance_event(prefix='compute') 

9621 @errors_out_migration 

9622 @wrap_instance_fault 

9623 def live_migration(self, context, dest, instance, block_migration, 

9624 migration, migrate_data): 

9625 """Executing live migration. 

9626 

9627 :param context: security context 

9628 :param dest: destination host 

9629 :param instance: a nova.objects.instance.Instance object 

9630 :param block_migration: if true, prepare for block migration 

9631 :param migration: a nova.objects.Migration object 

9632 :param migrate_data: implementation specific params 

9633 

9634 """ 

9635 self._set_migration_status(migration, 'queued') 

9636 # NOTE(Kevin_Zheng): Submit the live_migration job to the pool and 

9637 # put the returned Future object into dict mapped with migration.uuid 

9638 # in order to be able to track and abort it in the future. 

9639 self._waiting_live_migrations[instance.uuid] = (None, None) 

9640 try: 

9641 future = nova.utils.pass_context( 

9642 self._live_migration_executor.submit, 

9643 self._do_live_migration, context, dest, instance, 

9644 block_migration, migration, migrate_data) 

9645 self._waiting_live_migrations[instance.uuid] = (migration, future) 

9646 except RuntimeError: 

9647 # GreenThreadPoolExecutor.submit will raise RuntimeError if the 

9648 # pool is shutdown, which happens in 

9649 # _cleanup_live_migrations_in_pool. 

9650 LOG.info('Migration %s failed to submit as the compute service ' 

9651 'is shutting down.', migration.uuid, instance=instance) 

9652 raise exception.LiveMigrationNotSubmitted( 

9653 migration_uuid=migration.uuid, instance_uuid=instance.uuid) 

9654 
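# Illustrative sketch, not part of this module: queueing a migration on an
# executor and tracking the (migration, future) pair so it can be aborted
# later, as live_migration() does. A stdlib ThreadPoolExecutor stands in for
# the service's green thread pool; every name here is invented for the
# example.
import concurrent.futures


def queue_live_migration(pool, waiting, instance_uuid, migration_uuid,
                         do_migration):
    # Reserve the slot first so an abort arriving between submit() and the
    # assignment below still finds an entry to pop.
    waiting[instance_uuid] = (None, None)
    try:
        future = pool.submit(do_migration, instance_uuid)
    except RuntimeError:
        # submit() raises RuntimeError once the pool has been shut down.
        raise RuntimeError(
            'migration %s could not be submitted' % migration_uuid)
    waiting[instance_uuid] = (migration_uuid, future)
    return future


pool = concurrent.futures.ThreadPoolExecutor(max_workers=1)
waiting = {}
queue_live_migration(pool, waiting, 'inst-1', 'mig-1', lambda u: None)
pool.shutdown(wait=True)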

9655 @wrap_exception() 

9656 @wrap_instance_event(prefix='compute') 

9657 @wrap_instance_fault 

9658 def live_migration_force_complete(self, context, instance): 

9659 """Force live migration to complete. 

9660 

9661 :param context: Security context 

9662 :param instance: The instance that is being migrated 

9663 """ 

9664 

9665 self._notify_about_instance_usage( 

9666 context, instance, 'live.migration.force.complete.start') 

9667 compute_utils.notify_about_instance_action( 

9668 context, instance, self.host, 

9669 action=fields.NotificationAction.LIVE_MIGRATION_FORCE_COMPLETE, 

9670 phase=fields.NotificationPhase.START) 

9671 self.driver.live_migration_force_complete(instance) 

9672 self._notify_about_instance_usage( 

9673 context, instance, 'live.migration.force.complete.end') 

9674 compute_utils.notify_about_instance_action( 

9675 context, instance, self.host, 

9676 action=fields.NotificationAction.LIVE_MIGRATION_FORCE_COMPLETE, 

9677 phase=fields.NotificationPhase.END) 

9678 

9679 def _notify_live_migrate_abort_end(self, context, instance): 

9680 self._notify_about_instance_usage( 

9681 context, instance, 'live.migration.abort.end') 

9682 compute_utils.notify_about_instance_action( 

9683 context, instance, self.host, 

9684 action=fields.NotificationAction.LIVE_MIGRATION_ABORT, 

9685 phase=fields.NotificationPhase.END) 

9686 

9687 @wrap_exception() 

9688 @wrap_instance_event(prefix='compute') 

9689 @wrap_instance_fault 

9690 def live_migration_abort(self, context, instance, migration_id): 

9691 """Abort an in-progress live migration. 

9692 

9693 :param context: Security context 

9694 :param instance: The instance that is being migrated 

9695 :param migration_id: ID of in-progress live migration 

9696 

9697 """ 

9698 self._notify_about_instance_usage( 

9699 context, instance, 'live.migration.abort.start') 

9700 compute_utils.notify_about_instance_action( 

9701 context, instance, self.host, 

9702 action=fields.NotificationAction.LIVE_MIGRATION_ABORT, 

9703 phase=fields.NotificationPhase.START) 

9704 # NOTE(Kevin_Zheng): Pop the migration out from the queue, this might 

9705 # lead to 3 scenarios: 

9706 # 1. The selected migration is still in queue, and the future.cancel() 

9707 # succeeds, then the abort action succeeds; mark the migration 

9708 # status as 'cancelled'. 

9709 # 2. The selected migration is still in queue, but the future.cancel() 

9710 # fails, then _do_live_migration() has started executing, and 

9711 # the migration status is 'preparing', then we just pop it from the 

9712 # queue, and the migration process will handle it later. And the 

9713 # migration status couldn't be 'running' in this scenario because 

9714 # if _do_live_migration has started executing and we've already 

9715 # popped it from the queue and set the migration status to 

9716 # 'running' at this point, popping it here will raise KeyError at 

9717 # which point we check if it's running and if so, we abort the old 

9718 # way. 

9719 # 3. The selected migration is not in the queue, then the migration 

9720 # status is 'running', let the driver handle it. 

9721 try: 

9722 migration, future = ( 

9723 self._waiting_live_migrations.pop(instance.uuid)) 

9724 if future and future.cancel(): 9724 ↛ 9768 (line 9724 didn't jump to line 9768 because the condition on line 9724 was always true)

9725 # If we got here, we've successfully dropped a queued 

9726 # migration from the queue, so _do_live_migration won't run 

9727 # and we only need to revert minor changes introduced by Nova 

9728 # control plane (port bindings, resource allocations and 

9729 # instance's PCI devices), restore VM's state, set the 

9730 # migration's status to cancelled and send the notification. 

9731 # If Future.cancel() fails, it means _do_live_migration is 

9732 # running and the migration status is preparing, and 

9733 # _do_live_migration() itself will attempt to pop the queued 

9734 # migration, hit a KeyError, and rollback, set the migration 

9735 # to cancelled and send the live.migration.abort.end 

9736 # notification. 

9737 self._revert_allocation(context, instance, migration) 

9738 try: 

9739 # This call will delete any inactive destination host 

9740 # port bindings. 

9741 self.network_api.setup_networks_on_host( 

9742 context, instance, host=migration.dest_compute, 

9743 teardown=True) 

9744 except exception.PortBindingDeletionFailed as e: 

9745 # Removing the inactive port bindings from the destination 

9746 # host is not critical so just log an error but don't fail. 

9747 LOG.error( 

9748 'Network cleanup failed for destination host %s ' 

9749 'during live migration rollback. You may need to ' 

9750 'manually clean up resources in the network service. ' 

9751 'Error: %s', migration.dest_compute, str(e)) 

9752 except Exception: 

9753 with excutils.save_and_reraise_exception(): 

9754 LOG.exception( 

9755 'An error occurred while cleaning up networking ' 

9756 'during live migration rollback.', 

9757 instance=instance) 

9758 instance.task_state = None 

9759 instance.save(expected_task_state=[task_states.MIGRATING]) 

9760 self._set_migration_status(migration, 'cancelled') 

9761 except KeyError: 

9762 migration = objects.Migration.get_by_id(context, migration_id) 

9763 if migration.status != 'running': 

9764 raise exception.InvalidMigrationState( 

9765 migration_id=migration_id, instance_uuid=instance.uuid, 

9766 state=migration.status, method='abort live migration') 

9767 self.driver.live_migration_abort(instance) 

9768 self._notify_live_migrate_abort_end(context, instance) 

9769 
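# Illustrative sketch, not part of this module: the abort-side counterpart of
# the queue handling in live_migration_abort(). future.cancel() only succeeds
# while the job is still queued; otherwise either the running worker or the
# hypervisor driver handles the abort. The ``waiting`` mapping and
# ``driver_abort`` callable are invented stand-ins.
def abort_live_migration(waiting, instance_uuid, driver_abort):
    try:
        migration_uuid, future = waiting.pop(instance_uuid)
    except KeyError:
        # Not queued any more: the migration is already running, so fall
        # back to asking the hypervisor driver to abort it.
        driver_abort(instance_uuid)
        return 'aborted running migration'
    if future is not None and future.cancel():
        # Dropped from the queue before it ever ran; only control-plane
        # changes (bindings, allocations, PCI claims) need to be reverted.
        return 'cancelled while still queued'
    # cancel() failed: the worker has started, will miss its queue entry and
    # roll back on its own.
    return 'left for the running worker to roll back'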

9770 def _live_migration_cleanup_flags(self, migrate_data, migr_ctxt=None): 

9771 """Determine whether disks, instance path or other resources 

9772 need to be cleaned up after live migration (at source on success, 

9773 at destination on rollback) 

9774 

9775 Block migration needs empty image at destination host before migration 

9776 starts, so if any failure occurs, any empty images have to be deleted. 

9777 

9778 Also Volume backed live migration w/o shared storage needs to delete 

9779 newly created instance-xxx dir on the destination as a part of its 

9780 rollback process 

9781 

9782 There may be other resources which need cleanup; currently this is 

9783 limited to vPMEM and mdev devices with the libvirt driver. 

9784 

9785 :param migrate_data: implementation specific data 

9786 :param migr_ctxt: specific resources stored in migration_context 

9787 :returns: (bool, bool) -- do_cleanup, destroy_disks 

9788 """ 

9789 # NOTE(pkoniszewski): block migration specific params are set inside 

9790 # migrate_data objects for drivers that expose block live migration 

9791 # information (i.e. Libvirt, HyperV). For other drivers cleanup is not 

9792 # needed. 

9793 do_cleanup = False 

9794 destroy_disks = False 

9795 if isinstance(migrate_data, migrate_data_obj.LibvirtLiveMigrateData): 

9796 has_vpmem = False 

9797 if migr_ctxt and migr_ctxt.old_resources: 

9798 for resource in migr_ctxt.old_resources: 9798 ↛ 9804 (line 9798 didn't jump to line 9804 because the loop on line 9798 didn't complete)

9799 if ('metadata' in resource and 9799 ↛ 9798 (line 9799 didn't jump to line 9798 because the condition on line 9799 was always true)

9800 isinstance(resource.metadata, 

9801 objects.LibvirtVPMEMDevice)): 

9802 has_vpmem = True 

9803 break 

9804 has_mdevs = 'target_mdevs' in migrate_data 

9805 power_management_possible = ( 

9806 'dst_numa_info' in migrate_data and 

9807 migrate_data.dst_numa_info is not None) 

9808 # No instance is booting at the source host, but the instance dir 

9809 # must be deleted to prepare for the next block migration or the 

9810 # next live migration w/o shared 

9811 # storage. 

9812 # vPMEM must also be cleaned up. 

9813 do_cleanup = (not migrate_data.is_shared_instance_path or 

9814 has_vpmem or has_mdevs or power_management_possible) 

9815 destroy_disks = not ( 

9816 migrate_data.is_shared_block_storage or 

9817 migrate_data.is_shared_instance_path) 

9818 elif isinstance(migrate_data, migrate_data_obj.HyperVLiveMigrateData): 

9819 # NOTE(claudiub): We need to cleanup any zombie Planned VM. 

9820 do_cleanup = True 

9821 destroy_disks = not migrate_data.is_shared_instance_path 

9822 

9823 return (do_cleanup, destroy_disks) 

9824 
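# Illustrative sketch, not part of this module: the (do_cleanup,
# destroy_disks) decision that _live_migration_cleanup_flags() makes for the
# libvirt case, reduced to a pure function over plain booleans.
def cleanup_flags(shared_instance_path, shared_block_storage,
                  has_vpmem=False, has_mdevs=False, has_dst_numa_info=False):
    # The instance directory must be removed unless it lives on shared
    # storage, and also whenever vPMEM, mdev or NUMA power-management state
    # was left behind on the other host.
    do_cleanup = (not shared_instance_path or has_vpmem or has_mdevs or
                  has_dst_numa_info)
    # Disks are only destroyed when neither the block storage nor the
    # instance path is shared between the two hosts.
    destroy_disks = not (shared_block_storage or shared_instance_path)
    return do_cleanup, destroy_disks


assert cleanup_flags(True, True) == (False, False)
assert cleanup_flags(False, False) == (True, True)
assert cleanup_flags(True, True, has_vpmem=True) == (True, False)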

9825 def _post_live_migration_remove_source_vol_connections( 

9826 self, context, instance, source_bdms): 

9827 """Disconnect volume connections from the source host during 

9828 _post_live_migration. 

9829 

9830 :param context: nova auth RequestContext 

9831 :param instance: Instance object being live migrated 

9832 :param source_bdms: BlockDeviceMappingList representing the attached 

9833 volumes with connection_info set for the source host 

9834 """ 

9835 # Detaching volumes. 

9836 connector = None 

9837 for bdm in source_bdms: 

9838 if bdm.is_volume: 

9839 # Detaching volumes is a call to an external API that can fail. 

9840 # If it does, we need to handle it gracefully so that the call 

9841 # to post_live_migration_at_destination - where we set instance 

9842 # host and task state - still happens. We need to rethink the 

9843 # current approach of setting instance host and task state 

9844 # AFTER a whole bunch of things that could fail in unhandled 

9845 # ways, but that is left as a TODO(artom). 

9846 try: 

9847 if bdm.attachment_id is None: 

9848 # Prior to cinder v3.44: 

9849 # We don't want to actually mark the volume detached, 

9850 # or delete the bdm, just remove the connection from 

9851 # this host. 

9852 # 

9853 # remove the volume connection without detaching from 

9854 # hypervisor because the instance is not running 

9855 # anymore on the current host 

9856 if connector is None: 9856 ↛ 9859 (line 9856 didn't jump to line 9859 because the condition on line 9856 was always true)

9857 connector = self.driver.get_volume_connector( 

9858 instance) 

9859 self.volume_api.terminate_connection(context, 

9860 bdm.volume_id, 

9861 connector) 

9862 else: 

9863 # cinder v3.44 api flow - delete the old attachment 

9864 # for the source host 

9865 self.volume_api.attachment_delete(context, 

9866 bdm.attachment_id) 

9867 

9868 except Exception as e: 

9869 if bdm.attachment_id is None: 

9870 LOG.error('Connection for volume %s not terminated on ' 

9871 'source host %s during post_live_migration: ' 

9872 '%s', bdm.volume_id, self.host, 

9873 str(e), instance=instance) 

9874 else: 

9875 LOG.error('Volume attachment %s not deleted on source ' 

9876 'host %s during post_live_migration: %s', 

9877 bdm.attachment_id, self.host, 

9878 str(e), instance=instance) 

9879 

9880 # TODO(sean-k-mooney): add typing 

9881 def _post_live_migration_update_host( 

9882 self, ctxt, instance, dest, block_migration=False, 

9883 migrate_data=None, source_bdms=None 

9884 ): 

9885 try: 

9886 self._post_live_migration( 

9887 ctxt, instance, dest, block_migration, migrate_data, 

9888 source_bdms) 

9889 except Exception: 

9890 # Restore the instance object 

9891 compute_node = None 

9892 node_name = None 

9893 try: 

9894 # get node name of compute, where instance will be 

9895 # running after migration, that is destination host 

9896 compute_node = self._get_compute_info(ctxt, dest) 

9897 node_name = compute_node.hypervisor_hostname 

9898 except exception.ComputeHostNotFound: 

9899 LOG.exception('Failed to get compute_info for %s', dest) 

9900 

9901 # we can never rollback from post live migration and we can only 

9902 # get here if the instance is running on the dest so we ensure 

9903 # the instance.host is set correctly and reraise the original 

9904 # exception unmodified. 

9905 if instance.host != dest: 9905 ↛ 9916 (line 9905 didn't jump to line 9916 because the condition on line 9905 was always true)

9906 # apply saves the new fields while drop actually removes the 

9907 # migration context from the instance, so migration persists. 

9908 instance.apply_migration_context() 

9909 instance.drop_migration_context() 

9910 instance.host = dest 

9911 instance.task_state = None 

9912 instance.node = node_name 

9913 instance.compute_id = compute_node and compute_node.id or None 

9914 instance.progress = 0 

9915 instance.save() 

9916 raise 

9917 
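# Illustrative sketch, not part of this module: the "never lose the host"
# fallback in _post_live_migration_update_host() above. If post-migration
# processing fails, the guest is already running on the destination, so the
# record is pointed at the destination before the original exception is
# re-raised. The ``instance`` dict and callables are invented stand-ins.
def finish_or_pin_host(instance, dest, post_live_migration):
    try:
        post_live_migration(instance, dest)
    except Exception:
        if instance['host'] != dest:
            # Persist the destination host and clear the task state so the
            # instance is not left orphaned on the source, then re-raise.
            instance['host'] = dest
            instance['task_state'] = None
            instance['progress'] = 0
        raise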

9918 @wrap_exception() 

9919 @wrap_instance_fault 

9920 def _post_live_migration(self, ctxt, instance, dest, 

9921 block_migration=False, migrate_data=None, 

9922 source_bdms=None): 

9923 """Post operations for live migration. 

9924 

9925 This method is called from live_migration 

9926 and mainly updates the database record. 

9927 

9928 :param ctxt: security context 

9929 :param instance: instance object 

9930 :param dest: destination host 

9931 :param block_migration: if true, prepare for block migration 

9932 :param migrate_data: if not None, it is a dict which has data 

9933 required for live migration without shared storage 

9934 :param source_bdms: BDMs prior to modification by the destination 

9935 compute host. Set by _do_live_migration and not 

9936 part of the callback interface, so this is never 

9937 None 

9938 

9939 """ 

9940 LOG.info('_post_live_migration() is started..', 

9941 instance=instance) 

9942 

9943 # NOTE(artom) The ordering and exception handling are important here. 

9944 # We want to immediately activate the port bindings on the destination 

9945 # host to minimize network downtime. This happens in 

9946 # migrate_instance_start(). It's also crucial that we call 

9947 # _post_live_migration_remove_source_vol_connections() to clean up 

9948 # source volume connections and prevent potential data leaks. We 

9949 # therefore activate the port bindings in a try block, and, regardless 

9950 # of any exceptions during that process, clean up volume connections in 

9951 # a finally block. 

9952 try: 

9953 # NOTE(artom) At this point in time we have not bound the ports to 

9954 # the destination host yet (this happens in 

9955 # migrate_instance_start() below). Therefore, the "old" source 

9956 # network info that's still in the instance info cache is safe to 

9957 # use here, since it'll be used below during 

9958 # driver.post_live_migration_at_source() to unplug the VIFs on the 

9959 # source. 

9960 network_info = instance.get_network_info() 

9961 

9962 self._notify_about_instance_usage(ctxt, instance, 

9963 "live_migration._post.start", 

9964 network_info=network_info, 

9965 best_effort=True) 

9966 compute_utils.notify_about_instance_action( 

9967 ctxt, instance, self.host, 

9968 action=fields.NotificationAction.LIVE_MIGRATION_POST, 

9969 phase=fields.NotificationPhase.START, best_effort=True) 

9970 

9971 migration = objects.Migration( 

9972 source_compute=self.host, dest_compute=dest, 

9973 ) 

9974 # For neutron, migrate_instance_start will activate the destination 

9975 # host port bindings, if there are any created by conductor before 

9976 # live migration started. 

9977 self.network_api.migrate_instance_start(ctxt, instance, migration) 

9978 finally: 

9979 # Cleanup source host post live-migration 

9980 block_device_info = self._get_instance_block_device_info( 

9981 ctxt, instance, bdms=source_bdms) 

9982 self.driver.post_live_migration(ctxt, instance, block_device_info, 

9983 migrate_data) 

9984 

9985 # Disconnect volumes from this (the source) host. 

9986 self._post_live_migration_remove_source_vol_connections( 

9987 ctxt, instance, source_bdms) 

9988 

9989 destroy_vifs = False 

9990 try: 

9991 # It's possible that the vif type changed on the destination 

9992 # host and is already bound and active, so we need to use the 

9993 # stashed source vifs in migrate_data.vifs (if present) to unplug 

9994 # on the source host. 

9995 unplug_nw_info = network_info 

9996 if migrate_data and 'vifs' in migrate_data: 

9997 nw_info = [] 

9998 for migrate_vif in migrate_data.vifs: 

9999 nw_info.append(migrate_vif.source_vif) 

10000 unplug_nw_info = network_model.NetworkInfo.hydrate(nw_info) 

10001 LOG.debug('Calling driver.post_live_migration_at_source ' 

10002 'with original source VIFs from migrate_data: %s', 

10003 unplug_nw_info, instance=instance) 

10004 self.driver.post_live_migration_at_source(ctxt, instance, 

10005 unplug_nw_info) 

10006 except NotImplementedError as ex: 

10007 LOG.debug(ex, instance=instance) 

10008 # For all hypervisors other than libvirt, there is a possibility 

10009 # they are unplugging networks from source node in the cleanup 

10010 # method 

10011 destroy_vifs = True 

10012 

10013 # Free instance allocations on source before claims are allocated on 

10014 # destination node 

10015 self.rt.free_pci_device_allocations_for_instance(ctxt, instance) 

10016 # NOTE(danms): Save source node before calling post method on 

10017 # destination, which will update it 

10018 source_node = instance.node 

10019 

10020 do_cleanup, destroy_disks = self._live_migration_cleanup_flags( 

10021 migrate_data, migr_ctxt=instance.migration_context) 

10022 

10023 if do_cleanup: 

10024 LOG.debug('Calling driver.cleanup from _post_live_migration', 

10025 instance=instance) 

10026 self.driver.cleanup(ctxt, instance, unplug_nw_info, 

10027 destroy_disks=destroy_disks, 

10028 migrate_data=migrate_data, 

10029 destroy_vifs=destroy_vifs) 

10030 

10031 # Define domain at destination host, without doing it, 

10032 # pause/suspend/terminate do not work. 

10033 post_at_dest_success = True 

10034 try: 

10035 self.compute_rpcapi.post_live_migration_at_destination(ctxt, 

10036 instance, block_migration, dest) 

10037 except Exception as error: 

10038 post_at_dest_success = False 

10039 # If post_live_migration_at_destination() fails, we now have the 

10040 # _post_live_migration_update_host() method that will handle 

10041 # this case. 

10042 LOG.exception("Post live migration at destination %s failed", 

10043 dest, instance=instance, error=error) 

10044 raise 

10045 

10046 self.instance_events.clear_events_for_instance(instance) 

10047 

10048 # NOTE(timello): make sure we update available resources on source 

10049 # host even before next periodic task. 

10050 self.update_available_resource(ctxt) 

10051 

10052 self._update_scheduler_instance_info(ctxt, instance) 

10053 self._notify_about_instance_usage(ctxt, instance, 

10054 "live_migration._post.end", 

10055 network_info=network_info) 

10056 compute_utils.notify_about_instance_action( 

10057 ctxt, instance, self.host, 

10058 action=fields.NotificationAction.LIVE_MIGRATION_POST, 

10059 phase=fields.NotificationPhase.END) 

10060 if post_at_dest_success: 10060 ↛ 10064 (line 10060 didn't jump to line 10064 because the condition on line 10060 was always true)

10061 LOG.info('Migrating instance to %s finished successfully.', 

10062 dest, instance=instance) 

10063 

10064 self._clean_instance_console_tokens(ctxt, instance) 

10065 if migrate_data and migrate_data.obj_attr_is_set('migration'): 

10066 migrate_data.migration.status = 'completed' 

10067 migrate_data.migration.save() 

10068 self._delete_allocation_after_move(ctxt, 

10069 instance, 

10070 migrate_data.migration) 

10071 else: 

10072 # We didn't have data on a migration, which means we can't 

10073 # look up to see if we had new-style migration-based 

10074 # allocations. This should really only happen in cases of 

10075 # a buggy virt driver. Log a warning so we know it happened. 

10076 LOG.warning('Live migration ended with no migrate_data ' 

10077 'record. Unable to clean up migration-based ' 

10078 'allocations for node %s which is almost certainly ' 

10079 'not an expected situation.', source_node, 

10080 instance=instance) 

10081 

10082 def _consoles_enabled(self): 

10083 """Returns whether a console is enable.""" 

10084 return (CONF.vnc.enabled or CONF.spice.enabled or 

10085 CONF.serial_console.enabled or 

10086 CONF.mks.enabled) 

10087 

10088 def _clean_instance_console_tokens(self, ctxt, instance): 

10089 """Clean console tokens stored for an instance.""" 

10090 # If the database backend isn't in use, don't bother trying to clean 

10091 # tokens. 

10092 if self._consoles_enabled(): 

10093 objects.ConsoleAuthToken.\ 

10094 clean_console_auths_for_instance(ctxt, instance.uuid) 

10095 

10096 @wrap_exception() 

10097 @wrap_instance_event(prefix='compute') 

10098 @wrap_instance_fault 

10099 def post_live_migration_at_destination(self, context, instance, 

10100 block_migration): 

10101 """Post operations for live migration . 

10102 

10103 :param context: security context 

10104 :param instance: Instance dict 

10105 :param block_migration: if true, prepare for block migration 

10106 

10107 """ 

10108 LOG.info('Post operation of migration started', 

10109 instance=instance) 

10110 

10111 # NOTE(tr3buchet): setup networks on destination host 

10112 # this is called a second time because 

10113 # multi_host does not create the bridge in 

10114 # plug_vifs 

10115 # NOTE(mriedem): This is a no-op for neutron. 

10116 self.network_api.setup_networks_on_host(context, instance, 

10117 self.host) 

10118 migration = objects.Migration( 

10119 source_compute=instance.host, 

10120 dest_compute=self.host, 

10121 migration_type=fields.MigrationType.LIVE_MIGRATION) 

10122 self.network_api.migrate_instance_finish( 

10123 context, instance, migration, provider_mappings=None) 

10124 

10125 network_info = self.network_api.get_instance_nw_info(context, instance) 

10126 self._notify_about_instance_usage( 

10127 context, instance, "live_migration.post.dest.start", 

10128 network_info=network_info) 

10129 compute_utils.notify_about_instance_action(context, instance, 

10130 self.host, 

10131 action=fields.NotificationAction.LIVE_MIGRATION_POST_DEST, 

10132 phase=fields.NotificationPhase.START) 

10133 block_device_info = self._get_instance_block_device_info(context, 

10134 instance) 

10135 # Allocate the claimed PCI resources at destination. 

10136 self.rt.allocate_pci_devices_for_instance(context, instance) 

10137 

10138 try: 

10139 self.driver.post_live_migration_at_destination( 

10140 context, instance, network_info, block_migration, 

10141 block_device_info) 

10142 except Exception: 

10143 with excutils.save_and_reraise_exception(): 

10144 instance.vm_state = vm_states.ERROR 

10145 LOG.error('Unexpected error during post live migration at ' 

10146 'destination host.', instance=instance) 

10147 finally: 

10148 # Restore instance state and update host 

10149 current_power_state = self._get_power_state(instance) 

10150 compute_node = None 

10151 node_name = None 

10152 prev_host = instance.host 

10153 try: 

10154 compute_node = self._get_compute_info(context, self.host) 

10155 node_name = compute_node.hypervisor_hostname 

10156 except exception.ComputeHostNotFound: 

10157 LOG.exception('Failed to get compute_info for %s', self.host) 

10158 finally: 

10159 # NOTE(artom) We need to apply the migration context here 

10160 # regardless of whether the driver's 

10161 # post_live_migration_at_destination succeeded or not: the 

10162 # instance is on the destination, potentially with a new NUMA 

10163 # topology and resource usage. We need to persist that. 

10164 # NOTE(artom) Apply followed by drop looks weird, but apply 

10165 # just saves the new fields while drop actually removes the 

10166 # migration context from the instance. 

10167 instance.apply_migration_context() 

10168 instance.drop_migration_context() 

10169 instance.host = self.host 

10170 instance.power_state = current_power_state 

10171 instance.task_state = None 

10172 instance.node = node_name 

10173 instance.compute_id = compute_node and compute_node.id or None 

10174 instance.progress = 0 

10175 instance.save(expected_task_state=task_states.MIGRATING) 

10176 

10177 # NOTE(tr3buchet): tear down networks on source host (nova-net) 

10178 # NOTE(mriedem): For neutron, this will delete any inactive source 

10179 # host port bindings. 

10180 try: 

10181 self.network_api.setup_networks_on_host(context, instance, 

10182 prev_host, teardown=True) 

10183 except exception.PortBindingDeletionFailed as e: 

10184 # Removing the inactive port bindings from the source host is not 

10185 # critical so just log an error but don't fail. 

10186 LOG.error('Network cleanup failed for source host %s during post ' 

10187 'live migration. You may need to manually clean up ' 

10188 'resources in the network service. Error: %s', 

10189 prev_host, str(e)) 

10190 # NOTE(vish): this is necessary to update dhcp for nova-network 

10191 # NOTE(mriedem): This is a no-op for neutron. 

10192 self.network_api.setup_networks_on_host(context, instance, self.host) 

10193 self._notify_about_instance_usage( 

10194 context, instance, "live_migration.post.dest.end", 

10195 network_info=network_info) 

10196 compute_utils.notify_about_instance_action(context, instance, 

10197 self.host, 

10198 action=fields.NotificationAction.LIVE_MIGRATION_POST_DEST, 

10199 phase=fields.NotificationPhase.END) 

10200 

10201 def _remove_remote_volume_connections(self, context, dest, bdms, instance): 

10202 """Rollback remote volume connections on the dest""" 

10203 for bdm in bdms: 

10204 try: 

10205 # remove the connection on the destination host 

10206 # NOTE(lyarwood): This actually calls the cinderv2 

10207 # os-terminate_connection API if required. 

10208 self.compute_rpcapi.remove_volume_connection( 

10209 context, instance, bdm.volume_id, dest) 

10210 except Exception: 

10211 LOG.warning("Ignoring exception while attempting " 

10212 "to rollback volume connections for " 

10213 "volume %s on host %s.", bdm.volume_id, 

10214 dest, instance=instance) 

10215 

10216 def _rollback_volume_bdms(self, context, bdms, original_bdms, instance): 

10217 """Rollback the connection_info and attachment_id for each bdm""" 

10218 original_bdms_by_volid = {bdm.volume_id: bdm for bdm in original_bdms 

10219 if bdm.is_volume} 

10220 for bdm in bdms: 

10221 try: 

10222 original_bdm = original_bdms_by_volid[bdm.volume_id] 

10223 # NOTE(lyarwood): Only delete the referenced attachment if it 

10224 # is different to the original in order to avoid accidentally 

10225 # removing the source host volume attachment after it has 

10226 # already been rolled back by a failure in pre_live_migration. 

10227 if (bdm.attachment_id and original_bdm.attachment_id and 

10228 bdm.attachment_id != original_bdm.attachment_id): 

10229 # NOTE(lyarwood): 3.44 cinder api flow. Delete the 

10230 # attachment used by the bdm and reset it to that of 

10231 # the original bdm. 

10232 self.volume_api.attachment_delete(context, 

10233 bdm.attachment_id) 

10234 bdm.attachment_id = original_bdm.attachment_id 

10235 # NOTE(lyarwood): Reset the connection_info to the original 

10236 bdm.connection_info = original_bdm.connection_info 

10237 bdm.save() 

10238 except cinder_exception.ClientException: 

10239 LOG.warning("Ignoring cinderclient exception when " 

10240 "attempting to delete attachment %s for volume " 

10241 "%s while rolling back volume bdms.", 

10242 bdm.attachment_id, bdm.volume_id, 

10243 instance=instance) 

10244 except Exception: 

10245 with excutils.save_and_reraise_exception(): 

10246 LOG.exception("Exception while attempting to rollback " 

10247 "BDM for volume %s.", bdm.volume_id, 

10248 instance=instance) 

10249 
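# Illustrative sketch, not part of this module: the per-BDM rollback done by
# _rollback_volume_bdms() above. An attachment is only deleted when it
# differs from the original source-host attachment, so an attachment already
# restored by a pre_live_migration failure is never removed twice. The dicts
# and ``delete_attachment`` callable are invented stand-ins.
def rollback_bdm(bdm, original_bdm, delete_attachment):
    if (bdm['attachment_id'] and original_bdm['attachment_id'] and
            bdm['attachment_id'] != original_bdm['attachment_id']):
        # The destination created a new attachment; drop it and fall back to
        # the attachment the source host still holds.
        delete_attachment(bdm['attachment_id'])
        bdm['attachment_id'] = original_bdm['attachment_id']
    # Always restore the source host connection_info.
    bdm['connection_info'] = original_bdm['connection_info']
    return bdm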

10250 @wrap_exception() 

10251 @wrap_instance_fault 

10252 def _rollback_live_migration(self, context, instance, 

10253 dest, migrate_data=None, 

10254 migration_status='failed', 

10255 source_bdms=None, 

10256 pre_live_migration=False): 

10257 """Recovers Instance/volume state from migrating -> running. 

10258 

10259 :param context: security context 

10260 :param instance: nova.objects.instance.Instance object 

10261 :param dest: 

10262 This method is called from live migration src host. 

10263 This param specifies destination host. 

10264 :param migrate_data: 

10265 if not None, contains implementation-specific data. 

10266 :param migration_status: 

10267 Contains the status we want to set for the migration object 

10268 :param source_bdms: BDMs prior to modification by the destination 

10269 compute host. Set by _do_live_migration and not 

10270 part of the callback interface, so this is never 

10271 None 

10272 

10273 """ 

10274 # NOTE(gibi): We need to refresh pci_requests of the instance as it 

10275 # might be changed by the conductor during scheduling based on the 

10276 # selected destination host. If the instance has SRIOV ports with 

10277 # resource request then the LiveMigrationTask._find_destination call 

10278 # updated the instance.pci_requests.requests[].spec with the SRIOV PF 

10279 # device name to be used on the destination host. As the migration is 

10280 # rolling back to the source host now we don't want to persist the 

10281 # destination host related changes in the DB. 

10282 instance.pci_requests = \ 

10283 objects.InstancePCIRequests.get_by_instance_uuid( 

10284 context, instance.uuid) 

10285 

10286 if (isinstance(migrate_data, migrate_data_obj.LiveMigrateData) and 

10287 migrate_data.obj_attr_is_set('migration')): 

10288 migration = migrate_data.migration 

10289 else: 

10290 migration = None 

10291 

10292 if migration: 

10293 # Remove allocations created in Placement for the dest node. 

10294 # If migration is None, the virt driver didn't pass it which is 

10295 # a bug. 

10296 self._revert_allocation(context, instance, migration) 

10297 else: 

10298 LOG.error('Unable to revert allocations during live migration ' 

10299 'rollback; compute driver did not provide migrate_data', 

10300 instance=instance) 

10301 

10302 # NOTE(tr3buchet): setup networks on source host (really it's re-setup 

10303 # for nova-network) 

10304 # NOTE(mriedem): This is a no-op for neutron. 

10305 self.network_api.setup_networks_on_host(context, instance, self.host) 

10306 

10307 # NOTE(erlon): We should make sure that rollback_live_migration_at_src 

10308 # is not called in the pre_live_migration rollback as that will trigger 

10309 # the src host to re-attach interfaces which were not detached 

10310 # previously. 

10311 if not pre_live_migration: 

10312 self.driver.rollback_live_migration_at_source(context, instance, 

10313 migrate_data) 

10314 

10315 # NOTE(lyarwood): Fetch the current list of BDMs, disconnect any 

10316 # connected volumes from the dest and delete any volume attachments 

10317 # used by the destination host before rolling back to the original 

10318 # still valid source host volume attachments. 

10319 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( 

10320 context, instance.uuid) 

10321 # TODO(lyarwood): Turn the following into a lookup method within 

10322 # BlockDeviceMappingList. 

10323 vol_bdms = [bdm for bdm in bdms if bdm.is_volume] 

10324 self._remove_remote_volume_connections(context, dest, vol_bdms, 

10325 instance) 

10326 self._rollback_volume_bdms(context, vol_bdms, source_bdms, instance) 

10327 

10328 self._notify_about_instance_usage(context, instance, 

10329 "live_migration._rollback.start") 

10330 compute_utils.notify_about_instance_action(context, instance, 

10331 self.host, 

10332 action=fields.NotificationAction.LIVE_MIGRATION_ROLLBACK, 

10333 phase=fields.NotificationPhase.START, 

10334 bdms=bdms) 

10335 

10336 do_cleanup, destroy_disks = self._live_migration_cleanup_flags( 

10337 migrate_data, migr_ctxt=instance.migration_context) 

10338 

10339 if do_cleanup: 

10340 self.compute_rpcapi.rollback_live_migration_at_destination( 

10341 context, instance, dest, destroy_disks=destroy_disks, 

10342 migrate_data=migrate_data) 

10343 else: 

10344 # The port binding profiles need to be cleaned up. 

10345 with errors_out_migration_ctxt(migration): 

10346 try: 

10347 # This call will delete any inactive destination host 

10348 # port bindings. 

10349 self.network_api.setup_networks_on_host( 

10350 context, instance, host=dest, teardown=True) 

10351 except exception.PortBindingDeletionFailed as e: 

10352 # Removing the inactive port bindings from the destination 

10353 # host is not critical so just log an error but don't fail. 

10354 LOG.error( 

10355 'Network cleanup failed for destination host %s ' 

10356 'during live migration rollback. You may need to ' 

10357 'manually clean up resources in the network service. ' 

10358 'Error: %s', dest, str(e)) 

10359 except Exception: 

10360 with excutils.save_and_reraise_exception(): 

10361 LOG.exception( 

10362 'An error occurred while cleaning up networking ' 

10363 'during live migration rollback.', 

10364 instance=instance) 

10365 

10366 # NOTE(luyao): We drop move_claim and migration_context after cleanup 

10367 # is complete, to ensure the specific resources claimed on destination 

10368 # are released safely. 

10369 # TODO(artom) drop_move_claim_at_destination() is new in RPC 5.3, only 

10370 # call it if we performed a NUMA-aware live migration (which implies us 

10371 # being able to send RPC 5.3). To check this, we can use the 

10372 # src_supports_numa_live_migration flag, as it will be set if and only 

10373 # if: 

10374 # - dst_supports_numa_live_migration made its way to the source 

10375 # (meaning both dest and source are new and conductor can speak 

10376 # RPC 5.3) 

10377 # - src_supports_numa_live_migration was set by the source driver and 

10378 # passed the send-RPC-5.3 check. 

10379 # This check can be removed in RPC 6.0. 

10380 if ('src_supports_numa_live_migration' in migrate_data and 

10381 migrate_data.src_supports_numa_live_migration): 

10382 LOG.debug('Calling destination to drop move claim.', 

10383 instance=instance) 

10384 self.compute_rpcapi.drop_move_claim_at_destination(context, 

10385 instance, dest) 

10386 

10387 # NOTE(luyao): We only update instance info after rollback operations 

10388 # are complete 

10389 instance.task_state = None 

10390 instance.progress = 0 

10391 instance.drop_migration_context() 

10392 instance.save(expected_task_state=[task_states.MIGRATING]) 

10393 

10394 self._notify_about_instance_usage(context, instance, 

10395 "live_migration._rollback.end") 

10396 compute_utils.notify_about_instance_action(context, instance, 

10397 self.host, 

10398 action=fields.NotificationAction.LIVE_MIGRATION_ROLLBACK, 

10399 phase=fields.NotificationPhase.END, 

10400 bdms=bdms) 

10401 

10402 # NOTE(luyao): we have cleaned up everything and gotten the instance 

10403 # back to normal status; now set the migration status to 'failed' 

10404 self._set_migration_status(migration, migration_status) 

10405 

10406 @wrap_exception() 

10407 @wrap_instance_fault 

10408 def drop_move_claim_at_destination(self, context, instance): 

10409 """Called by the source of a live migration during rollback to ask the 

10410 destination to drop the MoveClaim object that was created for the live 

10411 migration on the destination. 

10412 """ 

10413 nodename = self._get_nodename(instance) 

10414 LOG.debug('Dropping live migration resource claim on destination ' 

10415 'node %s', nodename, instance=instance) 

10416 self.rt.drop_move_claim( 

10417 context, instance, nodename, flavor=instance.flavor) 

10418 

10419 @wrap_exception() 

10420 @wrap_instance_event(prefix='compute') 

10421 @wrap_instance_fault 

10422 def rollback_live_migration_at_destination(self, context, instance, 

10423 destroy_disks, 

10424 migrate_data): 

10425 """Cleaning up image directory that is created pre_live_migration. 

10426 

10427 :param context: security context 

10428 :param instance: a nova.objects.instance.Instance object sent over rpc 

10429 :param destroy_disks: whether to destroy volumes or not 

10430 :param migrate_data: contains migration info 

10431 """ 

10432 network_info = self.network_api.get_instance_nw_info(context, instance) 

10433 self._notify_about_instance_usage( 

10434 context, instance, "live_migration.rollback.dest.start", 

10435 network_info=network_info) 

10436 compute_utils.notify_about_instance_action( 

10437 context, instance, self.host, 

10438 action=fields.NotificationAction.LIVE_MIGRATION_ROLLBACK_DEST, 

10439 phase=fields.NotificationPhase.START) 

10440 try: 

10441 # NOTE(tr3buchet): tear down networks on dest host (nova-net) 

10442 # NOTE(mriedem): For neutron, this call will delete any 

10443 # destination host port bindings. 

10444 # TODO(mriedem): We should eventually remove this call from 

10445 # this method (rollback_live_migration_at_destination) since this 

10446 # method is only called conditionally based on whether or not the 

10447 # instance is running on shared storage. _rollback_live_migration 

10448 # already calls this method for neutron if we are running on 

10449 # shared storage. 

10450 self.network_api.setup_networks_on_host(context, instance, 

10451 self.host, teardown=True) 

10452 except exception.PortBindingDeletionFailed as e: 

10453 # Removing the inactive port bindings from the destination 

10454 # host is not critical so just log an error but don't fail. 

10455 LOG.error( 

10456 'Network cleanup failed for destination host %s ' 

10457 'during live migration rollback. You may need to ' 

10458 'manually clean up resources in the network service. ' 

10459 'Error: %s', self.host, str(e)) 

10460 except Exception: 

10461 with excutils.save_and_reraise_exception(): 

10462 # NOTE(tdurakov): even if teardown networks fails driver 

10463 # should try to rollback live migration on destination. 

10464 LOG.exception('An error occurred while deallocating network.', 

10465 instance=instance) 

10466 finally: 

10467 # always run this even if setup_networks_on_host fails 

10468 # NOTE(vish): The mapping is passed in so the driver can disconnect 

10469 # from remote volumes if necessary 

10470 block_device_info = self._get_instance_block_device_info(context, 

10471 instance) 

10472 # free any instance PCI claims done on destination during 

10473 # check_can_live_migrate_destination() 

10474 self.rt.free_pci_device_claims_for_instance(context, instance) 

10475 

10476 # NOTE(luyao): Apply migration_context temporarily since it's 

10477 # on destination host, we rely on instance object to cleanup 

10478 # specific resources like vpmem 

10479 with instance.mutated_migration_context(): 

10480 self.driver.rollback_live_migration_at_destination( 

10481 context, instance, network_info, block_device_info, 

10482 destroy_disks=destroy_disks, migrate_data=migrate_data) 

10483 

10484 self._notify_about_instance_usage( 

10485 context, instance, "live_migration.rollback.dest.end", 

10486 network_info=network_info) 

10487 compute_utils.notify_about_instance_action( 

10488 context, instance, self.host, 

10489 action=fields.NotificationAction.LIVE_MIGRATION_ROLLBACK_DEST, 

10490 phase=fields.NotificationPhase.END) 

10491 

10492 def _require_nw_info_update(self, context, instance): 

10493 """Detect whether there is a mismatch in binding:host_id, or 

10494 binding_failed or unbound binding:vif_type for any of the instance's

10495 ports. 

10496 """ 

10497 # Only update port bindings if compute manager does manage port 

10498 # bindings instead of the compute driver. For example IronicDriver 

10499 # manages the port binding for baremetal instance ports, hence, 

10500 # external intervention with the binding is not desired. 

10501 if self.driver.manages_network_binding_host_id(): 

10502 return False 

10503 

10504 search_opts = {'device_id': instance.uuid, 

10505 'fields': ['binding:host_id', 'binding:vif_type']} 

10506 ports = self.network_api.list_ports(context, **search_opts) 

10507 for p in ports['ports']: 

10508 if p.get('binding:host_id') != self.host: 

10509 return True 

10510 vif_type = p.get('binding:vif_type') 

10511 if (vif_type == network_model.VIF_TYPE_UNBOUND or 

10512 vif_type == network_model.VIF_TYPE_BINDING_FAILED): 

10513 return True 

10514 return False 

10515 

10516 @periodic_task.periodic_task( 

10517 spacing=CONF.heal_instance_info_cache_interval

10518 if CONF.heal_instance_info_cache_interval != 0 

10519 else -1) 

10520 def _heal_instance_info_cache(self, context): 

10521 """Called periodically. On every call, try to update the 

10522 info_cache's network information for another instance by 

10523 calling to the network manager. 

10524 

10525 This is implemented by keeping a cache of uuids of instances 

10526 that live on this host. On each call, we pop one off of a 

10527 list, pull the DB record, and try the call to the network API. 

10528 If anything errors, don't fail, as it's possible the instance

10529 has been deleted, etc. 

10530 """ 

10531 instance_uuids = getattr(self, '_instance_uuids_to_heal', []) 

10532 instance = None 

10533 

10534 LOG.debug('Starting heal instance info cache') 

10535 

10536 if not instance_uuids: 

10537 # The list of instances to heal is empty so rebuild it 

10538 LOG.debug('Rebuilding the list of instances to heal') 

10539 db_instances = objects.InstanceList.get_by_host( 

10540 context, self.host, expected_attrs=[], use_slave=True) 

10541 for inst in db_instances: 

10542 # We don't want to refresh the cache for instances 

10543 # which are building or deleting so don't put them 

10544 # in the list. If they are building they will get 

10545 # added to the list next time we build it. 

10546 if (inst.vm_state == vm_states.BUILDING): 

10547 LOG.debug('Skipping network cache update for instance ' 

10548 'because it is Building.', instance=inst) 

10549 continue 

10550 if (inst.task_state == task_states.DELETING): 

10551 LOG.debug('Skipping network cache update for instance ' 

10552 'because it is being deleted.', instance=inst) 

10553 continue 

10554 

10555 if not instance: 

10556 # Save the first one we find so we don't 

10557 # have to get it again 

10558 instance = inst 

10559 else: 

10560 instance_uuids.append(inst['uuid']) 

10561 

10562 self._instance_uuids_to_heal = instance_uuids 

10563 else: 

10564 # Find the next valid instance on the list 

10565 while instance_uuids: 10565 ↛ 10589  [line 10565 didn't jump to line 10589 because the condition on line 10565 was always true]

10566 try: 

10567 inst = objects.Instance.get_by_uuid( 

10568 context, instance_uuids.pop(0), 

10569 expected_attrs=['system_metadata', 'info_cache', 

10570 'flavor'], 

10571 use_slave=True) 

10572 except exception.InstanceNotFound: 

10573 # Instance is gone. Try to grab another. 

10574 continue 

10575 

10576 # Check the instance hasn't been migrated 

10577 if inst.host != self.host: 

10578 LOG.debug('Skipping network cache update for instance ' 

10579 'because it has been migrated to another ' 

10580 'host.', instance=inst) 

10581 # Check the instance isn't being deleted

10582 elif inst.task_state == task_states.DELETING: 

10583 LOG.debug('Skipping network cache update for instance ' 

10584 'because it is being deleted.', instance=inst) 

10585 else: 

10586 instance = inst 

10587 break 

10588 

10589 if instance: 

10590 # We have an instance now to refresh 

10591 try: 

10592 # Fix potential mismatch in port binding if evacuation failed 

10593 # after reassigning the port binding to the dest host but 

10594 # before the instance host is changed. 

10595 # Do this only when instance has no pending task. 

10596 if instance.task_state is None and \ 

10597 self._require_nw_info_update(context, instance): 

10598 LOG.info("Updating ports in neutron", instance=instance) 

10599 self.network_api.setup_instance_network_on_host( 

10600 context, instance, self.host) 

10601 # Call the network API to get instance info; this will

10602 # force an update to the instance's info_cache 

10603 self.network_api.get_instance_nw_info( 

10604 context, instance, force_refresh=True) 

10605 LOG.debug('Updated the network info_cache for instance', 

10606 instance=instance) 

10607 except exception.InstanceNotFound: 

10608 # Instance is gone. 

10609 LOG.debug('Instance no longer exists. Unable to refresh', 

10610 instance=instance) 

10611 return 

10612 except exception.InstanceInfoCacheNotFound: 

10613 # InstanceInfoCache is gone. 

10614 LOG.debug('InstanceInfoCache no longer exists. ' 

10615 'Unable to refresh', instance=instance) 

10616 except Exception: 

10617 LOG.error('An error occurred while refreshing the network ' 

10618 'cache.', instance=instance, exc_info=True) 

10619 else: 

10620 LOG.debug("Didn't find any instances for network info cache " 

10621 "update.") 

10622 
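# NOTE: illustrative sketch only, not part of nova/compute/manager.py. The
# docstring of _heal_instance_info_cache above describes a "pop one uuid per
# periodic pass" pattern. The standalone helper below is a hypothetical,
# simplified rendering of that idea; the names HealQueue and heal_one are
# invented for illustration and are not nova APIs.
class HealQueue:
    """Keep a local list of uuids and refresh exactly one per call."""

    def __init__(self, list_uuids):
        # list_uuids: callable returning the uuids currently on this host.
        self._list_uuids = list_uuids
        self._pending = []

    def heal_one(self, refresh):
        # Rebuild the pending list only once it has been exhausted.
        if not self._pending:
            self._pending = list(self._list_uuids())
        if not self._pending:
            return None
        # Refresh a single instance; a failure for one uuid must not abort
        # the whole periodic pass, mirroring the behaviour described above.
        uuid = self._pending.pop(0)
        try:
            refresh(uuid)
        except Exception:
            pass
        return uuid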

10623 @periodic_task.periodic_task 

10624 def _poll_rebooting_instances(self, context): 

10625 if CONF.reboot_timeout > 0: 

10626 filters = {'task_state': 

10627 [task_states.REBOOTING, 

10628 task_states.REBOOT_STARTED, 

10629 task_states.REBOOT_PENDING], 

10630 'host': self.host} 

10631 rebooting = objects.InstanceList.get_by_filters( 

10632 context, filters, expected_attrs=[], use_slave=True) 

10633 

10634 to_poll = [] 

10635 for instance in rebooting: 

10636 if timeutils.is_older_than(instance.updated_at, 10636 ↛ 10635  [line 10636 didn't jump to line 10635 because the condition on line 10636 was always true]

10637 CONF.reboot_timeout): 

10638 to_poll.append(instance) 

10639 

10640 self.driver.poll_rebooting_instances(CONF.reboot_timeout, to_poll) 

10641 

10642 @periodic_task.periodic_task 

10643 def _poll_rescued_instances(self, context): 

10644 if CONF.rescue_timeout > 0: 

10645 filters = {'vm_state': vm_states.RESCUED, 

10646 'host': self.host} 

10647 rescued_instances = objects.InstanceList.get_by_filters( 

10648 context, filters, expected_attrs=["system_metadata"], 

10649 use_slave=True) 

10650 

10651 to_unrescue = [] 

10652 for instance in rescued_instances: 

10653 if timeutils.is_older_than(instance.launched_at, 

10654 CONF.rescue_timeout): 

10655 to_unrescue.append(instance) 

10656 

10657 for instance in to_unrescue: 

10658 self.compute_api.unrescue(context, instance) 

10659 
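# NOTE: illustrative sketch only, not part of nova/compute/manager.py. The
# reboot and rescue polling tasks above share one selection step: keep only
# the records whose timestamp is older than a configured timeout. A minimal
# standalone example of that step, assuming plain (name, timestamp) tuples
# (select_expired is an invented name):
import datetime

from oslo_utils import timeutils


def select_expired(records, timeout_seconds):
    """Return the names whose timestamp is older than timeout_seconds."""
    return [name for name, stamp in records
            if timeutils.is_older_than(stamp, timeout_seconds)]


# Example: with a two-hour-old entry and a one-hour timeout, only 'stale'
# would be returned.
_example = [('fresh', timeutils.utcnow()),
            ('stale', timeutils.utcnow() - datetime.timedelta(hours=2))]
# select_expired(_example, 3600) -> ['stale']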

10660 @periodic_task.periodic_task 

10661 def _poll_unconfirmed_resizes(self, context): 

10662 if CONF.resize_confirm_window == 0: 

10663 return 

10664 

10665 migrations = objects.MigrationList.get_unconfirmed_by_dest_compute( 

10666 context, CONF.resize_confirm_window, self.host, 

10667 use_slave=True) 

10668 

10669 migrations_info = dict(migration_count=len(migrations), 

10670 confirm_window=CONF.resize_confirm_window) 

10671 

10672 if migrations_info["migration_count"] > 0: 10672 ↛ 10677  [line 10672 didn't jump to line 10677 because the condition on line 10672 was always true]

10673 LOG.info("Found %(migration_count)d unconfirmed migrations " 

10674 "older than %(confirm_window)d seconds", 

10675 migrations_info) 

10676 

10677 def _set_migration_to_error(migration, reason, **kwargs): 

10678 LOG.warning("Setting migration %(migration_id)s to error: " 

10679 "%(reason)s", 

10680 {'migration_id': migration.id, 'reason': reason}, 

10681 **kwargs) 

10682 migration.status = 'error' 

10683 migration.save() 

10684 

10685 for migration in migrations: 

10686 instance_uuid = migration.instance_uuid 

10687 LOG.info("Automatically confirming migration " 

10688 "%(migration_id)s for instance %(instance_uuid)s", 

10689 {'migration_id': migration.id, 

10690 'instance_uuid': instance_uuid}) 

10691 expected_attrs = ['metadata', 'system_metadata'] 

10692 try: 

10693 instance = objects.Instance.get_by_uuid(context, 

10694 instance_uuid, expected_attrs=expected_attrs, 

10695 use_slave=True) 

10696 except exception.InstanceNotFound: 

10697 reason = (_("Instance %s not found") % 

10698 instance_uuid) 

10699 _set_migration_to_error(migration, reason) 

10700 continue 

10701 if instance.vm_state == vm_states.ERROR: 

10702 reason = _("In ERROR state") 

10703 _set_migration_to_error(migration, reason, 

10704 instance=instance) 

10705 continue 

10706 # race condition: an instance in DELETING state should not have its

10707 # migration set to error, otherwise the soon-to-be-deleted instance,

10708 # which is in RESIZED state,

10709 # will not be able to confirm the resize

10710 if instance.task_state in [task_states.DELETING, 

10711 task_states.SOFT_DELETING]: 

10712 msg = ("Instance being deleted or soft deleted during resize " 

10713 "confirmation. Skipping.") 

10714 LOG.debug(msg, instance=instance) 

10715 continue 

10716 

10717 # race condition: This condition is hit when this method is 

10718 # called between the save of the migration record with a status of 

10719 # finished and the save of the instance object with a state of 

10720 # RESIZED. The migration record should not be set to error. 

10721 if instance.task_state == task_states.RESIZE_FINISH: 

10722 msg = ("Instance still resizing during resize " 

10723 "confirmation. Skipping.") 

10724 LOG.debug(msg, instance=instance) 

10725 continue 

10726 

10727 vm_state = instance.vm_state 

10728 task_state = instance.task_state 

10729 if vm_state != vm_states.RESIZED or task_state is not None: 

10730 reason = (_("In states %(vm_state)s/%(task_state)s, not " 

10731 "RESIZED/None") % 

10732 {'vm_state': vm_state, 

10733 'task_state': task_state}) 

10734 _set_migration_to_error(migration, reason, 

10735 instance=instance) 

10736 continue 

10737 try: 

10738 self.compute_api.confirm_resize(context, instance, 

10739 migration=migration) 

10740 except Exception as e: 

10741 LOG.info("Error auto-confirming resize: %s. " 

10742 "Will retry later.", e, instance=instance) 

10743 

10744 @periodic_task.periodic_task(spacing=CONF.shelved_poll_interval) 

10745 def _poll_shelved_instances(self, context): 

10746 

10747 if CONF.shelved_offload_time <= 0: 

10748 return 

10749 

10750 filters = {'vm_state': vm_states.SHELVED, 

10751 'task_state': None, 

10752 'host': self.host} 

10753 shelved_instances = objects.InstanceList.get_by_filters( 

10754 context, filters=filters, expected_attrs=['system_metadata'], 

10755 use_slave=True) 

10756 

10757 to_gc = [] 

10758 for instance in shelved_instances: 

10759 sys_meta = instance.system_metadata 

10760 shelved_at = timeutils.parse_strtime(sys_meta['shelved_at']) 

10761 if timeutils.is_older_than(shelved_at, CONF.shelved_offload_time): 

10762 to_gc.append(instance) 

10763 

10764 cyclient = cyborg.get_client(context) 

10765 for instance in to_gc: 

10766 try: 

10767 instance.task_state = task_states.SHELVING_OFFLOADING 

10768 instance.save(expected_task_state=(None,)) 

10769 accel_uuids = [] 

10770 if instance.flavor.extra_specs.get('accel:device_profile'): 

10771 # TODO(brinzhang): After cyborg supports batch querying ARQs

10772 # for more than one instance, we will improve the efficiency

10773 # of this implementation.

10774 accel_uuids = cyclient.get_arq_uuids_for_instance(instance) 

10775 self.shelve_offload_instance( 

10776 context, instance, clean_shutdown=False, 

10777 accel_uuids=accel_uuids) 

10778 except Exception: 

10779 LOG.exception('Periodic task failed to offload instance.', 

10780 instance=instance) 

10781 

10782 @periodic_task.periodic_task 

10783 def _instance_usage_audit(self, context): 

10784 if not CONF.instance_usage_audit: 

10785 return 

10786 

10787 begin, end = utils.last_completed_audit_period() 

10788 if objects.TaskLog.get(context, 'instance_usage_audit', begin, end, 10788 ↛ 10790  [line 10788 didn't jump to line 10790 because the condition on line 10788 was never true]

10789 self.host): 

10790 return 

10791 

10792 instances = objects.InstanceList.get_active_by_window_joined( 

10793 context, begin, end, host=self.host, 

10794 expected_attrs=['system_metadata', 'info_cache', 'metadata', 

10795 'flavor'], 

10796 use_slave=True) 

10797 num_instances = len(instances) 

10798 errors = 0 

10799 successes = 0 

10800 LOG.info("Running instance usage audit for host %(host)s " 

10801 "from %(begin_time)s to %(end_time)s. " 

10802 "%(number_instances)s instances.", 

10803 {'host': self.host, 

10804 'begin_time': begin, 

10805 'end_time': end, 

10806 'number_instances': num_instances}) 

10807 start_time = time.time() 

10808 task_log = objects.TaskLog(context) 

10809 task_log.task_name = 'instance_usage_audit' 

10810 task_log.period_beginning = begin 

10811 task_log.period_ending = end 

10812 task_log.host = self.host 

10813 task_log.task_items = num_instances 

10814 task_log.message = 'Instance usage audit started...' 

10815 task_log.begin_task() 

10816 for instance in instances: 

10817 try: 

10818 compute_utils.notify_usage_exists( 

10819 self.notifier, context, instance, self.host, 

10820 ignore_missing_network_data=False) 

10821 successes += 1 

10822 except Exception: 

10823 LOG.exception('Failed to generate usage ' 

10824 'audit for instance ' 

10825 'on host %s', self.host, 

10826 instance=instance) 

10827 errors += 1 

10828 task_log.errors = errors 

10829 task_log.message = ( 

10830 'Instance usage audit ran for host %s, %s instances in %s seconds.' 

10831 % (self.host, num_instances, time.time() - start_time)) 

10832 task_log.end_task() 

10833 

10834 def _get_host_volume_bdms(self, context, use_slave=False): 

10835 """Return all block device mappings on a compute host.""" 

10836 compute_host_bdms = [] 

10837 instances = objects.InstanceList.get_by_host(context, self.host, 

10838 use_slave=use_slave) 

10839 for instance in instances: 

10840 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( 

10841 context, instance.uuid, use_slave=use_slave) 

10842 instance_bdms = [bdm for bdm in bdms if bdm.is_volume] 

10843 compute_host_bdms.append(dict(instance=instance, 

10844 instance_bdms=instance_bdms)) 

10845 

10846 return compute_host_bdms 

10847 

10848 def _update_volume_usage_cache(self, context, vol_usages): 

10849 """Updates the volume usage cache table with a list of stats.""" 

10850 for usage in vol_usages: 

10851 # Allow switching of greenthreads between queries. 

10852 greenthread.sleep(0) 

10853 vol_usage = objects.VolumeUsage(context) 

10854 vol_usage.volume_id = usage['volume'] 

10855 vol_usage.instance_uuid = usage['instance'].uuid 

10856 vol_usage.project_id = usage['instance'].project_id 

10857 vol_usage.user_id = usage['instance'].user_id 

10858 vol_usage.availability_zone = usage['instance'].availability_zone 

10859 vol_usage.curr_reads = usage['rd_req'] 

10860 vol_usage.curr_read_bytes = usage['rd_bytes'] 

10861 vol_usage.curr_writes = usage['wr_req'] 

10862 vol_usage.curr_write_bytes = usage['wr_bytes'] 

10863 vol_usage.save() 

10864 self.notifier.info(context, 'volume.usage', vol_usage.to_dict()) 

10865 compute_utils.notify_about_volume_usage(context, vol_usage, 

10866 self.host) 

10867 

10868 @periodic_task.periodic_task(spacing=CONF.volume_usage_poll_interval) 

10869 def _poll_volume_usage(self, context): 

10870 if CONF.volume_usage_poll_interval == 0: 

10871 return 

10872 

10873 compute_host_bdms = self._get_host_volume_bdms(context, 

10874 use_slave=True) 

10875 if not compute_host_bdms: 

10876 return 

10877 

10878 LOG.debug("Updating volume usage cache") 

10879 try: 

10880 vol_usages = self.driver.get_all_volume_usage(context, 

10881 compute_host_bdms) 

10882 except NotImplementedError: 

10883 return 

10884 

10885 self._update_volume_usage_cache(context, vol_usages) 

10886 

10887 @periodic_task.periodic_task(spacing=CONF.sync_power_state_interval, 

10888 run_immediately=True) 

10889 def _sync_power_states(self, context): 

10890 """Align power states between the database and the hypervisor. 

10891 

10892 To sync power state data we make a DB call to get the number of 

10893 virtual machines known by the hypervisor and if the number matches the 

10894 number of virtual machines known by the database, we proceed in a lazy 

10895 loop, one database record at a time, checking if the hypervisor has the 

10896 same power state as is in the database. 

10897 """ 

10898 db_instances = objects.InstanceList.get_by_host(context, self.host, 

10899 expected_attrs=[], 

10900 use_slave=True) 

10901 

10902 try: 

10903 num_vm_instances = self.driver.get_num_instances() 

10904 except exception.VirtDriverNotReady as e: 

10905 # If the virt driver is not ready, like ironic-api not being up 

10906 # yet in the case of ironic, just log it and exit. 

10907 LOG.info('Skipping _sync_power_states periodic task due to: %s', e) 

10908 return 

10909 

10910 num_db_instances = len(db_instances) 

10911 

10912 if num_vm_instances != num_db_instances: 

10913 LOG.warning("While synchronizing instance power states, found " 

10914 "%(num_db_instances)s instances in the database " 

10915 "and %(num_vm_instances)s instances on the " 

10916 "hypervisor.", 

10917 {'num_db_instances': num_db_instances, 

10918 'num_vm_instances': num_vm_instances}) 

10919 

10920 def _sync(db_instance): 

10921 # NOTE(melwitt): This must be synchronized as we query state from 

10922 # two separate sources, the driver and the database. 

10923 # They are set (in stop_instance) and read, in sync. 

10924 @utils.synchronized(db_instance.uuid) 

10925 def query_driver_power_state_and_sync(): 

10926 self._query_driver_power_state_and_sync(context, db_instance) 

10927 

10928 try: 

10929 query_driver_power_state_and_sync() 

10930 except Exception: 

10931 LOG.exception("Periodic sync_power_state task had an " 

10932 "error while processing an instance.", 

10933 instance=db_instance) 

10934 

10935 self._syncs_in_progress.pop(db_instance.uuid) 

10936 

10937 for db_instance in db_instances: 

10938 # process syncs asynchronously - don't want instance locking to 

10939 # block entire periodic task thread 

10940 uuid = db_instance.uuid 

10941 if uuid in self._syncs_in_progress: 10941 ↛ 10942  [line 10941 didn't jump to line 10942 because the condition on line 10941 was never true]

10942 LOG.debug('Sync already in progress for %s', uuid) 

10943 else: 

10944 LOG.debug('Triggering sync for uuid %s', uuid) 

10945 self._syncs_in_progress[uuid] = True 

10946 nova.utils.pass_context(self._sync_power_pool.spawn_n, 

10947 _sync, 

10948 db_instance) 

10949 

10950 def _query_driver_power_state_and_sync(self, context, db_instance): 

10951 if db_instance.task_state is not None: 

10952 LOG.info("During sync_power_state the instance has a " 

10953 "pending task (%(task)s). Skip.", 

10954 {'task': db_instance.task_state}, instance=db_instance) 

10955 return 

10956 # No pending tasks. Now try to figure out the real vm_power_state. 

10957 try: 

10958 vm_instance = self.driver.get_info(db_instance) 

10959 vm_power_state = vm_instance.state 

10960 except exception.InstanceNotFound: 

10961 vm_power_state = power_state.NOSTATE 

10962 # Note(maoy): the above get_info call might take a long time, 

10963 # for example, because of a broken libvirt driver. 

10964 try: 

10965 self._sync_instance_power_state(context, 

10966 db_instance, 

10967 vm_power_state, 

10968 use_slave=True) 

10969 except exception.InstanceNotFound: 

10970 # NOTE(hanlind): If the instance gets deleted during sync, 

10971 # silently ignore. 

10972 pass 

10973 

10974 def _stop_unexpected_shutdown_instance(self, context, vm_state, 

10975 db_instance, orig_db_power_state): 

10976 # this is an exceptional case; make sure our data is up 

10977 # to date before slamming through a power off 

10978 vm_instance = self.driver.get_info(db_instance, 

10979 use_cache=False) 

10980 vm_power_state = vm_instance.state 

10981 

10982 # if it still looks off, go ahead and call stop() 

10983 if vm_power_state in (power_state.SHUTDOWN, 10983 ↛ exit  [line 10983 didn't return from function '_stop_unexpected_shutdown_instance' because the condition on line 10983 was always true]

10984 power_state.CRASHED): 

10985 

10986 LOG.warning("Instance shutdown by itself. Calling the " 

10987 "stop API. Current vm_state: %(vm_state)s, " 

10988 "current task_state: %(task_state)s, " 

10989 "original DB power_state: %(db_power_state)s, " 

10990 "current VM power_state: %(vm_power_state)s", 

10991 {'vm_state': vm_state, 

10992 'task_state': db_instance.task_state, 

10993 'db_power_state': orig_db_power_state, 

10994 'vm_power_state': vm_power_state}, 

10995 instance=db_instance) 

10996 try: 

10997 # Note(maoy): here we call the API instead of 

10998 # brutally updating the vm_state in the database 

10999 # to allow all the hooks and checks to be performed. 

11000 if db_instance.shutdown_terminate: 

11001 self.compute_api.delete(context, db_instance) 

11002 else: 

11003 self.compute_api.stop(context, db_instance) 

11004 except Exception: 

11005 # Note(maoy): there is no need to propagate the error 

11006 # because the same power_state will be retrieved next 

11007 # time and retried. 

11008 # For example, there might be another task scheduled. 

11009 LOG.exception("error during stop() in sync_power_state.", 

11010 instance=db_instance) 

11011 

11012 def _sync_instance_power_state(self, context, db_instance, vm_power_state, 

11013 use_slave=False): 

11014 """Align instance power state between the database and hypervisor. 

11015 

11016 If the instance is not found on the hypervisor, but is in the database, 

11017 then a stop() API will be called on the instance. 

11018 """ 

11019 

11020 # We re-query the DB to get the latest instance info to minimize 

11021 # (not eliminate) race condition. 

11022 db_instance.refresh(use_slave=use_slave) 

11023 db_power_state = db_instance.power_state 

11024 vm_state = db_instance.vm_state 

11025 

11026 if self.host != db_instance.host: 11026 ↛ 11034  [line 11026 didn't jump to line 11034 because the condition on line 11026 was never true]

11027 # on the sending end of nova-compute _sync_power_state 

11028 # may have yielded to the greenthread performing a live 

11029 # migration; this in turn has changed the resident-host 

11030 # for the VM; However, the instance is still active, it 

11031 # is just in the process of migrating to another host. 

11032 # This implies that the compute source must relinquish 

11033 # control to the compute destination. 

11034 LOG.info("During the sync_power process the " 

11035 "instance has moved from " 

11036 "host %(src)s to host %(dst)s", 

11037 {'src': db_instance.host, 

11038 'dst': self.host}, 

11039 instance=db_instance) 

11040 return 

11041 elif db_instance.task_state is not None: 11041 ↛ 11047  [line 11041 didn't jump to line 11047 because the condition on line 11041 was never true]

11042 # on the receiving end of nova-compute, it could happen

11043 # that the DB instance already reports the new resident host

11044 # but the actual VM has not shown up on the hypervisor

11045 # yet. In this case, let's allow the loop to continue

11046 # and run the state sync in a later round

11047 LOG.info("During sync_power_state the instance has a " 

11048 "pending task (%(task)s). Skip.", 

11049 {'task': db_instance.task_state}, 

11050 instance=db_instance) 

11051 return 

11052 

11053 orig_db_power_state = db_power_state 

11054 if vm_power_state != db_power_state: 

11055 LOG.info('During _sync_instance_power_state the DB ' 

11056 'power_state (%(db_power_state)s) does not match ' 

11057 'the vm_power_state from the hypervisor ' 

11058 '(%(vm_power_state)s). Updating power_state in the ' 

11059 'DB to match the hypervisor.', 

11060 {'db_power_state': db_power_state, 

11061 'vm_power_state': vm_power_state}, 

11062 instance=db_instance) 

11063 # power_state is always updated from hypervisor to db 

11064 db_instance.power_state = vm_power_state 

11065 db_instance.save() 

11066 db_power_state = vm_power_state 

11067 

11068 # Note(maoy): Now resolve the discrepancy between vm_state and 

11069 # vm_power_state. We go through all possible vm_states. 

11070 if vm_state in (vm_states.BUILDING, 11070 ↛ 11076  [line 11070 didn't jump to line 11076 because the condition on line 11070 was never true]

11071 vm_states.RESCUED, 

11072 vm_states.RESIZED, 

11073 vm_states.SUSPENDED, 

11074 vm_states.ERROR): 

11075 # TODO(maoy): we ignore these vm_state for now. 

11076 pass 

11077 elif vm_state == vm_states.ACTIVE: 

11078 # The only rational power state should be RUNNING 

11079 if vm_power_state in (power_state.SHUTDOWN, 

11080 power_state.CRASHED): 

11081 self._stop_unexpected_shutdown_instance( 

11082 context, vm_state, db_instance, orig_db_power_state) 

11083 elif vm_power_state == power_state.SUSPENDED: 

11084 LOG.warning("Instance is suspended unexpectedly. Calling " 

11085 "the stop API.", instance=db_instance) 

11086 try: 

11087 self.compute_api.stop(context, db_instance) 

11088 except Exception: 

11089 LOG.exception("error during stop() in sync_power_state.", 

11090 instance=db_instance) 

11091 elif vm_power_state == power_state.PAUSED: 

11092 # Note(maoy): a VM may get into the paused state not only

11093 # because of user requests via API calls, but also

11094 # due to (temporary) external instrumentation.

11095 # Before the virt layer can reliably report the reason, 

11096 # we simply ignore the state discrepancy. In many cases, 

11097 # the VM state will go back to running after the external 

11098 # instrumentation is done. See bug 1097806 for details. 

11099 LOG.warning("Instance is paused unexpectedly. Ignore.", 

11100 instance=db_instance) 

11101 elif vm_power_state == power_state.NOSTATE: 

11102 # Occasionally, depending on the status of the hypervisor, 

11103 # which could be restarting for example, an instance may 

11104 # not be found. Therefore just log the condition. 

11105 LOG.warning("Instance is unexpectedly not found. Ignore.", 

11106 instance=db_instance) 

11107 elif vm_state == vm_states.STOPPED: 

11108 if vm_power_state not in (power_state.NOSTATE, 11108 ↛ exit  [line 11108 didn't return from function '_sync_instance_power_state' because the condition on line 11108 was always true]

11109 power_state.SHUTDOWN, 

11110 power_state.CRASHED): 

11111 LOG.warning("Instance is not stopped. Calling " 

11112 "the stop API. Current vm_state: %(vm_state)s," 

11113 " current task_state: %(task_state)s, " 

11114 "original DB power_state: %(db_power_state)s, " 

11115 "current VM power_state: %(vm_power_state)s", 

11116 {'vm_state': vm_state, 

11117 'task_state': db_instance.task_state, 

11118 'db_power_state': orig_db_power_state, 

11119 'vm_power_state': vm_power_state}, 

11120 instance=db_instance) 

11121 try: 

11122 # NOTE(russellb) Force the stop, because normally the 

11123 # compute API would not allow an attempt to stop a stopped 

11124 # instance. 

11125 self.compute_api.force_stop(context, db_instance) 

11126 except Exception: 

11127 LOG.exception("error during stop() in sync_power_state.", 

11128 instance=db_instance) 

11129 elif vm_state == vm_states.PAUSED: 

11130 if vm_power_state in (power_state.SHUTDOWN, 11130 ↛ exit  [line 11130 didn't return from function '_sync_instance_power_state' because the condition on line 11130 was always true]

11131 power_state.CRASHED): 

11132 LOG.warning("Paused instance shutdown by itself. Calling " 

11133 "the stop API.", instance=db_instance) 

11134 try: 

11135 self.compute_api.force_stop(context, db_instance) 

11136 except Exception: 

11137 LOG.exception("error during stop() in sync_power_state.", 

11138 instance=db_instance) 

11139 elif vm_state in (vm_states.SOFT_DELETED, 11139 ↛ exit  [line 11139 didn't return from function '_sync_instance_power_state' because the condition on line 11139 was always true]

11140 vm_states.DELETED): 

11141 if vm_power_state not in (power_state.NOSTATE, 11141 ↛ 11145  [line 11141 didn't jump to line 11145 because the condition on line 11141 was never true]

11142 power_state.SHUTDOWN): 

11143 # Note(maoy): this should be taken care of periodically in 

11144 # _cleanup_running_deleted_instances(). 

11145 LOG.warning("Instance is not (soft-)deleted.", 

11146 instance=db_instance) 

11147 
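# NOTE: illustrative sketch only, not part of nova/compute/manager.py.
# _sync_instance_power_state above resolves (vm_state, power_state)
# discrepancies case by case. The helper below is a hypothetical, simplified
# summary of those decisions using plain strings instead of nova's constants;
# it is not exhaustive and exists only to make the branching easier to follow.
def summarize_sync_action(vm_state, vm_power_state):
    """Return a short label for the action taken for a given combination."""
    if vm_state == 'active' and vm_power_state in ('shutdown', 'crashed'):
        return 'call the stop API'
    if vm_state == 'active' and vm_power_state == 'suspended':
        return 'call the stop API'
    if vm_state == 'active' and vm_power_state in ('paused', 'nostate'):
        return 'ignore and log a warning'
    if vm_state == 'stopped' and vm_power_state == 'running':
        return 'force stop'
    if vm_state == 'paused' and vm_power_state in ('shutdown', 'crashed'):
        return 'force stop'
    return 'no action'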

11148 @periodic_task.periodic_task 

11149 def _reclaim_queued_deletes(self, context): 

11150 """Reclaim instances that are queued for deletion.""" 

11151 interval = CONF.reclaim_instance_interval 

11152 if interval <= 0: 

11153 LOG.debug("CONF.reclaim_instance_interval <= 0, skipping...") 

11154 return 

11155 

11156 filters = {'vm_state': vm_states.SOFT_DELETED, 

11157 'task_state': None, 

11158 'host': self.host} 

11159 instances = objects.InstanceList.get_by_filters( 

11160 context, filters, 

11161 expected_attrs=objects.instance.INSTANCE_DEFAULT_FIELDS, 

11162 use_slave=True) 

11163 for instance in instances: 

11164 if self._deleted_old_enough(instance, interval): 

11165 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( 

11166 context, instance.uuid) 

11167 LOG.info('Reclaiming deleted instance', instance=instance) 

11168 try: 

11169 self._delete_instance(context, instance, bdms) 

11170 except Exception as e: 

11171 LOG.warning("Periodic reclaim failed to delete " 

11172 "instance: %s", 

11173 e, instance=instance) 

11174 

11175 def _get_nodename(self, instance, refresh=False): 

11176 """Helper method to get the name of the first available node 

11177 on this host. This method should not be used with any operations 

11178 on ironic instances since it does not handle multiple nodes. 

11179 """ 

11180 node = self.driver.get_available_nodes(refresh=refresh)[0] 

11181 LOG.debug("No node specified, defaulting to %s", node, 

11182 instance=instance) 

11183 return node 

11184 

11185 def _update_available_resource_for_node(self, context, nodename, 

11186 startup=False): 

11187 

11188 try: 

11189 self.rt.update_available_resource(context, nodename, 

11190 startup=startup) 

11191 except exception.ComputeHostNotFound: 

11192 LOG.warning("Compute node '%s' not found in " 

11193 "update_available_resource.", nodename) 

11194 except exception.ReshapeFailed: 

11195 # We're only supposed to get here on startup, if a reshape was 

11196 # needed, was attempted, and failed. We want to kill the service. 

11197 with excutils.save_and_reraise_exception(): 

11198 LOG.critical("Resource provider data migration failed " 

11199 "fatally during startup for node %s.", nodename) 

11200 except exception.ReshapeNeeded: 

11201 # This exception should only find its way here if the virt driver's 

11202 # update_provider_tree raised it incorrectly: either 

11203 # a) After the resource tracker already caught it once and 

11204 # reinvoked update_provider_tree with allocations. At this point 

11205 # the driver is just supposed to *do* the reshape, so if it raises 

11206 # ReshapeNeeded, it's a bug, and we want to kill the compute 

11207 # service. 

11208 # b) On periodic rather than startup (we only allow reshapes to 

11209 # happen on startup). In this case we'll just make the logs red and 

11210 # go again at the next periodic interval, where the same thing may 

11211 # or may not happen again. Depending on the previous and intended 

11212 # shape of the providers/inventories, this may not actually cause 

11213 # any immediately visible symptoms (in terms of scheduling, etc.) 

11214 # If this becomes a problem, we may wish to make it pop immediately 

11215 # (e.g. disable the service). 

11216 with excutils.save_and_reraise_exception(): 

11217 LOG.exception("ReshapeNeeded exception is unexpected here!") 

11218 except exception.PlacementPciException: 

11219 # If we are at startup and the Placement PCI inventory handling 

11220 # failed then probably there is a configuration error. Propagate 

11221 # the error up to kill the service. 

11222 if startup: 

11223 raise 

11224 # If we are not at startup then we can assume that the 

11225 # configuration was correct at startup so the error is probably 

11226 # transient. Anyhow we cannot kill the service any more so just 

11227 # log the error and continue. 

11228 LOG.exception( 

11229 "Error updating PCI resources for node %(node)s.", 

11230 {'node': nodename}) 

11231 except exception.InvalidConfiguration as e: 

11232 if startup: 

11233 # If this happens during startup, we need to let it raise to 

11234 # abort our service startup. 

11235 raise 

11236 else: 

11237 LOG.error("Error updating resources for node %s: %s", 

11238 nodename, e) 

11239 except Exception: 

11240 LOG.exception("Error updating resources for node %(node)s.", 

11241 {'node': nodename}) 

11242 

11243 @periodic_task.periodic_task(spacing=CONF.update_resources_interval) 

11244 def update_available_resource(self, context, startup=False): 

11245 """See driver.get_available_resource() 

11246 

11247 Periodic process that keeps the compute host's understanding of

11248 resource availability and usage in sync with the underlying hypervisor. 

11249 

11250 :param context: security context 

11251 :param startup: True if this is being called when the nova-compute 

11252 service is starting, False otherwise. 

11253 """ 

11254 try: 

11255 nodenames = set(self.driver.get_available_nodes()) 

11256 except exception.VirtDriverNotReady: 

11257 LOG.warning("Virt driver is not ready.") 

11258 return 

11259 

11260 compute_nodes_in_db = self._get_compute_nodes_in_db(context, 

11261 nodenames, 

11262 use_slave=True, 

11263 startup=startup) 

11264 

11265 self.rt.clean_compute_node_cache(compute_nodes_in_db) 

11266 

11267 # Delete orphan compute node not reported by driver but still in db 

11268 for cn in compute_nodes_in_db: 

11269 if cn.hypervisor_hostname not in nodenames: 

11270 # if the node could be migrated, we don't delete 

11271 # the compute node database records 

11272 if not self.driver.is_node_deleted(cn.hypervisor_hostname): 

11273 LOG.warning( 

11274 "Found orphan compute node %(id)s " 

11275 "hypervisor host is %(hh)s, " 

11276 "nodes are %(nodes)s. " 

11277 "We are not deleting this as the driver " 

11278 "says this node has not been deleted.", 

11279 {'id': cn.id, 'hh': cn.hypervisor_hostname, 

11280 'nodes': nodenames}) 

11281 continue 

11282 

11283 LOG.info("Deleting orphan compute node %(id)s " 

11284 "hypervisor host is %(hh)s, " 

11285 "nodes are %(nodes)s", 

11286 {'id': cn.id, 'hh': cn.hypervisor_hostname, 

11287 'nodes': nodenames}) 

11288 try: 

11289 cn.destroy() 

11290 except exception.ObjectActionError: 

11291 # NOTE(mgoddard): it's possible that another compute 

11292 # service took ownership of this compute node since we 

11293 # queried it due to a rebalance, and this will cause the 

11294 # deletion to fail. Ignore the error in that case. 

11295 LOG.info("Ignoring failure to delete orphan compute node " 

11296 "%(id)s on hypervisor host %(hh)s due to " 

11297 "possible node rebalance", 

11298 {'id': cn.id, 'hh': cn.hypervisor_hostname}) 

11299 self.rt.remove_node(cn.hypervisor_hostname) 

11300 self.reportclient.invalidate_resource_provider(cn.uuid) 

11301 else: 

11302 self.rt.remove_node(cn.hypervisor_hostname) 

11303 # Delete the corresponding resource provider in placement, 

11304 # along with any associated allocations. 

11305 try: 

11306 self.reportclient.delete_resource_provider( 

11307 context, cn, cascade=True) 

11308 except keystone_exception.ClientException as e: 

11309 LOG.error( 

11310 "Failed to delete compute node resource provider " 

11311 "for compute node %s: %s", cn.uuid, str(e)) 

11312 

11313 for nodename in nodenames: 

11314 self._update_available_resource_for_node(context, nodename, 

11315 startup=startup) 

11316 
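# NOTE: illustrative sketch only, not part of nova/compute/manager.py. The
# orphan handling in update_available_resource above reduces to a set
# comparison between the node names reported by the virt driver and the
# compute node records in the database. A standalone sketch of just that
# selection step (find_orphan_nodes is an invented name):
def find_orphan_nodes(driver_nodenames, db_nodenames):
    """Return DB node names that the virt driver no longer reports."""
    return sorted(set(db_nodenames) - set(driver_nodenames))


# find_orphan_nodes({'node-1'}, {'node-1', 'node-2'}) -> ['node-2'], and
# 'node-2' would then be considered for deletion as described above.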

11317 def _get_compute_nodes_in_db(self, context, nodenames, use_slave=False, 

11318 startup=False): 

11319 try: 

11320 return objects.ComputeNodeList.get_all_by_host(context, self.host, 

11321 use_slave=use_slave) 

11322 except exception.NotFound: 

11323 # If the driver is not reporting any nodenames we should not 

11324 # expect there to be compute nodes so we just return in that case. 

11325 # For example, this could be an ironic compute and it is not 

11326 # managing any nodes yet. 

11327 if nodenames: 

11328 if startup: 11328 ↛ 11334  [line 11328 didn't jump to line 11334 because the condition on line 11328 was always true]

11329 LOG.warning( 

11330 "No compute node record found for host %s. If this is " 

11331 "the first time this service is starting on this " 

11332 "host, then you can ignore this warning.", self.host) 

11333 else: 

11334 LOG.error("No compute node record for host %s", self.host) 

11335 return [] 

11336 

11337 @periodic_task.periodic_task( 

11338 spacing=CONF.running_deleted_instance_poll_interval, 

11339 run_immediately=True) 

11340 def _cleanup_running_deleted_instances(self, context): 

11341 """Cleanup any instances which are erroneously still running after 

11342 having been deleted. 

11343 

11344 Valid actions to take are: 

11345 

11346 1. noop - do nothing 

11347 2. log - log which instances are erroneously running 

11348 3. reap - shutdown and cleanup any erroneously running instances 

11349 4. shutdown - power off *and disable* any erroneously running 

11350 instances 

11351 

11352 The use-case for this cleanup task is: for various reasons, it may be 

11353 possible for the database to show an instance as deleted but for that 

11354 instance to still be running on a host machine (see bug 

11355 https://bugs.launchpad.net/nova/+bug/911366). 

11356 

11357 This cleanup task is a cross-hypervisor utility for finding these 

11358 zombied instances and either logging the discrepancy (likely what you 

11359 should do in production), or automatically reaping the instances (more 

11360 appropriate for dev environments). 

11361 """ 

11362 action = CONF.running_deleted_instance_action 

11363 

11364 if action == "noop": 11364 ↛ 11365  [line 11364 didn't jump to line 11365 because the condition on line 11364 was never true]

11365 return 

11366 

11367 # NOTE(sirp): admin contexts don't ordinarily return deleted records 

11368 with utils.temporary_mutation(context, read_deleted="yes"): 

11369 

11370 try: 

11371 instances = self._running_deleted_instances(context) 

11372 except exception.VirtDriverNotReady: 

11373 # Since this task runs immediately on startup, if the 

11374 # hypervisor is not yet ready handle it gracefully. 

11375 LOG.debug('Unable to check for running deleted instances ' 

11376 'at this time since the hypervisor is not ready.') 

11377 return 

11378 

11379 for instance in instances: 

11380 if action == "log": 11380 ↛ 11381  [line 11380 didn't jump to line 11381 because the condition on line 11380 was never true]

11381 LOG.warning("Detected instance with name label " 

11382 "'%s' which is marked as " 

11383 "DELETED but still present on host.", 

11384 instance.name, instance=instance) 

11385 

11386 elif action == 'shutdown': 

11387 LOG.info("Powering off instance with name label " 

11388 "'%s' which is marked as " 

11389 "DELETED but still present on host.", 

11390 instance.name, instance=instance) 

11391 try: 

11392 self.driver.power_off(context, instance) 

11393 except Exception: 

11394 LOG.warning("Failed to power off instance", 

11395 instance=instance, exc_info=True) 

11396 

11397 elif action == 'reap': 11397 ↛ 11415  [line 11397 didn't jump to line 11415 because the condition on line 11397 was always true]

11398 LOG.info("Destroying instance with name label " 

11399 "'%s' which is marked as " 

11400 "DELETED but still present on host.", 

11401 instance.name, instance=instance) 

11402 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( 

11403 context, instance.uuid, use_slave=True) 

11404 self.instance_events.clear_events_for_instance(instance) 

11405 try: 

11406 self._shutdown_instance(context, instance, bdms, 

11407 notify=False) 

11408 self._cleanup_volumes(context, instance, bdms, 

11409 detach=False) 

11410 except Exception as e: 

11411 LOG.warning("Periodic cleanup failed to delete " 

11412 "instance: %s", 

11413 e, instance=instance) 

11414 else: 

11415 raise Exception(_("Unrecognized value '%s'" 

11416 " for CONF.running_deleted_" 

11417 "instance_action") % action) 

11418 
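# NOTE: illustrative sketch only, not part of nova/compute/manager.py. The
# cleanup task above dispatches on CONF.running_deleted_instance_action and
# rejects anything outside noop/log/shutdown/reap. A tiny standalone helper
# showing just that validation step (validate_cleanup_action is an invented
# name):
def validate_cleanup_action(action):
    """Return the action if it is one of the recognised values."""
    if action not in ('noop', 'log', 'shutdown', 'reap'):
        raise ValueError(
            "Unrecognized value %r for running_deleted_instance_action"
            % action)
    return action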

11419 def _running_deleted_instances(self, context): 

11420 """Returns a list of instances nova thinks is deleted, 

11421 but the hypervisor thinks is still running. 

11422 """ 

11423 timeout = CONF.running_deleted_instance_timeout 

11424 filters = {'deleted': True, 

11425 'soft_deleted': False} 

11426 instances = self._get_instances_on_driver(context, filters) 

11427 return [i for i in instances if self._deleted_old_enough(i, timeout)] 

11428 

11429 def _deleted_old_enough(self, instance, timeout): 

11430 deleted_at = instance.deleted_at 

11431 if deleted_at: 11431 ↛ 11433  [line 11431 didn't jump to line 11433 because the condition on line 11431 was always true]

11432 deleted_at = deleted_at.replace(tzinfo=None) 

11433 return (not deleted_at or timeutils.is_older_than(deleted_at, timeout)) 

11434 

11435 @contextlib.contextmanager 

11436 def _error_out_instance_on_exception(self, context, instance, 

11437 instance_state=vm_states.ACTIVE): 

11438 """Context manager to set instance.vm_state after some operation raises 

11439 

11440 Used to handle NotImplementedError and InstanceFaultRollback errors 

11441 and reset the instance vm_state and task_state. The vm_state is set 

11442 to the $instance_state parameter and task_state is set to None. 

11443 For all other types of exceptions, the vm_state is set to ERROR and 

11444 the task_state is left unchanged (although most callers will have the 

11445 @reverts_task_state decorator which will set the task_state to None). 

11446 

11447 Re-raises the original exception *except* in the case of 

11448 InstanceFaultRollback in which case the wrapped `inner_exception` is 

11449 re-raised. 

11450 

11451 :param context: The nova auth request context for the operation. 

11452 :param instance: The instance to update. The vm_state will be set by 

11453 this context manager when an exception is raised. 

11454 :param instance_state: For NotImplementedError and 

11455 InstanceFaultRollback this is the vm_state to set the instance to 

11456 when handling one of those types of exceptions. By default the 

11457 instance will be set to ACTIVE, but the caller should control this 

11458 in case there have been no changes to the running state of the 

11459 instance. For example, resizing a stopped server where prep_resize 

11460 fails early and does not change the power state of the guest should 

11461 not set the instance status to ACTIVE but remain STOPPED. 

11462 This parameter is ignored for all other types of exceptions and the 

11463 instance vm_state is set to ERROR. 

11464 """ 

11465 # NOTE(mriedem): Why doesn't this method just save off the 

11466 # original instance.vm_state here rather than use a parameter? Or use 

11467 # instance_state=None as an override but default to the current 

11468 # vm_state when rolling back. 

11469 instance_uuid = instance.uuid 

11470 try: 

11471 yield 

11472 except (NotImplementedError, exception.InstanceFaultRollback) as error: 

11473 # Use reraise=False to determine if we want to raise the original 

11474 # exception or something else. 

11475 with excutils.save_and_reraise_exception(reraise=False) as ctxt: 

11476 LOG.info("Setting instance back to %(state)s after: %(error)s", 

11477 {'state': instance_state, 'error': error}, 

11478 instance_uuid=instance_uuid) 

11479 self._instance_update(context, instance, 

11480 vm_state=instance_state, 

11481 task_state=None) 

11482 if isinstance(error, exception.InstanceFaultRollback): 

11483 # Raise the wrapped exception. 

11484 raise error.inner_exception 

11485 # Else re-raise the NotImplementedError. 

11486 ctxt.reraise = True 

11487 except Exception: 

11488 LOG.exception('Setting instance vm_state to ERROR', 

11489 instance_uuid=instance_uuid) 

11490 with excutils.save_and_reraise_exception(): 

11491 # NOTE(mriedem): Why don't we pass clean_task_state=True here? 

11492 self._set_instance_obj_error_state(instance) 

11493 
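# NOTE: illustrative sketch only, not part of nova/compute/manager.py.
# _error_out_instance_on_exception above uses oslo's
# save_and_reraise_exception(reraise=False) to decide, inside the handler,
# whether to re-raise the original exception or a wrapped inner one. A minimal
# standalone example of that idiom; RollbackError, error_guard and cleanup are
# invented names used only for illustration.
import contextlib

from oslo_utils import excutils


class RollbackError(Exception):
    """Hypothetical wrapper carrying the exception to surface instead."""

    def __init__(self, inner_exception):
        super().__init__(str(inner_exception))
        self.inner_exception = inner_exception


@contextlib.contextmanager
def error_guard(cleanup):
    """Run cleanup() on failure, then re-raise the appropriate exception."""
    try:
        yield
    except (NotImplementedError, RollbackError) as error:
        # reraise=False lets this handler decide what propagates.
        with excutils.save_and_reraise_exception(reraise=False) as ctxt:
            cleanup()
            if isinstance(error, RollbackError):
                # Surface the wrapped exception instead of the wrapper.
                raise error.inner_exception
            # Otherwise propagate the original NotImplementedError.
            ctxt.reraise = True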

11494 def _process_instance_event(self, instance, event): 

11495 _event = self.instance_events.pop_instance_event(instance, event) 

11496 if _event: 

11497 LOG.debug('Processing event %(event)s', 

11498 {'event': event.key}, instance=instance) 

11499 _event.send(event) 

11500 else: 

11501 # If it's a network-vif-unplugged event and the instance is being 

11502 # deleted or live migrated then we don't need to make this a 

11503 # warning as it's expected. There are other expected things which 

11504 # could trigger this event like detaching an interface, but we 

11505 # don't have a task state for that. 

11506 # TODO(mriedem): We have other move operations and things like 

11507 # hard reboot (probably rebuild as well) which trigger this event 

11508 # but nothing listens for network-vif-unplugged. We should either 

11509 # handle those other known cases or consider just not logging a 

11510 # warning if we get this event and the instance is undergoing some 

11511 # task state transition. 

11512 if (event.name == 'network-vif-unplugged' and 

11513 instance.task_state in ( 

11514 task_states.DELETING, task_states.MIGRATING)): 

11515 LOG.debug('Received event %s for instance with task_state %s.', 

11516 event.key, instance.task_state, instance=instance) 

11517 else: 

11518 LOG.warning('Received unexpected event %(event)s for ' 

11519 'instance with vm_state %(vm_state)s and ' 

11520 'task_state %(task_state)s.', 

11521 {'event': event.key, 

11522 'vm_state': instance.vm_state, 

11523 'task_state': instance.task_state}, 

11524 instance=instance) 

11525 

11526 def _process_instance_vif_deleted_event(self, context, instance, 

11527 deleted_vif_id): 

11528 # If an attached port is deleted by neutron, it needs to

11529 # be detached from the instance,

11530 # and the info cache needs to be updated.

11531 network_info = instance.info_cache.network_info 

11532 for index, vif in enumerate(network_info): 11532 ↛ exit  [line 11532 didn't return from function '_process_instance_vif_deleted_event' because the loop on line 11532 didn't complete]

11533 if vif['id'] == deleted_vif_id: 

11534 LOG.info('Neutron deleted interface %(intf)s; ' 

11535 'detaching it from the instance and ' 

11536 'deleting it from the info cache', 

11537 {'intf': vif['id']}, 

11538 instance=instance) 

11539 profile = vif.get('profile', {}) or {} # profile can be None 

11540 rps = profile.get('allocation') 

11541 if rps: 11541 ↛ 11542  [line 11541 didn't jump to line 11542 because the condition on line 11541 was never true]

11542 if isinstance(rps, dict): 

11543 # if extended resource request extension is enabled 

11544 # then we have a dict of providers, flatten it for the 

11545 # log. 

11546 rps = ','.join(rps.values()) 

11547 LOG.error( 

11548 'The bound port %(port_id)s is deleted in Neutron but ' 

11549 'the resource allocation on the resource providers ' 

11550 '%(rp_uuid)s are leaked until the server ' 

11551 '%(server_uuid)s is deleted.', 

11552 {'port_id': vif['id'], 

11553 'rp_uuid': rps, 

11554 'server_uuid': instance.uuid}) 

11555 

11556 del network_info[index] 

11557 neutron.update_instance_cache_with_nw_info( 

11558 self.network_api, context, instance, nw_info=network_info) 

11559 try: 

11560 self.driver.detach_interface(context, instance, vif) 

11561 except NotImplementedError: 

11562 # Not all virt drivers support attach/detach of interfaces 

11563 # yet (like Ironic), so just ignore this. 

11564 pass 

11565 except exception.NovaException as ex: 

11566 # If the instance was deleted before the interface was 

11567 # detached, just log it at debug. 

11568 log_level = (logging.DEBUG 

11569 if isinstance(ex, exception.InstanceNotFound) 

11570 else logging.WARNING) 

11571 LOG.log(log_level, 

11572 "Detach interface failed, " 

11573 "port_id=%(port_id)s, reason: %(msg)s", 

11574 {'port_id': deleted_vif_id, 'msg': ex}, 

11575 instance=instance) 

11576 break 

11577 

11578 @wrap_instance_event(prefix='compute') 

11579 @wrap_instance_fault 

11580 def extend_volume(self, context, instance, extended_volume_id): 

11581 

11582 # If an attached volume is extended by cinder, it needs to

11583 # be extended by the virt driver so the host can detect its new size,

11584 # and the BDM needs to be updated.

11585 LOG.debug('Handling volume-extended event for volume %(vol)s', 

11586 {'vol': extended_volume_id}, instance=instance) 

11587 

11588 try: 

11589 bdm = objects.BlockDeviceMapping.get_by_volume_and_instance( 

11590 context, extended_volume_id, instance.uuid) 

11591 except exception.NotFound: 

11592 LOG.warning('Extend volume failed, ' 

11593 'volume %(vol)s is not attached to instance.', 

11594 {'vol': extended_volume_id}, 

11595 instance=instance) 

11596 return 

11597 

11598 LOG.info('Cinder extended volume %(vol)s; ' 

11599 'extending it to detect new size', 

11600 {'vol': extended_volume_id}, 

11601 instance=instance) 

11602 volume = self.volume_api.get(context, bdm.volume_id) 

11603 

11604 if bdm.connection_info is None: 11604 ↛ 11605  [line 11604 didn't jump to line 11605 because the condition on line 11604 was never true]

11605 LOG.warning('Extend volume failed, ' 

11606 'attached volume %(vol)s has no connection_info', 

11607 {'vol': extended_volume_id}, 

11608 instance=instance) 

11609 return 

11610 

11611 connection_info = jsonutils.loads(bdm.connection_info) 

11612 bdm.volume_size = volume['size'] 

11613 bdm.save() 

11614 

11615 if not self.driver.capabilities.get('supports_extend_volume', False): 

11616 raise exception.ExtendVolumeNotSupported() 

11617 

11618 try: 

11619 self.driver.extend_volume(context, connection_info, instance, 

11620 bdm.volume_size * units.Gi) 

11621 except Exception as ex: 

11622 LOG.warning('Extend volume failed, ' 

11623 'volume_id=%(volume_id)s, reason: %(msg)s', 

11624 {'volume_id': extended_volume_id, 'msg': ex}, 

11625 instance=instance) 

11626 raise 

11627 

11628 @staticmethod 

11629 def _is_state_valid_for_power_update_event(instance, target_power_state): 

11630 """Check if the current state of the instance allows it to be 

11631 a candidate for the power-update event. 

11632 

11633 :param instance: The nova instance object. 

11634 :param target_power_state: The desired target power state; this should 

11635 either be "POWER_ON" or "POWER_OFF". 

11636 :returns Boolean: True if the instance can be subjected to the 

11637 power-update event. 

11638 """ 

11639 if ((target_power_state == external_event_obj.POWER_ON and 

11640 instance.task_state is None and 

11641 instance.vm_state == vm_states.STOPPED and 

11642 instance.power_state == power_state.SHUTDOWN) or 

11643 (target_power_state == external_event_obj.POWER_OFF and 

11644 instance.task_state is None and 

11645 instance.vm_state == vm_states.ACTIVE and 

11646 instance.power_state == power_state.RUNNING)): 

11647 return True 

11648 return False 

11649 
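# NOTE: illustrative sketch only, not part of nova/compute/manager.py. The
# predicate above accepts exactly two combinations of target power state,
# vm_state and power_state (with no pending task). A hypothetical, simplified
# restatement of that rule as data, using plain strings instead of nova's
# constants:
_VALID_POWER_UPDATE_STATES = {
    # target power state -> required (vm_state, vm_power_state)
    'POWER_ON': ('stopped', 'shutdown'),
    'POWER_OFF': ('active', 'running'),
}


def is_valid_power_update(target, vm_state, vm_power_state, task_state=None):
    """Simplified mirror of the predicate above (illustration only)."""
    expected = _VALID_POWER_UPDATE_STATES.get(target)
    return task_state is None and expected == (vm_state, vm_power_state)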

11650 @wrap_exception() 

11651 @reverts_task_state 

11652 @wrap_instance_event(prefix='compute') 

11653 @wrap_instance_fault 

11654 def power_update(self, context, instance, target_power_state): 

11655 """Power update of an instance prompted by an external event. 

11656 :param context: The API request context. 

11657 :param instance: The nova instance object. 

11658 :param target_power_state: The desired target power state; 

11659 this should either be "POWER_ON" or 

11660 "POWER_OFF". 

11661 """ 

11662 

11663 @utils.synchronized(instance.uuid) 

11664 def do_power_update(): 

11665 LOG.debug('Handling power-update event with target_power_state %s ' 

11666 'for instance', target_power_state, instance=instance) 

11667 if not self._is_state_valid_for_power_update_event( 

11668 instance, target_power_state): 

11669 pow_state = fields.InstancePowerState.from_index( 

11670 instance.power_state) 

11671 LOG.info('The power-update %(tag)s event for instance ' 

11672 '%(uuid)s is a no-op since the instance is in ' 

11673 'vm_state %(vm_state)s, task_state ' 

11674 '%(task_state)s and power_state ' 

11675 '%(power_state)s.', 

11676 {'tag': target_power_state, 'uuid': instance.uuid, 

11677 'vm_state': instance.vm_state, 

11678 'task_state': instance.task_state, 

11679 'power_state': pow_state}) 

11680 return 

11681 LOG.debug("Trying to %s instance", 

11682 target_power_state, instance=instance) 

11683 if target_power_state == external_event_obj.POWER_ON: 

11684 action = fields.NotificationAction.POWER_ON 

11685 notification_name = "power_on." 

11686 instance.task_state = task_states.POWERING_ON 

11687 else: 

11688 # It's POWER_OFF 

11689 action = fields.NotificationAction.POWER_OFF 

11690 notification_name = "power_off." 

11691 instance.task_state = task_states.POWERING_OFF 

11692 instance.progress = 0 

11693 

11694 try: 

11695 # Note that the task_state is set here rather than the API 

11696 # because this is a best effort operation and deferring 

11697 # updating the task_state until we get to the compute service 

11698 # avoids error handling in the API and needing to account for 

11699 # older compute services during rolling upgrades from Stein. 

11700 # If we lose a race, UnexpectedTaskStateError is handled 

11701 # below. 

11702 instance.save(expected_task_state=[None]) 

11703 self._notify_about_instance_usage(context, instance, 

11704 notification_name + "start") 

11705 compute_utils.notify_about_instance_action(context, instance, 

11706 self.host, action=action, 

11707 phase=fields.NotificationPhase.START) 

11708 # UnexpectedTaskStateError raised from the driver will be 

11709 # handled below and not result in a fault, error notification 

11710 # or failure of the instance action. Other driver errors like 

11711 # NotImplementedError will record a fault, send an error 

11712 # notification and mark the instance action as failed. 

11713 self.driver.power_update_event(instance, target_power_state) 

11714 self._notify_about_instance_usage(context, instance, 

11715 notification_name + "end") 

11716 compute_utils.notify_about_instance_action(context, instance, 

11717 self.host, action=action, 

11718 phase=fields.NotificationPhase.END) 

11719 except exception.UnexpectedTaskStateError as e: 

11720 # Handling the power-update event is best effort and if we lost 

11721 # a race with some other action happening to the instance we 

11722 # just log it and return rather than fail the action. 

11723 LOG.info("The power-update event was possibly preempted: %s ", 

11724 e.format_message(), instance=instance) 

11725 return 

11726 do_power_update() 

11727 

11728 @wrap_exception() 

11729 def external_instance_event(self, context, instances, events): 

11730 # NOTE(danms): Some event types are handled by the manager, such 

11731 # as when we're asked to update the instance's info_cache. If it's 

11732 # not one of those, look for some thread(s) waiting for the event and 

11733 # unblock them if so. 

11734 for event in events: 

11735 instance = [inst for inst in instances 

11736 if inst.uuid == event.instance_uuid][0] 

11737 LOG.debug('Received event %(event)s', 

11738 {'event': event.key}, 

11739 instance=instance) 

11740 if event.name == 'network-changed': 

11741 try: 

11742 LOG.debug('Refreshing instance network info cache due to ' 

11743 'event %s.', event.key, instance=instance) 

11744 self.network_api.get_instance_nw_info( 

11745 context, instance, refresh_vif_id=event.tag) 

11746 except exception.NotFound as e: 

11747 LOG.info('Failed to process external instance event ' 

11748 '%(event)s due to: %(error)s', 

11749 {'event': event.key, 'error': str(e)}, 

11750 instance=instance) 

11751 elif event.name == 'network-vif-deleted': 

11752 try: 

11753 self._process_instance_vif_deleted_event(context, 

11754 instance, 

11755 event.tag) 

11756 except exception.NotFound as e: 

11757 LOG.info('Failed to process external instance event ' 

11758 '%(event)s due to: %(error)s', 

11759 {'event': event.key, 'error': str(e)}, 

11760 instance=instance) 

11761 elif event.name == 'volume-extended': 

11762 self.extend_volume(context, instance, event.tag) 

11763 elif event.name == 'power-update': 

11764 self.power_update(context, instance, event.tag) 

11765 else: 

11766 self._process_instance_event(instance, event) 

11767 
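
# --- Illustrative sketch (not part of manager.py) ---------------------------
# External services (Neutron, Cinder, Ironic, ...) deliver the events handled
# above through Nova's os-server-external-events API; nova-api then routes
# them to this method over RPC. A hedged example of emitting a power-update
# event with the requests library follows; the endpoint, token and server
# UUID are placeholders, and power-update is assumed to require a recent API
# microversion (2.76 is used here).
import requests

def send_power_update(nova_endpoint, token, server_uuid, target):
    # target should be 'POWER_ON' or 'POWER_OFF', matching event.tag above.
    body = {'events': [{'name': 'power-update',
                        'server_uuid': server_uuid,
                        'tag': target}]}
    resp = requests.post(
        nova_endpoint + '/os-server-external-events',
        json=body,
        headers={'X-Auth-Token': token,
                 'X-OpenStack-Nova-API-Version': '2.76'})
    resp.raise_for_status()
    return resp.json()
# -----------------------------------------------------------------------------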

11768 @periodic_task.periodic_task(spacing=CONF.image_cache.manager_interval, 

11769 external_process_ok=True) 

11770 def _run_image_cache_manager_pass(self, context): 

11771 """Run a single pass of the image cache manager.""" 

11772 

11773 if not self.driver.capabilities.get("has_imagecache", False): 11773 ↛ 11774 (line 11773 didn't jump to line 11774 because the condition on line 11773 was never true)

11774 return 

11775 

11776 # Determine what other nodes use this storage 

11777 storage_users.register_storage_use(CONF.instances_path, CONF.host) 

11778 nodes = storage_users.get_storage_users(CONF.instances_path) 

11779 

11780 # Filter all_instances to only include those nodes which share this 

11781 # storage path. 

11782 # TODO(mikal): this should be further refactored so that the cache 

11783 # cleanup code doesn't know what those instances are, just a remote 

11784 # count, and then this logic should be pushed up the stack. 

11785 filters = {'deleted': False, 

11786 'soft_deleted': True, 

11787 'host': nodes} 

11788 filtered_instances = objects.InstanceList.get_by_filters(context, 

11789 filters, expected_attrs=[], use_slave=True) 

11790 

11791 self.driver.manage_image_cache(context, filtered_instances) 

11792 

11793 def cache_images(self, context, image_ids): 

11794 """Ask the virt driver to pre-cache a set of base images. 

11795 

11796 :param context: The RequestContext 

11797 :param image_ids: The image IDs to be cached 

11798 :return: A dict, keyed by image-id where the values are one of: 

11799 'cached' if the image was downloaded, 

11800 'existing' if the image was already in the cache, 

11801 'unsupported' if the virt driver does not support caching, 

11802 'error' if the virt driver raised an exception. 

11803 """ 

11804 

11805 results = {} 

11806 

11807 LOG.info('Caching %i image(s) by request', len(image_ids)) 

11808 for image_id in image_ids: 

11809 try: 

11810 cached = self.driver.cache_image(context, image_id) 

11811 if cached: 

11812 results[image_id] = 'cached' 

11813 else: 

11814 results[image_id] = 'existing' 

11815 except NotImplementedError: 

11816 LOG.warning('Virt driver does not support image pre-caching;' 

11817 ' ignoring request') 

11818 # NOTE(danms): Yes, technically we could short-circuit here to 

11819 # avoid trying the rest of the images, but it's very cheap to 

11820 # just keep hitting the NotImplementedError to keep the logic 

11821 # clean. 

11822 results[image_id] = 'unsupported' 

11823 except Exception as e: 

11824 results[image_id] = 'error' 

11825 LOG.error('Failed to cache image %(image_id)s: %(err)s', 

11826 {'image_id': image_id, 

11827 'err': e}) 

11828 

11829 return results 

11830 
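
# --- Illustrative sketch (not part of manager.py) ---------------------------
# The dict returned by cache_images() maps each image ID to one of 'cached',
# 'existing', 'unsupported' or 'error'. A small helper a caller or test might
# use to tally those outcomes (purely hypothetical):
from collections import Counter

def summarize_cache_results(results):
    """Count how many images ended up in each caching state."""
    return Counter(results.values())

# summarize_cache_results({'img-1': 'cached', 'img-2': 'existing'})
# -> Counter({'cached': 1, 'existing': 1})
# -----------------------------------------------------------------------------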

11831 @periodic_task.periodic_task(spacing=CONF.instance_delete_interval) 

11832 def _run_pending_deletes(self, context): 

11833 """Retry any pending instance file deletes.""" 

11834 LOG.debug('Cleaning up deleted instances') 

11835 filters = {'deleted': True, 

11836 'soft_deleted': False, 

11837 'host': CONF.host, 

11838 'cleaned': False} 

11839 attrs = ['system_metadata'] 

11840 with utils.temporary_mutation(context, read_deleted='yes'): 

11841 instances = objects.InstanceList.get_by_filters( 

11842 context, filters, expected_attrs=attrs, use_slave=True) 

11843 LOG.debug('There are %d instances to clean', len(instances)) 

11844 

11845 for instance in instances: 

11846 attempts = int(instance.system_metadata.get('clean_attempts', '0')) 

11847 LOG.debug('Instance has had %(attempts)s of %(max)s ' 

11848 'cleanup attempts', 

11849 {'attempts': attempts, 

11850 'max': CONF.maximum_instance_delete_attempts}, 

11851 instance=instance) 

11852 if attempts < CONF.maximum_instance_delete_attempts: 

11853 success = self.driver.delete_instance_files(instance) 

11854 

11855 instance.system_metadata['clean_attempts'] = str(attempts + 1) 

11856 if success: 

11857 instance.cleaned = True 

11858 with utils.temporary_mutation(context, read_deleted='yes'): 

11859 instance.save() 

11860 
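
# --- Illustrative sketch (not part of manager.py) ---------------------------
# The pending-delete retry above is bounded by a counter kept in the
# instance's system_metadata. The same bookkeeping in isolation, with a plain
# dict standing in for system_metadata and MAX_ATTEMPTS standing in for
# CONF.maximum_instance_delete_attempts (both hypothetical):
MAX_ATTEMPTS = 5

def try_cleanup(system_metadata, delete_files):
    attempts = int(system_metadata.get('clean_attempts', '0'))
    if attempts >= MAX_ATTEMPTS:
        return False  # give up; the instance stays un-cleaned
    system_metadata['clean_attempts'] = str(attempts + 1)
    return bool(delete_files())  # True marks the instance as cleaned
# -----------------------------------------------------------------------------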

11861 @periodic_task.periodic_task(spacing=CONF.instance_delete_interval) 

11862 def _cleanup_incomplete_migrations(self, context): 

11863 """Cleanup on failed resize/revert-resize operation and 

11864 failed rollback live migration operation. 

11865 

11866 During resize/revert-resize operation, or after a failed rollback 

11867 live migration operation, if that instance gets deleted then instance 

11868 files might remain either on source or destination compute node and 

11869 other specific resources might not be cleaned up because of the race 

11870 condition. 

11871 """ 

11872 LOG.debug('Cleaning up deleted instances with incomplete migration ') 

11873 migration_filters = {'host': CONF.host, 

11874 'status': 'error'} 

11875 migrations = objects.MigrationList.get_by_filters(context, 

11876 migration_filters) 

11877 

11878 if not migrations: 11878 ↛ 11879 (line 11878 didn't jump to line 11879 because the condition on line 11878 was never true)

11879 return 

11880 

11881 inst_uuid_from_migrations = set([migration.instance_uuid for migration 

11882 in migrations]) 

11883 

11884 inst_filters = {'deleted': True, 'soft_deleted': False, 

11885 'uuid': inst_uuid_from_migrations} 

11886 attrs = ['info_cache', 'security_groups', 'system_metadata'] 

11887 with utils.temporary_mutation(context, read_deleted='yes'): 

11888 instances = objects.InstanceList.get_by_filters( 

11889 context, inst_filters, expected_attrs=attrs, use_slave=True) 

11890 

11891 for instance in instances: 

11892 if instance.host == CONF.host: 11892 ↛ 11893 (line 11892 didn't jump to line 11893 because the condition on line 11892 was never true)

11893 continue 

11894 for migration in migrations: 11894 ↛ 11891 (line 11894 didn't jump to line 11891 because the loop on line 11894 didn't complete)

11895 if instance.uuid != migration.instance_uuid: 

11896 continue 

11897 self.driver.delete_instance_files(instance) 

11898 # We are not sure whether the migration_context was applied 

11899 # during the incomplete migration, so we need to apply/revert the 

11900 # migration_context to get instance object content matching the 

11901 # current host. 

11902 revert = (True if migration.source_compute == CONF.host 

11903 else False) 

11904 with instance.mutated_migration_context(revert=revert): 

11905 self.driver.cleanup_lingering_instance_resources(instance) 

11906 

11907 try: 

11908 migration.status = 'failed' 

11909 migration.save() 

11910 except exception.MigrationNotFound: 

11911 LOG.warning("Migration %s is not found.", 

11912 migration.id, 

11913 instance=instance) 

11914 break 

11915 

11916 @messaging.expected_exceptions(exception.InstanceQuiesceNotSupported, 

11917 exception.QemuGuestAgentNotEnabled, 

11918 exception.NovaException, 

11919 NotImplementedError) 

11920 @wrap_exception() 

11921 def quiesce_instance(self, context, instance): 

11922 """Quiesce an instance on this host.""" 

11923 context = context.elevated() 

11924 image_meta = objects.ImageMeta.from_instance(instance) 

11925 self.driver.quiesce(context, instance, image_meta) 

11926 

11927 def _wait_for_snapshots_completion(self, context, mapping): 

11928 for mapping_dict in mapping: 

11929 if mapping_dict.get('source_type') == 'snapshot': 11929 ↛ 11928 (line 11929 didn't jump to line 11928 because the condition on line 11929 was always true)

11930 

11931 def _wait_snapshot(): 

11932 snapshot = self.volume_api.get_snapshot( 

11933 context, mapping_dict['snapshot_id']) 

11934 if snapshot.get('status') != 'creating': 

11935 raise loopingcall.LoopingCallDone() 

11936 

11937 timer = loopingcall.FixedIntervalLoopingCall(_wait_snapshot) 

11938 timer.start(interval=0.5).wait() 

11939 
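
# --- Illustrative sketch (not part of manager.py) ---------------------------
# The snapshot wait above uses oslo.service's FixedIntervalLoopingCall: the
# callback is invoked every `interval` seconds until it raises
# LoopingCallDone. The same pattern in isolation; check_ready is a
# hypothetical stand-in for the Cinder snapshot status check.
from oslo_service import loopingcall

def wait_until_ready(check_ready, interval=0.5):
    def _poll():
        if check_ready():
            raise loopingcall.LoopingCallDone()
    timer = loopingcall.FixedIntervalLoopingCall(_poll)
    timer.start(interval=interval).wait()
# -----------------------------------------------------------------------------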

11940 @messaging.expected_exceptions(exception.InstanceQuiesceNotSupported, 

11941 exception.QemuGuestAgentNotEnabled, 

11942 exception.NovaException, 

11943 NotImplementedError) 

11944 @wrap_exception() 

11945 def unquiesce_instance(self, context, instance, mapping=None): 

11946 """Unquiesce an instance on this host. 

11947 

11948 If snapshots' image mapping is provided, it waits until snapshots are 

11949 completed before unquiescing. 

11950 """ 

11951 context = context.elevated() 

11952 if mapping: 11952 ↛ 11959 (line 11952 didn't jump to line 11959 because the condition on line 11952 was always true)

11953 try: 

11954 self._wait_for_snapshots_completion(context, mapping) 

11955 except Exception as error: 

11956 LOG.exception("Exception while waiting for completion of " 

11957 "volume snapshots: %s", 

11958 error, instance=instance) 

11959 image_meta = objects.ImageMeta.from_instance(instance) 

11960 self.driver.unquiesce(context, instance, image_meta) 

11961 

11962 @periodic_task.periodic_task(spacing=CONF.instance_delete_interval) 

11963 def _cleanup_expired_console_auth_tokens(self, context): 

11964 """Remove all expired console auth tokens. 

11965 

11966 Console authorization tokens and their connection data are stored 

11967 in the database when a user asks for a console connection to an 

11968 instance. After a time they expire. We periodically remove any expired 

11969 tokens from the database. 

11970 """ 

11971 objects.ConsoleAuthToken.clean_expired_console_auths(context) 

11972 

11973 def _claim_pci_for_instance_vifs(self, ctxt, instance): 

11974 """Claim PCI devices for the instance's VIFs on the compute node 

11975 

11976 :param ctxt: Context 

11977 :param instance: Instance object 

11978 :return: <port ID: PciDevice> mapping for the VIFs that yielded a 

11979 PCI claim on the compute node 

11980 """ 

11981 pci_req_id_to_port_id = {} 

11982 pci_reqs = [] 

11983 port_id_to_pci_dev = {} 

11984 

11985 for vif in instance.get_network_info(): 

11986 pci_req = pci_req_module.get_instance_pci_request_from_vif( 

11987 ctxt, 

11988 instance, 

11989 vif) 

11990 if pci_req: 

11991 pci_req_id_to_port_id[pci_req.request_id] = vif['id'] 

11992 pci_reqs.append(pci_req) 

11993 

11994 if pci_reqs: 

11995 # Create PCI requests and claim against PCI resource tracker 

11996 # NOTE(adrianc): We claim against the same requests as on the 

11997 # source node. 

11998 vif_pci_requests = objects.InstancePCIRequests( 

11999 requests=pci_reqs, 

12000 instance_uuid=instance.uuid) 

12001 

12002 # if we are called during the live migration with NUMA topology 

12003 # support the PCI claim needs to consider the destination NUMA 

12004 # topology that is then stored in the migration_context 

12005 dest_topo = None 

12006 if instance.migration_context: 12006 ↛ 12009 (line 12006 didn't jump to line 12009 because the condition on line 12006 was always true)

12007 dest_topo = instance.migration_context.new_numa_topology 

12008 

12009 claimed_pci_devices_objs = self.rt.claim_pci_devices( 

12010 ctxt, vif_pci_requests, dest_topo) 

12011 

12012 # Update VIFMigrateData profile with the newly claimed PCI 

12013 # device 

12014 for pci_dev in claimed_pci_devices_objs: 

12015 LOG.debug("PCI device: %s Claimed on destination node", 

12016 pci_dev.address) 

12017 port_id = pci_req_id_to_port_id[pci_dev.request_id] 

12018 port_id_to_pci_dev[port_id] = pci_dev 

12019 

12020 return port_id_to_pci_dev 

12021 

12022 def _update_migrate_vifs_profile_with_pci(self, 

12023 migrate_vifs, 

12024 port_id_to_pci_dev): 

12025 """Update migrate vifs profile with the claimed PCI devices 

12026 

12027 :param migrate_vifs: list of VIFMigrateData objects 

12028 :param port_id_to_pci_dev: a <port_id: PciDevice> mapping 

12029 :return: None. 

12030 """ 

12031 for mig_vif in migrate_vifs: 

12032 port_id = mig_vif.port_id 

12033 if port_id not in port_id_to_pci_dev: 

12034 continue 

12035 

12036 pci_dev = port_id_to_pci_dev[port_id] 

12037 profile = copy.deepcopy(mig_vif.source_vif['profile']) 

12038 profile['pci_slot'] = pci_dev.address 

12039 profile['pci_vendor_info'] = ':'.join([pci_dev.vendor_id, 

12040 pci_dev.product_id]) 

12041 if profile.get('card_serial_number'): 12041 ↛ 12044 (line 12041 didn't jump to line 12044 because the condition on line 12041 was never true)

12042 # Assume it is there since Nova makes sure that PCI devices 

12043 # tagged as remote-managed have a serial in PCI VPD. 

12044 profile['card_serial_number'] = pci_dev.card_serial_number 

12045 if profile.get('pf_mac_address'): 12045 ↛ 12046 (line 12045 didn't jump to line 12046 because the condition on line 12045 was never true)

12046 profile['pf_mac_address'] = pci_dev.sriov_cap['pf_mac_address'] 

12047 if profile.get('vf_num'): 12047 ↛ 12048 (line 12047 didn't jump to line 12048 because the condition on line 12047 was never true)

12048 profile['vf_num'] = pci_dev.sriov_cap['vf_num'] 

12049 

12050 if pci_dev.mac_address: 

12051 profile['device_mac_address'] = pci_dev.mac_address 

12052 

12053 mig_vif.profile = profile 

12054 LOG.debug("Updating migrate VIF profile for port %(port_id)s:" 

12055 "%(profile)s", {'port_id': port_id, 

12056 'profile': profile}) 

12057 
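
# --- Illustrative sketch (not part of manager.py) ---------------------------
# The essential part of the profile rewrite above, on plain dicts, with a
# namedtuple standing in for the claimed PciDevice (all values hypothetical):
from collections import namedtuple

FakePciDev = namedtuple('FakePciDev', 'address vendor_id product_id')

def updated_profile(profile, pci_dev):
    profile = dict(profile)
    profile['pci_slot'] = pci_dev.address
    profile['pci_vendor_info'] = ':'.join(
        [pci_dev.vendor_id, pci_dev.product_id])
    return profile

# updated_profile({}, FakePciDev('0000:81:00.2', '15b3', '1018'))
# -> {'pci_slot': '0000:81:00.2', 'pci_vendor_info': '15b3:1018'}
# -----------------------------------------------------------------------------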

12058 

12059# TODO(sbauza): Remove this proxy class in the X release once we drop the 5.x 

12060# support. 

12061# NOTE(sbauza): This proxy class will support the existing <=5.13 RPC calls 

12062# from any RPC client but will also make sure that the new 6.0 RPC calls will 

12063# be supported. 

12064class _ComputeV5Proxy(object): 

12065 

12066 target = messaging.Target(version='5.13') 

12067 

12068 def __init__(self, manager): 

12069 self.manager = manager 

12070 

12071 def __getattr__(self, name): 

12072 # NOTE(sbauza): Proxying all the other methods but the V5 ones. 

12073 return getattr(self.manager, name) 

12074 

12075 # 5.0 support for block_migration argument 

12076 def pre_live_migration(self, context, instance, block_migration, disk, 

12077 migrate_data): 

12078 return self.manager.pre_live_migration(context, instance, disk, 

12079 migrate_data) 

12080 

12081 # 5.1 support for legacy request_spec argument 

12082 def prep_resize(self, context, image, instance, instance_type, 

12083 request_spec, filter_properties, node, 

12084 clean_shutdown, migration, host_list): 

12085 if not isinstance(request_spec, objects.RequestSpec): 

12086 # Prior to compute RPC API 5.1, conductor would pass a legacy dict 

12087 # version of the request spec to compute, and since Stein, compute 

12088 # could be sending that back to conductor on a reschedule; if we 

12089 # got a dict, convert it to an object. 

12090 # TODO(mriedem): We can drop this compat code when we only support 

12091 # compute RPC API >=6.0. 

12092 request_spec = objects.RequestSpec.from_primitives( 

12093 context, request_spec, filter_properties) 

12094 # We don't have to set the new flavor on the request spec because 

12095 # if we got here it was due to a reschedule from the compute and 

12096 # the request spec would already have the new flavor in it from the 

12097 # else block below. 

12098 self.manager.prep_resize(context, image, instance, instance_type, 

12099 request_spec, filter_properties, node, 

12100 clean_shutdown, migration, host_list) 

12101 

12102 # 5.2 support for optional request_spec argument 

12103 def resize_instance(self, context, instance, image, 

12104 migration, instance_type, clean_shutdown, 

12105 request_spec=None): 

12106 self.manager.resize_instance(context, instance, image, 

12107 migration, instance_type, clean_shutdown, 

12108 request_spec) 

12109 

12110 # 5.2 support for optional request_spec argument 

12111 def finish_resize(self, context, disk_info, image, instance, 

12112 migration, request_spec=None): 

12113 self.manager.finish_resize(context, disk_info, image, instance, 

12114 migration, request_spec) 

12115 

12116 # 5.2 support for optional request_spec argument 

12117 def revert_resize(self, context, instance, migration, request_spec=None): 

12118 self.manager.revert_resize(context, instance, migration, request_spec) 

12119 

12120 # 5.2 support for optional request_spec argument 

12121 def finish_revert_resize( 

12122 self, context, instance, migration, request_spec=None): 

12123 self.manager.finish_revert_resize(context, instance, migration, 

12124 request_spec) 

12125 

12126 # 5.2 support for optional request_spec argument 

12127 # 5.13 support for optional accel_uuids argument 

12128 def unshelve_instance(self, context, instance, image, filter_properties, 

12129 node, request_spec=None, accel_uuids=None): 

12130 self.manager.unshelve_instance(context, instance, image, 

12131 filter_properties, node, request_spec, 

12132 accel_uuids or []) 

12133 

12134 # 5.3 support for optional migration and limits arguments 

12135 def check_can_live_migrate_destination(self, ctxt, instance, 

12136 block_migration, disk_over_commit, 

12137 migration=None, limits=None): 

12138 return self.manager.check_can_live_migrate_destination( 

12139 ctxt, instance, block_migration, disk_over_commit, 

12140 migration, limits) 

12141 

12142 # 5.11 support for optional accel_uuids argument 

12143 def build_and_run_instance(self, context, instance, image, request_spec, 

12144 filter_properties, admin_password=None, 

12145 injected_files=None, requested_networks=None, 

12146 security_groups=None, block_device_mapping=None, 

12147 node=None, limits=None, host_list=None, accel_uuids=None): 

12148 self.manager.build_and_run_instance( 

12149 context, instance, image, request_spec, 

12150 filter_properties, accel_uuids, admin_password, 

12151 injected_files, requested_networks, 

12152 security_groups, block_device_mapping, 

12153 node, limits, host_list) 

12154 

12155 # 5.12 support for optional accel_uuids argument 

12156 def rebuild_instance(self, context, instance, orig_image_ref, image_ref, 

12157 injected_files, new_pass, orig_sys_metadata, 

12158 bdms, recreate, on_shared_storage, 

12159 preserve_ephemeral, migration, 

12160 scheduled_node, limits, request_spec, 

12161 accel_uuids=None): 

12162 self.manager.rebuild_instance( 

12163 context, instance, orig_image_ref, image_ref, 

12164 injected_files, new_pass, orig_sys_metadata, 

12165 bdms, recreate, on_shared_storage, 

12166 preserve_ephemeral, migration, 

12167 scheduled_node, limits, request_spec, 

12168 accel_uuids, False, None) 

12169 

12170 # 5.13 support for optional accel_uuids argument 

12171 def shelve_instance(self, context, instance, image_id, 

12172 clean_shutdown, accel_uuids=None): 

12173 self.manager.shelve_instance(context, instance, image_id, 

12174 clean_shutdown, accel_uuids) 

12175 

12176 # 5.13 support for optional accel_uuids argument 

12177 def shelve_offload_instance(self, context, instance, clean_shutdown, 

12178 accel_uuids=None): 

12179 self.manager.shelve_offload_instance( 

12180 context, instance, clean_shutdown, accel_uuids) 

12181 

12182 # 6.0 drop unused request_spec argument 

12183 def prep_snapshot_based_resize_at_dest( 

12184 self, ctxt, instance, flavor, nodename, migration, limits, 

12185 request_spec): 

12186 return self.manager.prep_snapshot_based_resize_at_dest( 

12187 ctxt, instance, flavor, nodename, migration, limits) 

12188 

12189 # 6.0 drop unused request_spec argument 

12190 def finish_snapshot_based_resize_at_dest( 

12191 self, ctxt, instance, migration, snapshot_id, request_spec): 

12192 self.manager.finish_snapshot_based_resize_at_dest( 

12193 ctxt, instance, migration, snapshot_id) 

12194 

12195 # 6.0 drop unused instance argument 

12196 def check_instance_shared_storage(self, ctxt, instance, data): 

12197 return self.manager.check_instance_shared_storage(ctxt, data)
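
# --- Illustrative sketch (not part of manager.py) ---------------------------
# _ComputeV5Proxy above follows a simple compatibility-proxy pattern: only
# methods whose signatures changed between RPC 5.x and 6.0 are overridden,
# while __getattr__ delegates everything else to the wrapped manager. A
# generic restatement of that pattern (names are hypothetical):
class CompatProxy(object):

    def __init__(self, wrapped):
        self._wrapped = wrapped

    def __getattr__(self, name):
        # Anything not overridden here falls through to the real object.
        return getattr(self._wrapped, name)

    def do_thing(self, ctxt, thing, legacy_arg=None):
        # Adapt an old signature by dropping the argument the new
        # implementation no longer accepts.
        return self._wrapped.do_thing(ctxt, thing)
# -----------------------------------------------------------------------------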