Coverage for nova/conductor/manager.py: 89%

925 statements  

coverage.py v7.6.12, created at 2025-04-24 11:16 +0000

1# Copyright 2013 IBM Corp. 

2# 

3# Licensed under the Apache License, Version 2.0 (the "License"); you may 

4# not use this file except in compliance with the License. You may obtain 

5# a copy of the License at 

6# 

7# http://www.apache.org/licenses/LICENSE-2.0 

8# 

9# Unless required by applicable law or agreed to in writing, software 

10# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 

11# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 

12# License for the specific language governing permissions and limitations 

13# under the License. 

14 

15"""Handles database requests from other nova services.""" 

16 

17import collections 

18import contextlib 

19import copy 

20import eventlet 

21import functools 

22import sys 

23import typing as ty 

24 

25from keystoneauth1 import exceptions as ks_exc 

26from oslo_config import cfg 

27from oslo_db import exception as db_exc 

28from oslo_limit import exception as limit_exceptions 

29from oslo_log import log as logging 

30import oslo_messaging as messaging 

31from oslo_serialization import jsonutils 

32from oslo_utils import excutils 

33from oslo_utils import timeutils 

34from oslo_utils import versionutils 

35 

36from nova.accelerator import cyborg 

37from nova import availability_zones 

38from nova.compute import instance_actions 

39from nova.compute import rpcapi as compute_rpcapi 

40from nova.compute import task_states 

41from nova.compute import utils as compute_utils 

42from nova.compute.utils import wrap_instance_event 

43from nova.compute import vm_states 

44from nova.conductor.tasks import cross_cell_migrate 

45from nova.conductor.tasks import live_migrate 

46from nova.conductor.tasks import migrate 

47from nova import context as nova_context 

48from nova import exception 

49from nova.i18n import _ 

50from nova.image import glance 

51from nova.limit import placement as placement_limits 

52from nova.limit import utils as limit_utils 

53from nova import manager 

54from nova.network import neutron 

55from nova import notifications 

56from nova import objects 

57from nova.objects import base as nova_object 

58from nova.objects import fields 

59from nova import profiler 

60from nova import rpc 

61from nova.scheduler.client import query 

62from nova.scheduler.client import report 

63from nova.scheduler import utils as scheduler_utils 

64from nova import servicegroup 

65from nova import utils 

66from nova.volume import cinder 

67 

68LOG = logging.getLogger(__name__) 

69CONF = cfg.CONF 

70 

71 

72def targets_cell(fn): 

73 """Wrap a method and automatically target the instance's cell. 

74 

75 This decorates a method with signature func(self, context, instance, ...) 

76 and automatically targets the context with the instance's cell 

77 mapping. It does this by looking up the InstanceMapping. 

78 """ 

79 @functools.wraps(fn) 

80 def wrapper(self, context, *args, **kwargs): 

81 instance = kwargs.get('instance') or args[0] 

82 try: 

83 im = objects.InstanceMapping.get_by_instance_uuid( 

84 context, instance.uuid) 

85 except exception.InstanceMappingNotFound: 

86 LOG.error('InstanceMapping not found, unable to target cell', 

87 instance=instance) 

88 except db_exc.CantStartEngineError: 

89 # Check to see if we can ignore API DB connection failures 

90 # because we might already be in the cell conductor. 

91 with excutils.save_and_reraise_exception() as err_ctxt: 

92 if CONF.api_database.connection is None: 

93 err_ctxt.reraise = False 

94 else: 

95 LOG.debug('Targeting cell %(cell)s for conductor method %(meth)s', 

96 {'cell': im.cell_mapping.identity, 

97 'meth': fn.__name__}) 

98 # NOTE(danms): Target our context to the cell for the rest of 

99 # this request, so that none of the subsequent code needs to 

100 # care about it. 

101 nova_context.set_target_cell(context, im.cell_mapping) 

102 return fn(self, context, *args, **kwargs) 

103 return wrapper 
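
# Editor's illustration (not part of the original manager.py): a minimal usage
# sketch for the decorator above. The class and method names are hypothetical;
# the only requirement is the (self, context, instance, ...) signature that the
# wrapper relies on to look up the InstanceMapping.
class _ExampleCellAwareManager:

    @targets_cell
    def refresh_instance(self, context, instance):
        # By the time the body runs, `context` has been targeted to the
        # instance's cell, so this read goes to the correct cell database.
        return objects.Instance.get_by_uuid(context, instance.uuid)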

104 

105 

106class ConductorManager(manager.Manager): 

107 """Mission: Conduct things. 

108 

109 The methods in the base API for nova-conductor are various proxy operations 

110 performed on behalf of the nova-compute service running on compute nodes. 

111 Compute nodes are not allowed to directly access the database, so this set 

112 of methods allows them to get specific work done without locally accessing 

113 the database. 

114 

115 The nova-conductor service also exposes an API in the 'compute_task' 

116 namespace. See the ComputeTaskManager class for details. 

117 """ 

118 

119 target = messaging.Target(version='3.0') 

120 

121 def __init__(self, *args, **kwargs): 

122 super(ConductorManager, self).__init__(service_name='conductor', 

123 *args, **kwargs) 

124 self.compute_task_mgr = ComputeTaskManager() 

125 self.additional_endpoints.append(self.compute_task_mgr) 

126 

127 # NOTE(hanlind): This can be removed in version 4.0 of the RPC API 

128 def provider_fw_rule_get_all(self, context): 

129 # NOTE(hanlind): Simulate an empty db result for compat reasons. 

130 return [] 

131 

132 def _object_dispatch(self, target, method, args, kwargs): 

133 """Dispatch a call to an object method. 

134 

135 This ensures that object methods get called and any exception 

136 that is raised gets wrapped in an ExpectedException for forwarding 

137 back to the caller (without spamming the conductor logs). 

138 """ 

139 try: 

140 # NOTE(danms): Keep the getattr inside the try block since 

141 # a missing method is really a client problem 

142 return getattr(target, method)(*args, **kwargs) 

143 except Exception: 

144 raise messaging.ExpectedException() 

145 

146 def object_class_action_versions(self, context, objname, objmethod, 

147 object_versions, args, kwargs): 

148 objclass = nova_object.NovaObject.obj_class_from_name( 

149 objname, object_versions[objname]) 

150 args = tuple([context] + list(args)) 

151 result = self._object_dispatch(objclass, objmethod, args, kwargs) 

152 # NOTE(danms): The RPC layer will convert to primitives for us, 

153 # but in this case, we need to honor the version the client is 

154 # asking for, so we do it before returning here. 

155 # NOTE(hanlind): Do not convert older than requested objects, 

156 # see bug #1596119. 

157 if isinstance(result, nova_object.NovaObject): 

158 target_version = object_versions[objname] 

159 requested_version = versionutils.convert_version_to_tuple( 

160 target_version) 

161 actual_version = versionutils.convert_version_to_tuple( 

162 result.VERSION) 

163 do_backport = requested_version < actual_version 

164 other_major_version = requested_version[0] != actual_version[0] 

165 if do_backport or other_major_version: 

166 result = result.obj_to_primitive( 

167 target_version=target_version, 

168 version_manifest=object_versions) 

169 return result 

170 

171 def object_action(self, context, objinst, objmethod, args, kwargs): 

172 """Perform an action on an object.""" 

173 oldobj = objinst.obj_clone() 

174 result = self._object_dispatch(objinst, objmethod, args, kwargs) 

175 updates = dict() 

176 # NOTE(danms): Diff the object with the one passed to us and 

177 # generate a list of changes to forward back 

178 for name, field in objinst.fields.items(): 

179 if not objinst.obj_attr_is_set(name):

180 # Avoid demand-loading anything 

181 continue 

182 if (not oldobj.obj_attr_is_set(name) or

183 getattr(oldobj, name) != getattr(objinst, name)): 

184 updates[name] = field.to_primitive(objinst, name, 

185 getattr(objinst, name)) 

186 # This is safe since a field named this would conflict with the 

187 # method anyway 

188 updates['obj_what_changed'] = objinst.obj_what_changed() 

189 return updates, result 

190 

191 def object_backport_versions(self, context, objinst, object_versions): 

192 target = object_versions[objinst.obj_name()] 

193 LOG.debug('Backporting %(obj)s to %(ver)s with versions %(manifest)s', 

194 {'obj': objinst.obj_name(), 

195 'ver': target, 

196 'manifest': ','.join( 

197 ['%s=%s' % (name, ver) 

198 for name, ver in object_versions.items()])}) 

199 return objinst.obj_to_primitive(target_version=target, 

200 version_manifest=object_versions) 

201 

202 def reset(self): 

203 objects.Service.clear_min_version_cache() 
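
# Editor's illustration (not part of the original manager.py): the backport
# decision used by object_class_action_versions above, pulled out as a tiny
# standalone helper so the comparison reads clearly. The helper name and the
# example version strings are hypothetical.
def _example_needs_backport(requested_version: str, actual_version: str) -> bool:
    requested = versionutils.convert_version_to_tuple(requested_version)
    actual = versionutils.convert_version_to_tuple(actual_version)
    # Backport when the client asked for an older version, or when the major
    # versions differ; never down-convert a result that is already older than
    # requested (bug #1596119).
    return requested < actual or requested[0] != actual[0]

# For example: _example_needs_backport('1.3', '1.5') is True (client is older),
# while _example_needs_backport('1.5', '1.3') is False (result already older).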

204 

205 

206@contextlib.contextmanager 

207def try_target_cell(context, cell): 

208 """If cell is not None call func with context.target_cell. 

209 

210 This is a method to help during the transition period. Currently 

211 various mappings may not exist if a deployment has not migrated to 

212 cellsv2. If there is no mapping call the func as normal, otherwise 

213 call it in a target_cell context. 

214 """ 

215 if cell:

216 with nova_context.target_cell(context, cell) as cell_context: 

217 yield cell_context 

218 else: 

219 yield context 

220 

221 

222@contextlib.contextmanager 

223def obj_target_cell(obj, cell): 

224 """Run with object's context set to a specific cell""" 

225 with try_target_cell(obj._context, cell) as target: 

226 with obj.obj_alternate_context(target): 

227 yield target 
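
# Editor's illustration (not part of the original manager.py): how the two
# context managers above are typically composed. `instance` is assumed to be an
# objects.Instance and `cell_mapping` an objects.CellMapping, or None for a
# deployment that has not finished the cells v2 migration.
def _example_save_in_cell(instance, cell_mapping):
    # Point the instance's context at the given cell only for this block; with
    # cell_mapping=None the instance's existing context is used unchanged.
    with obj_target_cell(instance, cell_mapping):
        instance.save()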

228 

229 

230@profiler.trace_cls("rpc") 

231class ComputeTaskManager: 

232 """Namespace for compute methods. 

233 

234 This class presents an rpc API for nova-conductor under the 'compute_task' 

235 namespace. The methods here are compute operations that are invoked 

236 by the API service. These methods see the operation to completion, which 

237 may involve coordinating activities on multiple compute nodes. 

238 """ 

239 

240 target = messaging.Target(namespace='compute_task', version='1.25') 

241 

242 def __init__(self): 

243 self.compute_rpcapi = compute_rpcapi.ComputeAPI() 

244 self.volume_api = cinder.API() 

245 self.image_api = glance.API() 

246 self.network_api = neutron.API() 

247 self.servicegroup_api = servicegroup.API() 

248 self.query_client = query.SchedulerQueryClient() 

249 self.notifier = rpc.get_notifier('compute') 

250 # Help us to record host in EventReporter 

251 self.host = CONF.host 

252 

253 try: 

254 # Test our placement client during initialization 

255 self.report_client 

256 except (ks_exc.EndpointNotFound, 

257 ks_exc.DiscoveryFailure, 

258 ks_exc.RequestTimeout, 

259 ks_exc.GatewayTimeout, 

260 ks_exc.ConnectFailure) as e: 

261 # Non-fatal, likely transient (although not definitely); 

262 # continue startup but log the warning so that when things 

263 # fail later, it will be clear why we can not do certain 

264 # things. 

265 LOG.warning('Unable to initialize placement client (%s); ' 

266 'Continuing with startup, but some operations ' 

267 'will not be possible.', e) 

268 except (ks_exc.MissingAuthPlugin, 

269 ks_exc.Unauthorized) as e: 

270 # This is almost definitely fatal mis-configuration. The 

271 # Unauthorized error might be transient, but it is 

272 # probably reasonable to consider it fatal. 

273 LOG.error('Fatal error initializing placement client; ' 

274 'config is incorrect or incomplete: %s', e) 

275 raise 

276 except Exception as e: 

277 # Unknown/unexpected errors here are fatal 

278 LOG.error('Fatal error initializing placement client: %s', e) 

279 raise 

280 

281 @property 

282 def report_client(self): 

283 return report.report_client_singleton() 

284 

285 def reset(self): 

286 LOG.info('Reloading compute RPC API') 

287 compute_rpcapi.LAST_VERSION = None 

288 self.compute_rpcapi = compute_rpcapi.ComputeAPI() 

289 

290 # TODO(tdurakov): remove `live` parameter here on compute task api RPC 

291 # version bump to 2.x 

292 # TODO(danms): remove the `reservations` parameter here on compute task api 

293 # RPC version bump to 2.x 

294 @messaging.expected_exceptions( 

295 exception.NoValidHost, 

296 exception.ComputeServiceUnavailable, 

297 exception.ComputeHostNotFound, 

298 exception.InvalidHypervisorType, 

299 exception.InvalidCPUInfo, 

300 exception.UnableToMigrateToSelf, 

301 exception.DestinationHypervisorTooOld, 

302 exception.InvalidLocalStorage, 

303 exception.InvalidSharedStorage, 

304 exception.HypervisorUnavailable, 

305 exception.InstanceInvalidState, 

306 exception.MigrationPreCheckError, 

307 exception.UnsupportedPolicyException) 

308 @targets_cell 

309 @wrap_instance_event(prefix='conductor') 

310 def migrate_server(self, context, instance, scheduler_hint, live, rebuild, 

311 flavor, block_migration, disk_over_commit, reservations=None, 

312 clean_shutdown=True, request_spec=None, host_list=None): 

313 if instance and not isinstance(instance, nova_object.NovaObject):

314 # NOTE(danms): Until v2 of the RPC API, we need to tolerate 

315 # old-world instance objects here 

316 attrs = ['metadata', 'system_metadata', 'info_cache', 

317 'security_groups'] 

318 instance = objects.Instance._from_db_object( 

319 context, objects.Instance(), instance, 

320 expected_attrs=attrs) 

321 # NOTE: Remove this when we drop support for v1 of the RPC API 

322 if flavor and not isinstance(flavor, objects.Flavor):

323 # Code downstream may expect extra_specs to be populated since it 

324 # is receiving an object, so lookup the flavor to ensure this. 

325 flavor = objects.Flavor.get_by_id(context, flavor['id']) 

326 if live and not rebuild and not flavor: 

327 self._live_migrate(context, instance, scheduler_hint, 

328 block_migration, disk_over_commit, request_spec) 

329 elif not live and not rebuild and flavor: 

330 instance_uuid = instance.uuid 

331 with compute_utils.EventReporter(context, 'cold_migrate', 

332 self.host, instance_uuid): 

333 self._cold_migrate(context, instance, flavor, 

334 scheduler_hint['filter_properties'], 

335 clean_shutdown, request_spec, 

336 host_list) 

337 else: 

338 raise NotImplementedError() 
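
# Editor's note, not in the original source: a summary of the dispatch above.
# migrate_server handles exactly two move operations:
#
#     live and not rebuild and not flavor -> _live_migrate()
#     not live and not rebuild and flavor -> _cold_migrate() (a resize when
#                                            the flavor actually changes)
#     any other combination               -> NotImplementedError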

339 

340 @staticmethod 

341 def _get_request_spec_for_cold_migrate(context, instance, flavor, 

342 filter_properties, request_spec): 

343 # NOTE(sbauza): If a reschedule occurs when prep_resize(), then 

344 # it only provides filter_properties legacy dict back to the 

345 # conductor with no RequestSpec part of the payload for <Stein 

346 # computes. 

347 # TODO(mriedem): We can remove this compat code for no request spec 

348 # coming to conductor in ComputeTaskAPI RPC API version 2.0 

349 if not request_spec: 

350 image_meta = utils.get_image_from_system_metadata( 

351 instance.system_metadata) 

352 # Make sure we hydrate a new RequestSpec object with the new flavor 

353 # and not the nested one from the instance 

354 request_spec = objects.RequestSpec.from_components( 

355 context, instance.uuid, image_meta, 

356 flavor, instance.numa_topology, instance.pci_requests, 

357 filter_properties, None, instance.availability_zone, 

358 project_id=instance.project_id, user_id=instance.user_id) 

359 elif not isinstance(request_spec, objects.RequestSpec): 

360 # Prior to compute RPC API 5.1 conductor would pass a legacy dict 

361 # version of the request spec to compute and Stein compute 

362 # could be sending that back to conductor on reschedule, so if we 

363 # got a dict convert it to an object. 

364 # TODO(mriedem): We can drop this compat code when we only support 

365 # compute RPC API >=6.0. 

366 request_spec = objects.RequestSpec.from_primitives( 

367 context, request_spec, filter_properties) 

368 # We don't have to set the new flavor on the request spec because 

369 # if we got here it was due to a reschedule from the compute and 

370 # the request spec would already have the new flavor in it from the 

371 # else block below. 

372 else: 

373 # NOTE(sbauza): Resizes means new flavor, so we need to update the 

374 # original RequestSpec object to make sure the scheduler verifies 

375 # the right one and not the original flavor 

376 request_spec.flavor = flavor 

377 return request_spec 

378 

379 def _cold_migrate(self, context, instance, flavor, filter_properties, 

380 clean_shutdown, request_spec, host_list): 

381 request_spec = self._get_request_spec_for_cold_migrate( 

382 context, instance, flavor, filter_properties, request_spec) 

383 

384 task = self._build_cold_migrate_task(context, instance, flavor, 

385 request_spec, clean_shutdown, host_list) 

386 try: 

387 task.execute() 

388 except exception.NoValidHost as ex: 

389 vm_state = instance.vm_state 

390 if not vm_state:

391 vm_state = vm_states.ACTIVE 

392 updates = {'vm_state': vm_state, 'task_state': None} 

393 self._set_vm_state_and_notify(context, instance.uuid, 

394 'migrate_server', 

395 updates, ex, request_spec) 

396 

397 # if the flavor IDs match, it's migrate; otherwise resize 

398 if flavor.id == instance.instance_type_id: 

399 msg = _("No valid host found for cold migrate") 

400 else: 

401 msg = _("No valid host found for resize") 

402 raise exception.NoValidHost(reason=msg) 

403 except exception.UnsupportedPolicyException as ex: 

404 with excutils.save_and_reraise_exception(): 

405 vm_state = instance.vm_state 

406 if not vm_state:

407 vm_state = vm_states.ACTIVE 

408 updates = {'vm_state': vm_state, 'task_state': None} 

409 self._set_vm_state_and_notify(context, instance.uuid, 

410 'migrate_server', 

411 updates, ex, request_spec) 

412 except Exception as ex: 

413 with excutils.save_and_reraise_exception(): 

414 # Refresh the instance so we don't overwrite vm_state changes 

415 # set after we executed the task. 

416 try: 

417 instance.refresh() 

418 # Passing vm_state is kind of silly but it's expected in 

419 # set_vm_state_and_notify. 

420 updates = {'vm_state': instance.vm_state, 

421 'task_state': None} 

422 self._set_vm_state_and_notify(context, instance.uuid, 

423 'migrate_server', 

424 updates, ex, request_spec) 

425 except exception.InstanceNotFound: 

426 # We can't send the notification because the instance is 

427 # gone so just log it. 

428 LOG.info('During %s the instance was deleted.', 

429 'resize' if instance.instance_type_id != flavor.id 

430 else 'cold migrate', instance=instance) 

431 # NOTE(sbauza): Persist the new flavor only if the scheduler call 

432 # succeeded and nothing bad happened afterwards 

433 if request_spec.obj_what_changed():

434 request_spec.save() 

435 

436 def _set_vm_state_and_notify(self, context, instance_uuid, method, updates, 

437 ex, request_spec): 

438 scheduler_utils.set_vm_state_and_notify( 

439 context, instance_uuid, 'compute_task', method, updates, 

440 ex, request_spec) 

441 

442 def _cleanup_allocated_networks( 

443 self, context, instance, requested_networks): 

444 try: 

445 # If we were told not to allocate networks let's save ourselves 

446 # the trouble of calling the network API. 

447 if not (requested_networks and requested_networks.no_allocate): 

448 self.network_api.deallocate_for_instance( 

449 context, instance, requested_networks=requested_networks) 

450 except Exception: 

451 LOG.exception('Failed to deallocate networks', instance=instance) 

452 return 

453 

454 instance.system_metadata['network_allocated'] = 'False' 

455 try: 

456 instance.save() 

457 except exception.InstanceNotFound: 

458 # NOTE: It's possible that we're cleaning up the networks 

459 # because the instance was deleted. If that's the case then this 

460 # exception will be raised by instance.save() 

461 pass 

462 

463 @targets_cell 

464 @wrap_instance_event(prefix='conductor') 

465 def live_migrate_instance(self, context, instance, scheduler_hint, 

466 block_migration, disk_over_commit, request_spec): 

467 self._live_migrate(context, instance, scheduler_hint, 

468 block_migration, disk_over_commit, request_spec) 

469 

470 def _live_migrate(self, context, instance, scheduler_hint, 

471 block_migration, disk_over_commit, request_spec): 

472 destination = scheduler_hint.get("host") 

473 

474 def _set_vm_state(context, instance, ex, vm_state=None, 

475 task_state=None): 

476 request_spec = {'instance_properties': { 

477 'uuid': instance.uuid, }, 

478 } 

479 scheduler_utils.set_vm_state_and_notify(context, 

480 instance.uuid, 

481 'compute_task', 'migrate_server', 

482 dict(vm_state=vm_state, 

483 task_state=task_state, 

484 expected_task_state=task_states.MIGRATING,), 

485 ex, request_spec) 

486 

487 migration = objects.Migration(context=context.elevated()) 

488 migration.dest_compute = destination 

489 migration.status = 'accepted' 

490 migration.instance_uuid = instance.uuid 

491 migration.source_compute = instance.host 

492 migration.migration_type = fields.MigrationType.LIVE_MIGRATION 

493 if instance.obj_attr_is_set('flavor'): 

494 migration.old_instance_type_id = instance.flavor.id 

495 migration.new_instance_type_id = instance.flavor.id 

496 else: 

497 migration.old_instance_type_id = instance.instance_type_id 

498 migration.new_instance_type_id = instance.instance_type_id 

499 migration.create() 

500 

501 task = self._build_live_migrate_task(context, instance, destination, 

502 block_migration, disk_over_commit, 

503 migration, request_spec) 

504 try: 

505 task.execute() 

506 except (exception.NoValidHost, 

507 exception.ComputeHostNotFound, 

508 exception.ComputeServiceUnavailable, 

509 exception.InvalidHypervisorType, 

510 exception.InvalidCPUInfo, 

511 exception.UnableToMigrateToSelf, 

512 exception.DestinationHypervisorTooOld, 

513 exception.InvalidLocalStorage, 

514 exception.InvalidSharedStorage, 

515 exception.HypervisorUnavailable, 

516 exception.InstanceInvalidState, 

517 exception.MigrationPreCheckError, 

518 exception.MigrationSchedulerRPCError) as ex: 

519 with excutils.save_and_reraise_exception(): 

520 _set_vm_state(context, instance, ex, instance.vm_state) 

521 migration.status = 'error' 

522 migration.save() 

523 except Exception as ex: 

524 LOG.error('Migration of instance %(instance_id)s to host' 

525 ' %(dest)s unexpectedly failed.', 

526 {'instance_id': instance.uuid, 'dest': destination}, 

527 exc_info=True) 

528 # Reset the task state to None to indicate completion of 

529 # the operation as it is done in case of known exceptions. 

530 _set_vm_state(context, instance, ex, vm_states.ERROR, 

531 task_state=None) 

532 migration.status = 'error' 

533 migration.save() 

534 raise exception.MigrationError(reason=str(ex)) 

535 

536 def _build_live_migrate_task(self, context, instance, destination, 

537 block_migration, disk_over_commit, migration, 

538 request_spec=None): 

539 return live_migrate.LiveMigrationTask(context, instance, 

540 destination, block_migration, 

541 disk_over_commit, migration, 

542 self.compute_rpcapi, 

543 self.servicegroup_api, 

544 self.query_client, 

545 self.report_client, 

546 request_spec) 

547 

548 def _build_cold_migrate_task(self, context, instance, flavor, request_spec, 

549 clean_shutdown, host_list): 

550 return migrate.MigrationTask(context, instance, flavor, 

551 request_spec, clean_shutdown, 

552 self.compute_rpcapi, 

553 self.query_client, self.report_client, 

554 host_list, self.network_api) 

555 

556 def _destroy_build_request(self, context, instance): 

557 # The BuildRequest needs to be stored until the instance is mapped to 

558 # an instance table. At that point it will never be used again and 

559 # should be deleted. 

560 build_request = objects.BuildRequest.get_by_instance_uuid( 

561 context, instance.uuid) 

562 # TODO(alaski): Sync API updates of the build_request to the 

563 # instance before it is destroyed. Right now only locked_by can 

564 # be updated before this is destroyed. 

565 build_request.destroy() 

566 

567 def _populate_instance_mapping(self, context, instance, host): 

568 try: 

569 inst_mapping = objects.InstanceMapping.get_by_instance_uuid( 

570 context, instance.uuid) 

571 except exception.InstanceMappingNotFound: 

572 # NOTE(alaski): If nova-api is up to date this exception should 

573 # never be hit. But during an upgrade it's possible that an old 

574 # nova-api didn't create an instance_mapping during this boot 

575 # request. 

576 LOG.debug('Instance was not mapped to a cell, likely due ' 

577 'to an older nova-api service running.', 

578 instance=instance) 

579 return None 

580 else: 

581 try: 

582 host_mapping = objects.HostMapping.get_by_host(context, 

583 host.service_host) 

584 except exception.HostMappingNotFound: 

585 # NOTE(alaski): For now this exception means that a 

586 # deployment has not migrated to cellsv2 and we should 

587 # remove the instance_mapping that has been created. 

588 # Eventually this will indicate a failure to properly map a 

589 # host to a cell and we may want to reschedule. 

590 inst_mapping.destroy() 

591 return None 

592 else: 

593 inst_mapping.cell_mapping = host_mapping.cell_mapping 

594 inst_mapping.save() 

595 return inst_mapping 

596 

597 def _validate_existing_attachment_ids(self, context, instance, bdms): 

598 """Ensure any attachment ids referenced by the bdms exist. 

599 

600 New attachments will only be created if the attachment ids referenced 

601 by the bdms no longer exist. This can happen when an instance is 

602 rescheduled after a failure to spawn as cleanup code on the previous 

603 host will delete attachments before rescheduling. 

604 """ 

605 for bdm in bdms: 

606 if bdm.is_volume and bdm.attachment_id:

607 try: 

608 self.volume_api.attachment_get(context, bdm.attachment_id) 

609 except exception.VolumeAttachmentNotFound: 

610 attachment = self.volume_api.attachment_create( 

611 context, bdm.volume_id, instance.uuid) 

612 bdm.attachment_id = attachment['id'] 

613 bdm.save() 

614 

615 def _cleanup_when_reschedule_fails( 

616 self, context, instance, exception, legacy_request_spec, 

617 requested_networks): 

618 """Set the instance state and clean up. 

619 

620 It is only used in case build_instance fails while rescheduling the 

621 instance 

622 """ 

623 

624 updates = {'vm_state': vm_states.ERROR, 

625 'task_state': None} 

626 self._set_vm_state_and_notify( 

627 context, instance.uuid, 'build_instances', updates, exception, 

628 legacy_request_spec) 

629 self._cleanup_allocated_networks( 

630 context, instance, requested_networks) 

631 

632 arq_uuids = None 

633 # arqs have not bound to port/instance yet 

634 if requested_networks: 

635 arq_uuids = [req.arq_uuid 

636 for req in requested_networks if req.arq_uuid] 

637 compute_utils.delete_arqs_if_needed(context, instance, arq_uuids) 

638 

639 # NOTE(danms): This is never cell-targeted because it is only used for 

640 # n-cpu reschedules which go to the cell conductor and thus are always 

641 # cell-specific. 

642 def build_instances(self, context, instances, image, filter_properties, 

643 admin_password, injected_files, requested_networks, 

644 security_groups, block_device_mapping=None, legacy_bdm=True, 

645 request_spec=None, host_lists=None): 

646 # TODO(ndipanov): Remove block_device_mapping and legacy_bdm in version 

647 # 2.0 of the RPC API. 

648 # TODO(danms): Remove this in version 2.0 of the RPC API 

649 if (requested_networks and

650 not isinstance(requested_networks, 

651 objects.NetworkRequestList)): 

652 requested_networks = objects.NetworkRequestList.from_tuples( 

653 requested_networks) 

654 # TODO(melwitt): Remove this in version 2.0 of the RPC API 

655 flavor = filter_properties.get('instance_type') 

656 if flavor and not isinstance(flavor, objects.Flavor):

657 # Code downstream may expect extra_specs to be populated since it 

658 # is receiving an object, so lookup the flavor to ensure this. 

659 flavor = objects.Flavor.get_by_id(context, flavor['id']) 

660 filter_properties = dict(filter_properties, instance_type=flavor) 

661 

662 # Older computes will not send a request_spec during reschedules so we 

663 # need to check and build our own if one is not provided. 

664 if request_spec is None: 

665 legacy_request_spec = scheduler_utils.build_request_spec( 

666 image, instances) 

667 else: 

668 # TODO(mriedem): This is annoying but to populate the local 

669 # request spec below using the filter_properties, we have to pass 

670 # in a primitive version of the request spec. Yes it's inefficient 

671 # and we can remove it once the populate_retry and 

672 # populate_filter_properties utility methods are converted to 

673 # work on a RequestSpec object rather than filter_properties. 

674 # NOTE(gibi): we have to keep a reference to the original 

675 # RequestSpec object passed to this function as we lose information 

676 # during the below legacy conversion 

677 legacy_request_spec = request_spec.to_legacy_request_spec_dict() 

678 

679 # 'host_lists' will be None during a reschedule from a pre-Queens 

680 # compute. In all other cases, it will be a list of lists, though the 

681 # lists may be empty if there are no more hosts left in a rescheduling 

682 # situation. 

683 is_reschedule = host_lists is not None 

684 try: 

685 # check retry policy. Rather ugly use of instances[0]... 

686 # but if we've exceeded max retries... then we really only 

687 # have a single instance. 

688 # TODO(sbauza): Provide directly the RequestSpec object 

689 # when populate_retry() accepts it 

690 scheduler_utils.populate_retry( 

691 filter_properties, instances[0].uuid) 

692 instance_uuids = [instance.uuid for instance in instances] 

693 spec_obj = objects.RequestSpec.from_primitives( 

694 context, legacy_request_spec, filter_properties) 

695 LOG.debug("Rescheduling: %s", is_reschedule) 

696 if is_reschedule: 

697 # Make sure that we have a host, as we may have exhausted all 

698 # our alternates 

699 if not host_lists[0]:

700 # We have an empty list of hosts, so this instance has 

701 # failed to build. 

702 msg = ("Exhausted all hosts available for retrying build " 

703 "failures for instance %(instance_uuid)s." % 

704 {"instance_uuid": instances[0].uuid}) 

705 raise exception.MaxRetriesExceeded(reason=msg) 

706 else: 

707 # This is not a reschedule, so we need to call the scheduler to 

708 # get appropriate hosts for the request. 

709 # NOTE(gibi): We only call the scheduler if we are rescheduling 

710 # from a really old compute. In that case we do not support 

711 # externally-defined resource requests, like port QoS. So no 

712 # requested_resources are set on the RequestSpec here. 

713 host_lists = self._schedule_instances(context, spec_obj, 

714 instance_uuids, return_alternates=True) 

715 except Exception as exc: 

716 # NOTE(mriedem): If we're rescheduling from a failed build on a 

717 # compute, "retry" will be set and num_attempts will be >1 because 

718 # populate_retry above will increment it. If the server build was 

719 # forced onto a host/node or [scheduler]/max_attempts=1, "retry" 

720 # won't be in filter_properties and we won't get here because 

721 # nova-compute will just abort the build since reschedules are 

722 # disabled in those cases. 

723 num_attempts = filter_properties.get( 

724 'retry', {}).get('num_attempts', 1) 

725 for instance in instances: 

726 # If num_attempts > 1, we're in a reschedule and probably 

727 # either hit NoValidHost or MaxRetriesExceeded. Either way, 

728 # the build request should already be gone and we probably 

729 # can't reach the API DB from the cell conductor. 

730 if num_attempts <= 1: 

731 try: 

732 # If the BuildRequest stays around then instance 

733 # show/lists will pull from it rather than the errored 

734 # instance. 

735 self._destroy_build_request(context, instance) 

736 except exception.BuildRequestNotFound: 

737 pass 

738 self._cleanup_when_reschedule_fails( 

739 context, instance, exc, legacy_request_spec, 

740 requested_networks) 

741 return 

742 

743 elevated = context.elevated() 

744 for (instance, host_list) in zip(instances, host_lists): 

745 host = host_list.pop(0) 

746 if is_reschedule: 

747 # If this runs in the superconductor, the first instance will 

748 # already have its resources claimed in placement. If this is a 

749 # retry, though, this is running in the cell conductor, and we 

750 # need to claim first to ensure that the alternate host still 

751 # has its resources available. Note that there are schedulers 

752 # that don't support Placement, so must assume that the host is 

753 # still available. 

754 host_available = False 

755 while host and not host_available: 

756 if host.allocation_request:

757 alloc_req = jsonutils.loads(host.allocation_request) 

758 else: 

759 alloc_req = None 

760 if alloc_req:

761 try: 

762 host_available = scheduler_utils.claim_resources( 

763 elevated, self.report_client, spec_obj, 

764 instance.uuid, alloc_req, 

765 host.allocation_request_version) 

766 if request_spec and host_available: 

767 # NOTE(gibi): redo the request group - resource 

768 # provider mapping as the above claim call 

769 # moves the allocation of the instance to 

770 # another host 

771 scheduler_utils.fill_provider_mapping( 

772 request_spec, host) 

773 except Exception as exc: 

774 self._cleanup_when_reschedule_fails( 

775 context, instance, exc, legacy_request_spec, 

776 requested_networks) 

777 return 

778 else: 

779 # Some deployments use different schedulers that do not 

780 # use Placement, so they will not have an 

781 # allocation_request to claim with. For those cases, 

782 # there is no concept of claiming, so just assume that 

783 # the host is valid. 

784 host_available = True 

785 if not host_available: 

786 # Insufficient resources remain on that host, so 

787 # discard it and try the next. 

788 host = host_list.pop(0) if host_list else None 

789 if not host_available: 

790 # No more available hosts for retrying the build. 

791 msg = ("Exhausted all hosts available for retrying build " 

792 "failures for instance %(instance_uuid)s." % 

793 {"instance_uuid": instance.uuid}) 

794 exc = exception.MaxRetriesExceeded(reason=msg) 

795 self._cleanup_when_reschedule_fails( 

796 context, instance, exc, legacy_request_spec, 

797 requested_networks) 

798 return 

799 

800 # The availability_zone field was added in v1.1 of the Selection 

801 # object so make sure to handle the case where it is missing. 

802 if 'availability_zone' in host:

803 instance.availability_zone = host.availability_zone 

804 else: 

805 try: 

806 instance.availability_zone = ( 

807 availability_zones.get_host_availability_zone(context, 

808 host.service_host)) 

809 except Exception as exc: 

810 # Put the instance into ERROR state, set task_state to 

811 # None, inject a fault, etc. 

812 self._cleanup_when_reschedule_fails( 

813 context, instance, exc, legacy_request_spec, 

814 requested_networks) 

815 continue 

816 

817 try: 

818 # NOTE(danms): This saves the az change above, refreshes our 

819 # instance, and tells us if it has been deleted underneath us 

820 instance.save() 

821 except (exception.InstanceNotFound, 

822 exception.InstanceInfoCacheNotFound): 

823 LOG.debug('Instance deleted during build', instance=instance) 

824 continue 

825 local_filter_props = copy.deepcopy(filter_properties) 

826 scheduler_utils.populate_filter_properties(local_filter_props, 

827 host) 

828 # Populate the request_spec with the local_filter_props information 

829 # like retries and limits. Note that at this point the request_spec 

830 # could have come from a compute via reschedule and it would 

831 # already have some things set, like scheduler_hints. 

832 local_reqspec = objects.RequestSpec.from_primitives( 

833 context, legacy_request_spec, local_filter_props) 

834 

835 # NOTE(gibi): at this point the request spec already got converted 

836 # to a legacy dict and then back to an object so we lost the non 

837 # legacy part of the spec. Re-populate the requested_resources 

838 # field based on the original request spec object passed to this 

839 # function. 

840 if request_spec: 

841 local_reqspec.requested_resources = ( 

842 request_spec.requested_resources) 

843 

844 # The block_device_mapping passed from the api doesn't contain 

845 # instance specific information 

846 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid( 

847 context, instance.uuid) 

848 

849 # This is populated in scheduler_utils.populate_retry 

850 num_attempts = local_filter_props.get('retry', 

851 {}).get('num_attempts', 1) 

852 if num_attempts <= 1: 

853 # If this is a reschedule the instance is already mapped to 

854 # this cell and the BuildRequest is already deleted so ignore 

855 # the logic below. 

856 inst_mapping = self._populate_instance_mapping(context, 

857 instance, 

858 host) 

859 try: 

860 self._destroy_build_request(context, instance) 

861 except exception.BuildRequestNotFound: 

862 # This indicates an instance delete has been requested in 

863 # the API. Stop the build, clean up the instance_mapping and 

864 # potentially the block_device_mappings 

865 # TODO(alaski): Handle block_device_mapping cleanup 

866 if inst_mapping: 

867 inst_mapping.destroy() 

868 return 

869 else: 

870 # NOTE(lyarwood): If this is a reschedule then recreate any 

871 # attachments that were previously removed when cleaning up 

872 # after failures to spawn etc. 

873 self._validate_existing_attachment_ids(context, instance, bdms) 

874 

875 alts = [(alt.service_host, alt.nodename) for alt in host_list] 

876 LOG.debug("Selected host: %s; Selected node: %s; Alternates: %s", 

877 host.service_host, host.nodename, alts, instance=instance) 

878 

879 try: 

880 accel_uuids = self._create_and_bind_arq_for_instance( 

881 context, instance, host.nodename, local_reqspec, 

882 requested_networks) 

883 except Exception as exc: 

884 LOG.exception('Failed to reschedule. Reason: %s', exc) 

885 self._cleanup_when_reschedule_fails( 

886 context, instance, exc, legacy_request_spec, 

887 requested_networks) 

888 continue 

889 

890 self.compute_rpcapi.build_and_run_instance(context, 

891 instance=instance, host=host.service_host, image=image, 

892 request_spec=local_reqspec, 

893 filter_properties=local_filter_props, 

894 admin_password=admin_password, 

895 injected_files=injected_files, 

896 requested_networks=requested_networks, 

897 security_groups=security_groups, 

898 block_device_mapping=bdms, node=host.nodename, 

899 limits=host.limits, host_list=host_list, 

900 accel_uuids=accel_uuids) 

901 

902 def _create_and_bind_arq_for_instance( 

903 self, context, instance, hostname, 

904 request_spec, requested_networks=None): 

905 try: 

906 resource_provider_mapping = ( 

907 request_spec.get_request_group_mapping()) 

908 # Using nodename instead of hostname. See: 

909 # http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa 

910 cyclient = cyborg.get_client(context) 

911 bindings = {} 

912 port_bindings = {} 

913 

914 # Create ARQs that come from extra specs. 

915 bindings = self._create_and_bind_arqs( 

916 cyclient, instance.uuid, instance.flavor.extra_specs, 

917 hostname, resource_provider_mapping) 

918 

919 if requested_networks: 

920 # Create ARQs that come from the port device profile 

921 port_bindings = self._create_arqs_for_ports( 

922 cyclient, instance.uuid, requested_networks, 

923 hostname, resource_provider_mapping) 

924 

925 # Initiate Cyborg binding asynchronously 

926 bindings.update(port_bindings) 

927 if bindings: 

928 cyclient.bind_arqs(bindings) 

929 

930 return list(bindings.keys()) 

931 

932 except exception.AcceleratorRequestBindingFailed as exc: 

933 # If anything failed here we need to clean up and bail out. 

934 cyclient = cyborg.get_client(context) 

935 cyclient.delete_arqs_by_uuid(exc.arqs) 

936 raise 

937 

938 def _schedule_instances(self, context, request_spec, 

939 instance_uuids=None, return_alternates=False): 

940 scheduler_utils.setup_instance_group(context, request_spec) 

941 with timeutils.StopWatch() as timer: 

942 host_lists = self.query_client.select_destinations( 

943 context, request_spec, instance_uuids, return_objects=True, 

944 return_alternates=return_alternates) 

945 LOG.debug('Took %0.2f seconds to select destinations for %s ' 

946 'instance(s).', timer.elapsed(), len(instance_uuids)) 

947 return host_lists 

948 

949 @staticmethod 

950 def _restrict_request_spec_to_cell(context, instance, request_spec): 

951 """Sets RequestSpec.requested_destination.cell for the move operation 

952 

953 Move operations, e.g. evacuate and unshelve, must be restricted to the 

954 cell in which the instance already exists, so this method is used to 

955 target the RequestSpec, which is sent to the scheduler via the 

956 _schedule_instances method, to the instance's current cell. 

957 

958 :param context: nova auth RequestContext 

959 """ 

960 instance_mapping = \ 

961 objects.InstanceMapping.get_by_instance_uuid( 

962 context, instance.uuid) 

963 LOG.debug('Requesting cell %(cell)s during scheduling', 

964 {'cell': instance_mapping.cell_mapping.identity}, 

965 instance=instance) 

966 if ('requested_destination' in request_spec and 

967 request_spec.requested_destination): 

968 request_spec.requested_destination.cell = ( 

969 instance_mapping.cell_mapping) 

970 else: 

971 request_spec.requested_destination = ( 

972 objects.Destination( 

973 cell=instance_mapping.cell_mapping)) 
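
# Editor's note, not in the original source: the effect of the helper above,
# sketched for an instance whose InstanceMapping points at some cell. After
#
#     self._restrict_request_spec_to_cell(context, instance, request_spec)
#
# request_spec.requested_destination.cell holds that cell's CellMapping, so
# _schedule_instances() is only offered hosts from the instance's current cell.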

974 

975 def _recheck_quota( 

976 self, 

977 context: nova_context.RequestContext, 

978 flavor: 'objects.Flavor', 

979 request_spec: 'objects.RequestSpec', 

980 orig_num_req: int, 

981 project_id: ty.Optional[str] = None, 

982 user_id: ty.Optional[str] = None 

983 ) -> None: 

984 # A quota "recheck" is a quota check that is performed *after* quota 

985 # limited resources are consumed. It is meant to address race 

986 # conditions where a request that was not over quota at the beginning 

987 # of the request before resources are allocated becomes over quota 

988 # after resources (like database rows or placement allocations) are 

989 # created. An example of this would be a large number of requests for 

990 # the same resource for the same project sent simultaneously. 

991 if CONF.quota.recheck_quota: 

992 # The orig_num_req is the number of instances requested, which is 

993 # the delta that was quota checked before resources were allocated. 

994 # This is only used for the exception message if the recheck fails 

995 # for lack of enough quota. 

996 compute_utils.check_num_instances_quota( 

997 context, flavor, 0, 0, project_id=project_id, 

998 user_id=user_id, orig_num_req=orig_num_req) 

999 placement_limits.enforce_num_instances_and_flavor( 

1000 context, project_id, flavor, request_spec.is_bfv, 0, 0) 
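
# Editor's note, not in the original source: the call pattern for the helper
# above, adapted from how unshelve_instance uses it later in this module.
# `instance` and `request_spec` are assumed to be fully loaded objects:
#
#     try:
#         self._recheck_quota(context, instance.flavor, request_spec, 0,
#                             project_id=instance.project_id,
#                             user_id=instance.user_id)
#     except (exception.TooManyInstances,
#             limit_exceptions.ProjectOverLimit):
#         # Roll back the placement allocation made during scheduling.
#         self.report_client.delete_allocation_for_instance(
#             context, instance.uuid, force=True)
#         raise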

1001 

1002 # TODO(mriedem): Make request_spec required in ComputeTaskAPI RPC v2.0. 

1003 @targets_cell 

1004 def unshelve_instance(self, context, instance, request_spec=None): 

1005 sys_meta = instance.system_metadata 

1006 

1007 def safe_image_show(ctx, image_id): 

1008 if image_id:

1009 return self.image_api.get(ctx, image_id, show_deleted=False) 

1010 else: 

1011 raise exception.ImageNotFound(image_id='') 

1012 

1013 if instance.vm_state == vm_states.SHELVED: 

1014 instance.task_state = task_states.POWERING_ON 

1015 instance.save(expected_task_state=task_states.UNSHELVING) 

1016 self.compute_rpcapi.start_instance(context, instance) 

1017 elif instance.vm_state == vm_states.SHELVED_OFFLOADED:

1018 image = None 

1019 image_id = sys_meta.get('shelved_image_id') 

1020 # No need to check for image if image_id is None as 

1021 # "shelved_image_id" key is not set for volume backed 

1022 # instance during the shelve process 

1023 if image_id: 

1024 with compute_utils.EventReporter( 

1025 context, 'get_image_info', self.host, instance.uuid): 

1026 try: 

1027 image = safe_image_show(context, image_id) 

1028 except exception.ImageNotFound as error: 

1029 instance.vm_state = vm_states.ERROR 

1030 instance.save() 

1031 

1032 reason = _('Unshelve attempted but the image %s ' 

1033 'cannot be found.') % image_id 

1034 

1035 LOG.error(reason, instance=instance) 

1036 compute_utils.add_instance_fault_from_exc( 

1037 context, instance, error, sys.exc_info(), 

1038 fault_message=reason) 

1039 raise exception.UnshelveException( 

1040 instance_id=instance.uuid, reason=reason) 

1041 

1042 try: 

1043 with compute_utils.EventReporter(context, 'schedule_instances', 

1044 self.host, instance.uuid): 

1045 # NOTE(sbauza): Force_hosts/nodes needs to be reset 

1046 # if we want to make sure that the next destination 

1047 # is not forced to be the original host 

1048 request_spec.reset_forced_destinations() 

1049 # TODO(sbauza): Provide directly the RequestSpec object 

1050 # when populate_filter_properties accepts it 

1051 filter_properties = request_spec.\ 

1052 to_legacy_filter_properties_dict() 

1053 res_req, req_lvl_params = ( 

1054 self.network_api.get_requested_resource_for_instance( 

1055 context, instance.uuid) 

1056 ) 

1057 extra_specs = request_spec.flavor.extra_specs 

1058 device_profile = extra_specs.get('accel:device_profile') 

1059 res_req.extend( 

1060 cyborg.get_device_profile_request_groups( 

1061 context, device_profile) if device_profile else []) 

1062 # NOTE(gibi): When other modules want to handle similar 

1063 # non-nova resources then here we have to collect all 

1064 # the external resource requests in a single list and 

1065 # add them to the RequestSpec. 

1066 request_spec.requested_resources = res_req 

1067 request_spec.request_level_params = req_lvl_params 

1068 

1069 # NOTE(gibi): as PCI devices are tracked in placement we 

1070 # need to generate request groups from InstancePCIRequests. 

1071 # This will append new RequestGroup objects to the 

1072 # request_spec.requested_resources list if needed 

1073 request_spec.generate_request_groups_from_pci_requests() 

1074 

1075 # NOTE(cfriesen): Ensure that we restrict the scheduler to 

1076 # the cell specified by the instance mapping. 

1077 self._restrict_request_spec_to_cell( 

1078 context, instance, request_spec) 

1079 

1080 request_spec.ensure_project_and_user_id(instance) 

1081 request_spec.ensure_network_information(instance) 

1082 compute_utils.heal_reqspec_is_bfv( 

1083 context, request_spec, instance) 

1084 host_lists = self._schedule_instances(context, 

1085 request_spec, [instance.uuid], 

1086 return_alternates=False) 

1087 

1088 # NOTE(melwitt): We recheck the quota after allocating the 

1089 # resources in placement, to prevent users from allocating 

1090 # more resources than their allowed quota in the event of a 

1091 # race. This is configurable because it can be expensive if 

1092 # strict quota limits are not required in a deployment. 

1093 try: 

1094 # Quota should only be checked for unshelve only if 

1095 # resources are being counted in placement. Legacy 

1096 # quotas continue to consume resources while 

1097 # SHELVED_OFFLOADED and will not allocate any new 

1098 # resources during unshelve. 

1099 if (CONF.quota.count_usage_from_placement or 

1100 limit_utils.use_unified_limits()): 

1101 self._recheck_quota( 

1102 context, instance.flavor, request_spec, 0, 

1103 project_id=instance.project_id, 

1104 user_id=instance.user_id) 

1105 except (exception.TooManyInstances, 

1106 limit_exceptions.ProjectOverLimit): 

1107 with excutils.save_and_reraise_exception(): 

1108 self.report_client.delete_allocation_for_instance( 

1109 context, instance.uuid, force=True) 

1110 

1111 host_list = host_lists[0] 

1112 selection = host_list[0] 

1113 scheduler_utils.populate_filter_properties( 

1114 filter_properties, selection) 

1115 (host, node) = (selection.service_host, selection.nodename) 

1116 LOG.debug( 

1117 "Scheduler selected host: %s, node:%s", 

1118 host, 

1119 node, 

1120 instance=instance 

1121 ) 

1122 instance.availability_zone = ( 

1123 availability_zones.get_host_availability_zone( 

1124 context, host)) 

1125 

1126 scheduler_utils.fill_provider_mapping( 

1127 request_spec, selection) 

1128 

1129 # NOTE(brinzhang): For the unshelve operation we should 

1130 # re-create and bind the ARQs for the instance. 

1131 accel_uuids = self._create_and_bind_arq_for_instance( 

1132 context, instance, node, request_spec) 

1133 self.compute_rpcapi.unshelve_instance( 

1134 context, instance, host, request_spec, image=image, 

1135 filter_properties=filter_properties, node=node, 

1136 accel_uuids=accel_uuids) 

1137 except (exception.NoValidHost, 

1138 exception.UnsupportedPolicyException): 

1139 instance.task_state = None 

1140 instance.save() 

1141 LOG.warning("No valid host found for unshelve instance", 

1142 instance=instance) 

1143 return 

1144 except Exception as exc: 

1145 if isinstance(exc, exception.AcceleratorRequestBindingFailed): 

1146 cyclient = cyborg.get_client(context) 

1147 cyclient.delete_arqs_by_uuid(exc.arqs) 

1148 LOG.exception('Failed to unshelve. Reason: %s', exc) 

1149 with excutils.save_and_reraise_exception(): 

1150 instance.task_state = None 

1151 instance.save() 

1152 LOG.error("Unshelve attempted but an error " 

1153 "has occurred", instance=instance) 

1154 else: 

1155 LOG.error('Unshelve attempted but vm_state not SHELVED or ' 

1156 'SHELVED_OFFLOADED', instance=instance) 

1157 instance.vm_state = vm_states.ERROR 

1158 instance.save() 

1159 return 

1160 

1161 def _allocate_for_evacuate_dest_host(self, context, instance, host, 

1162 request_spec=None): 

1163 # The user is forcing the destination host and bypassing the 

1164 # scheduler. We need to copy the source compute node 

1165 # allocations in Placement to the destination compute node. 

1166 # Normally select_destinations() in the scheduler would do this 

1167 # for us, but when forcing the target host we don't call the 

1168 # scheduler. 

1169 source_node = None # This is used for error handling below. 

1170 try: 

1171 source_node = objects.ComputeNode.get_by_host_and_nodename( 

1172 context, instance.host, instance.node) 

1173 dest_node = ( 

1174 objects.ComputeNode.get_first_node_by_host_for_old_compat( 

1175 context, host, use_slave=True)) 

1176 except exception.ComputeHostNotFound as ex: 

1177 with excutils.save_and_reraise_exception(): 

1178 self._set_vm_state_and_notify( 

1179 context, instance.uuid, 'rebuild_server', 

1180 {'vm_state': instance.vm_state, 

1181 'task_state': None}, ex, request_spec) 

1182 if source_node: 

1183 LOG.warning('Specified host %s for evacuate was not ' 

1184 'found.', host, instance=instance) 

1185 else: 

1186 LOG.warning('Source host %s and node %s for evacuate was ' 

1187 'not found.', instance.host, instance.node, 

1188 instance=instance) 

1189 

1190 try: 

1191 scheduler_utils.claim_resources_on_destination( 

1192 context, self.report_client, instance, source_node, dest_node) 

1193 except exception.NoValidHost as ex: 

1194 with excutils.save_and_reraise_exception(): 

1195 self._set_vm_state_and_notify( 

1196 context, instance.uuid, 'rebuild_server', 

1197 {'vm_state': instance.vm_state, 

1198 'task_state': None}, ex, request_spec) 

1199 LOG.warning('Specified host %s for evacuate is ' 

1200 'invalid.', host, instance=instance) 

1201 

1202 # TODO(mriedem): Make request_spec required in ComputeTaskAPI RPC v2.0. 

1203 @targets_cell 

1204 def rebuild_instance(self, context, instance, orig_image_ref, image_ref, 

1205 injected_files, new_pass, orig_sys_metadata, 

1206 bdms, recreate, on_shared_storage, 

1207 preserve_ephemeral=False, host=None, 

1208 request_spec=None, reimage_boot_volume=False, 

1209 target_state=None): 

1210 # recreate=True means the instance is being evacuated from a failed 

1211 # host to a new destination host. The 'recreate' variable name is 

1212 # confusing, so rename it to evacuate here at the top, which is simpler 

1213 # than renaming a parameter in an RPC versioned method. 

1214 evacuate = recreate 

1215 

1216 # NOTE(efried): It would be nice if this were two separate events, one 

1217 # for 'rebuild' and one for 'evacuate', but this is part of the API 

1218 # now, so it would be nontrivial to change. 

1219 with compute_utils.EventReporter(context, 'rebuild_server', 

1220 self.host, instance.uuid): 

1221 node = limits = None 

1222 

1223 try: 

1224 migration = objects.Migration.get_by_instance_and_status( 

1225 context, instance.uuid, 'accepted') 

1226 except exception.MigrationNotFoundByStatus: 

1227 LOG.debug("No migration record for the rebuild/evacuate " 

1228 "request.", instance=instance) 

1229 migration = None 

1230 

1231 # The host variable is passed in two cases: 

1232 # 1. rebuild - the instance.host is passed to rebuild on the 

1233 # same host and bypass the scheduler *unless* a new image 

1234 # was specified 

1235 # 2. evacuate with specified host and force=True - the specified 

1236 # host is passed and is meant to bypass the scheduler. 

1237 # NOTE(mriedem): This could be a lot more straight-forward if we 

1238 # had separate methods for rebuild and evacuate... 

1239 if host: 

1240 # We only create a new allocation on the specified host if 

1241 # we're doing an evacuate since that is a move operation. 

1242 if host != instance.host:

1243 # If a destination host is forced for evacuate, create 

1244 # allocations against it in Placement. 

1245 try: 

1246 self._allocate_for_evacuate_dest_host( 

1247 context, instance, host, request_spec) 

1248 except exception.AllocationUpdateFailed as ex: 

1249 with excutils.save_and_reraise_exception(): 

1250 if migration: 

1251 migration.status = 'error' 

1252 migration.save() 

1253 # NOTE(efried): It would be nice if this were two 

1254 # separate events, one for 'rebuild' and one for 

1255 # 'evacuate', but this is part of the API now, so 

1256 # it would be nontrivial to change. 

1257 self._set_vm_state_and_notify( 

1258 context, 

1259 instance.uuid, 

1260 'rebuild_server', 

1261 {'vm_state': vm_states.ERROR, 

1262 'task_state': None}, ex, request_spec) 

1263 LOG.warning('Rebuild failed: %s', 

1264 str(ex), instance=instance) 

1265 except exception.NoValidHost: 

1266 with excutils.save_and_reraise_exception(): 

1267 if migration: 

1268 migration.status = 'error' 

1269 migration.save() 

1270 else: 

1271 # At this point, the user is either: 

1272 # 

1273 # 1. Doing a rebuild on the same host (not evacuate) and 

1274 # specified a new image. 

1275 # 2. Evacuating and specified a host but are not forcing it. 

1276 # 

1277 # In either case, the API passes host=None but sets up the 

1278 # RequestSpec.requested_destination field for the specified 

1279 # host. 

1280 if evacuate: 

1281 # NOTE(sbauza): Augment the RequestSpec object by excluding 

1282 # the source host for avoiding the scheduler to pick it 

1283 request_spec.ignore_hosts = [instance.host] 

1284 # NOTE(sbauza): Force_hosts/nodes needs to be reset 

1285 # if we want to make sure that the next destination 

1286 # is not forced to be the original host 

1287 request_spec.reset_forced_destinations() 

1288 res_req, req_lvl_params = ( 

1289 self.network_api.get_requested_resource_for_instance( 

1290 context, instance.uuid) 

1291 ) 

1292 extra_specs = request_spec.flavor.extra_specs 

1293 device_profile = extra_specs.get('accel:device_profile') 

1294 res_req.extend( 

1295 cyborg.get_device_profile_request_groups( 

1296 context, device_profile) 

1297 if device_profile else []) 

1298 # NOTE(gibi): When other modules want to handle similar 

1299 # non-nova resources, this is where we have to collect all

1300 # the external resource requests in a single list and 

1301 # add them to the RequestSpec. 

1302 request_spec.requested_resources = res_req 

1303 request_spec.request_level_params = req_lvl_params 

1304 

1305 # NOTE(gibi): as PCI devices are tracked in placement we

1306 # need to generate request groups from InstancePCIRequests. 

1307 # This will append new RequestGroup objects to the 

1308 # request_spec.requested_resources list if needed 

1309 request_spec.generate_request_groups_from_pci_requests() 

1310 

1311 try: 

1312 # if this is a rebuild of the instance on the same host with

1313 # a new image.

1314 if not evacuate and orig_image_ref != image_ref: 

1315 self._validate_image_traits_for_rebuild(context, 

1316 instance, 

1317 image_ref) 

1318 self._restrict_request_spec_to_cell( 

1319 context, instance, request_spec) 

1320 request_spec.ensure_project_and_user_id(instance) 

1321 request_spec.ensure_network_information(instance) 

1322 compute_utils.heal_reqspec_is_bfv( 

1323 context, request_spec, instance) 

1324 

1325 host_lists = self._schedule_instances(context, 

1326 request_spec, [instance.uuid], 

1327 return_alternates=False) 

1328 host_list = host_lists[0] 

1329 selection = host_list[0] 

1330 host, node, limits = (selection.service_host, 

1331 selection.nodename, selection.limits) 

1332 

1333 if evacuate:

1334 scheduler_utils.fill_provider_mapping( 

1335 request_spec, selection) 

1336 

1337 except (exception.NoValidHost, 

1338 exception.UnsupportedPolicyException, 

1339 exception.AllocationUpdateFailed, 

1340 # the next two can come from fill_provider_mapping and 

1341 # signal a software error.

1342 NotImplementedError, 

1343 ValueError) as ex: 

1344 if migration: 

1345 migration.status = 'error' 

1346 migration.save() 

1347 # Rollback the image_ref if a new one was provided (this 

1348 # only happens in the rebuild case, not evacuate). 

1349 if orig_image_ref and orig_image_ref != image_ref: 1349 ↛ 1352  line 1349 didn't jump to line 1352 because the condition on line 1349 was always true

1350 instance.image_ref = orig_image_ref 

1351 instance.save() 

1352 with excutils.save_and_reraise_exception(): 

1353 # NOTE(efried): It would be nice if this were two 

1354 # separate events, one for 'rebuild' and one for 

1355 # 'evacuate', but this is part of the API now, so it 

1356 # would be nontrivial to change. 

1357 self._set_vm_state_and_notify(context, instance.uuid, 

1358 'rebuild_server', 

1359 {'vm_state': vm_states.ERROR, 

1360 'task_state': None}, ex, request_spec) 

1361 LOG.warning('Rebuild failed: %s', 

1362 str(ex), instance=instance) 

1363 

1364 compute_utils.notify_about_instance_usage( 

1365 self.notifier, context, instance, "rebuild.scheduled") 

1366 compute_utils.notify_about_instance_rebuild( 

1367 context, instance, host, 

1368 action=fields.NotificationAction.REBUILD_SCHEDULED, 

1369 source=fields.NotificationSource.CONDUCTOR) 

1370 

1371 instance.availability_zone = ( 

1372 availability_zones.get_host_availability_zone( 

1373 context, host)) 

1374 accel_uuids = [] 

1375 try: 

1376 if instance.flavor.extra_specs.get('accel:device_profile'): 

1377 cyclient = cyborg.get_client(context) 

1378 if evacuate: 

1379 # NOTE(brinzhang): For evacuate operation we should 

1380 # delete the bound arqs, then re-create and bind the

1381 # arqs for the instance. 

1382 cyclient.delete_arqs_for_instance(instance.uuid) 

1383 accel_uuids = self._create_and_bind_arq_for_instance( 

1384 context, instance, node, request_spec) 

1385 else: 

1386 accel_uuids = cyclient.get_arq_uuids_for_instance( 

1387 instance) 

1388 except Exception as exc: 

1389 if isinstance(exc, exception.AcceleratorRequestBindingFailed): 1389 ↛ 1392  line 1389 didn't jump to line 1392 because the condition on line 1389 was always true

1390 cyclient = cyborg.get_client(context) 

1391 cyclient.delete_arqs_by_uuid(exc.arqs) 

1392 LOG.exception('Failed to rebuild. Reason: %s', exc) 

1393 raise exc 

1394 

1395 self.compute_rpcapi.rebuild_instance( 

1396 context, 

1397 instance=instance, 

1398 new_pass=new_pass, 

1399 injected_files=injected_files, 

1400 image_ref=image_ref, 

1401 orig_image_ref=orig_image_ref, 

1402 orig_sys_metadata=orig_sys_metadata, 

1403 bdms=bdms, 

1404 recreate=evacuate, 

1405 on_shared_storage=on_shared_storage, 

1406 preserve_ephemeral=preserve_ephemeral, 

1407 migration=migration, 

1408 host=host, 

1409 node=node, 

1410 limits=limits, 

1411 request_spec=request_spec, 

1412 accel_uuids=accel_uuids, 

1413 reimage_boot_volume=reimage_boot_volume, 

1414 target_state=target_state) 

1415 

1416 def _validate_image_traits_for_rebuild(self, context, instance, image_ref): 

1417 """Validates that the traits specified in the image can be satisfied 

1418 by the providers of the current allocations for the instance during 

1419 rebuild of the instance. If the traits cannot be 

1420 satisfied, fails the action by raising a NoValidHost exception. 

1421 

1422 :raises: NoValidHost exception in case the traits on the providers 

1423 of the allocated resources for the instance do not match 

1424 the required traits on the image. 

1425 """ 

1426 image_meta = objects.ImageMeta.from_image_ref( 

1427 context, self.image_api, image_ref) 

1428 if ('properties' not in image_meta or 1428 ↛ 1433  line 1428 didn't jump to line 1433 because the condition on line 1428 was always true

1429 'traits_required' not in image_meta.properties or not 

1430 image_meta.properties.traits_required): 

1431 return 

1432 

1433 image_traits = set(image_meta.properties.traits_required) 

1434 

1435 # check whether any of the image traits are forbidden by the flavor

1436 # traits; if so, raise an exception

1437 extra_specs = instance.flavor.extra_specs 

1438 forbidden_flavor_traits = set() 

1439 for key, val in extra_specs.items(): 

1440 if key.startswith('trait'): 

1441 # get the actual key. 

1442 prefix, parsed_key = key.split(':', 1) 

1443 if val == 'forbidden': 

1444 forbidden_flavor_traits.add(parsed_key) 

1445 

1446 forbidden_traits = image_traits & forbidden_flavor_traits 

1447 

1448 if forbidden_traits: 

1449 raise exception.NoValidHost( 

1450 reason=_("Image traits are part of forbidden " 

1451 "traits in flavor associated with the server. " 

1452 "Either specify a different image during rebuild " 

1453 "or create a new server with the specified image " 

1454 "and a compatible flavor.")) 

1455 

1456 # If image traits are present, then validate against allocations. 

1457 allocations = self.report_client.get_allocations_for_consumer( 

1458 context, instance.uuid) 

1459 instance_rp_uuids = list(allocations) 

1460 

1461 # Get provider tree for the instance. We use the uuid of the host 

1462 # on which the instance is rebuilding to get the provider tree. 

1463 compute_node = objects.ComputeNode.get_by_host_and_nodename( 

1464 context, instance.host, instance.node) 

1465 

1466 # TODO(karimull): Call with a read-only version, when available. 

1467 instance_rp_tree = ( 

1468 self.report_client.get_provider_tree_and_ensure_root( 

1469 context, compute_node.uuid)) 

1470 

1471 traits_in_instance_rps = set() 

1472 

1473 for rp_uuid in instance_rp_uuids: 

1474 traits_in_instance_rps.update( 

1475 instance_rp_tree.data(rp_uuid).traits) 

1476 

1477 missing_traits = image_traits - traits_in_instance_rps 

1478 

1479 if missing_traits: 

1480 raise exception.NoValidHost( 

1481 reason=_("Image traits cannot be " 

1482 "satisfied by the current resource providers. " 

1483 "Either specify a different image during rebuild " 

1484 "or create a new server with the specified image.")) 

1485 
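The trait validation above boils down to two set checks: the image's required traits must not intersect the flavor's forbidden traits, and they must all be offered by the resource providers the instance is currently allocated against. A minimal sketch with hypothetical trait names (not part of the module measured above):

    # Illustrative values only; the real sets come from image metadata,
    # flavor extra specs and the Placement provider tree as shown above.
    image_traits = {'HW_CPU_X86_AVX2', 'STORAGE_DISK_SSD'}
    forbidden_flavor_traits = {'STORAGE_DISK_SSD'}
    traits_in_instance_rps = {'HW_CPU_X86_AVX2'}

    if image_traits & forbidden_flavor_traits:
        print('image requires a trait the flavor forbids -> NoValidHost')
    if image_traits - traits_in_instance_rps:
        print('trait not offered by current providers -> NoValidHost')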

1486 # TODO(avolkov): move method to bdm 

1487 @staticmethod 

1488 def _volume_size(flavor, bdm): 

1489 size = bdm.get('volume_size') 

1490 # NOTE (ndipanov): inherit flavor size only for swap and ephemeral 

1491 if (size is None and bdm.get('source_type') == 'blank' and 1491 ↛ 1493  line 1491 didn't jump to line 1493 because the condition on line 1491 was never true

1492 bdm.get('destination_type') == 'local'): 

1493 if bdm.get('guest_format') == 'swap': 

1494 size = flavor.get('swap', 0) 

1495 else: 

1496 size = flavor.get('ephemeral_gb', 0) 

1497 return size 

1498 
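_volume_size only inherits a size from the flavor for blank, local BDMs (swap gets the flavor's swap size, any other blank/local BDM gets ephemeral_gb); other BDMs keep whatever volume_size they carry. A small sketch with hypothetical dict-style inputs:

    # Hypothetical flavor/BDM mappings using the same keys as the code above.
    flavor = {'swap': 1024, 'ephemeral_gb': 20}
    swap_bdm = {'volume_size': None, 'source_type': 'blank',
                'destination_type': 'local', 'guest_format': 'swap'}
    eph_bdm = {'volume_size': None, 'source_type': 'blank',
               'destination_type': 'local', 'guest_format': None}
    # Per the logic above, swap_bdm resolves to 1024 and eph_bdm to 20;
    # a BDM with an explicit volume_size keeps that value.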

1499 def _create_block_device_mapping(self, cell, flavor, instance_uuid, 

1500 block_device_mapping): 

1501 """Create the BlockDeviceMapping objects in the db. 

1502 

1503 This method makes a copy of the list in order to avoid using the same 

1504 id field in case this is called for multiple instances. 

1505 """ 

1506 LOG.debug("block_device_mapping %s", list(block_device_mapping), 

1507 instance_uuid=instance_uuid) 

1508 instance_block_device_mapping = copy.deepcopy(block_device_mapping) 

1509 for bdm in instance_block_device_mapping: 

1510 bdm.volume_size = self._volume_size(flavor, bdm) 

1511 bdm.instance_uuid = instance_uuid 

1512 with obj_target_cell(bdm, cell): 

1513 bdm.update_or_create() 

1514 return instance_block_device_mapping 

1515 
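The deepcopy above matters because the same block_device_mapping list can be reused for several instances in a multi-create request; without the copy, the instance_uuid (and db id) written for the first instance would leak into the next. A generic illustration of the hazard being avoided:

    import copy

    template = [{'instance_uuid': None, 'volume_size': None}]

    per_instance = copy.deepcopy(template)
    per_instance[0]['instance_uuid'] = 'uuid-1'

    # The shared template stays clean for the next instance.
    assert template[0]['instance_uuid'] is None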

1516 def _create_tags(self, context, instance_uuid, tags): 

1517 """Create the Tags objects in the db.""" 

1518 if tags: 

1519 tag_list = [tag.tag for tag in tags] 

1520 instance_tags = objects.TagList.create( 

1521 context, instance_uuid, tag_list) 

1522 return instance_tags 

1523 else: 

1524 return tags 

1525 

1526 def _create_instance_action_for_cell0(self, context, instance, exc): 

1527 """Create a failed "create" instance action for the instance in cell0. 

1528 

1529 :param context: nova auth RequestContext targeted at cell0 

1530 :param instance: Instance object being buried in cell0 

1531 :param exc: Exception that occurred which resulted in burial 

1532 """ 

1533 # First create the action record. 

1534 objects.InstanceAction.action_start( 

1535 context, instance.uuid, instance_actions.CREATE, want_result=False) 

1536 # Now create an event for that action record. 

1537 event_name = 'conductor_schedule_and_build_instances' 

1538 objects.InstanceActionEvent.event_start( 

1539 context, instance.uuid, event_name, want_result=False, 

1540 host=self.host) 

1541 # And finish the event with the exception. Note that we expect this 

1542 # method to be called from _bury_in_cell0 which is called from within 

1543 # an exception handler so sys.exc_info should return values but if not 

1544 # it's not the end of the world - this is best effort. 

1545 objects.InstanceActionEvent.event_finish_with_failure( 

1546 context, instance.uuid, event_name, exc_val=exc, 

1547 exc_tb=sys.exc_info()[2], want_result=False) 

1548 

1549 def _bury_in_cell0(self, context, request_spec, exc, 

1550 build_requests=None, instances=None, 

1551 block_device_mapping=None, 

1552 tags=None): 

1553 """Ensure all provided build_requests and instances end up in cell0. 

1554 

1555 Cell0 is the fake cell we schedule dead instances to when we can't 

1556 schedule them somewhere real. Requests that don't yet have instances 

1557 will get a new instance, created in cell0. Instances that have not yet 

1558 been created will be created in cell0. All build requests are destroyed 

1559 after we're done. Failure to delete a build request will trigger the 

1560 instance deletion, just like the happy path in 

1561 schedule_and_build_instances() below. 

1562 """ 

1563 try: 

1564 cell0 = objects.CellMapping.get_by_uuid( 

1565 context, objects.CellMapping.CELL0_UUID) 

1566 except exception.CellMappingNotFound: 

1567 # Not yet setup for cellsv2. Instances will need to be written 

1568 # to the configured database. This will become a deployment 

1569 # error in Ocata. 

1570 LOG.error('No cell mapping found for cell0 while ' 

1571 'trying to record scheduling failure. ' 

1572 'Setup is incomplete.') 

1573 return 

1574 

1575 build_requests = build_requests or [] 

1576 instances = instances or [] 

1577 instances_by_uuid = {inst.uuid: inst for inst in instances} 

1578 for build_request in build_requests: 

1579 if build_request.instance_uuid not in instances_by_uuid: 

1580 # This is an instance object with no matching db entry. 

1581 instance = build_request.get_new_instance(context) 

1582 instances_by_uuid[instance.uuid] = instance 

1583 

1584 updates = {'vm_state': vm_states.ERROR, 'task_state': None} 

1585 for instance in instances_by_uuid.values(): 

1586 

1587 inst_mapping = None 

1588 try: 

1589 # We don't need the cell0-targeted context here because the 

1590 # instance mapping is in the API DB. 

1591 inst_mapping = \ 

1592 objects.InstanceMapping.get_by_instance_uuid( 

1593 context, instance.uuid) 

1594 except exception.InstanceMappingNotFound: 

1595 # The API created the instance mapping record so it should 

1596 # definitely be here. Log an error but continue to create the 

1597 # instance in the cell0 database. 

1598 LOG.error('While burying instance in cell0, no instance ' 

1599 'mapping was found.', instance=instance) 

1600 

1601 # Perform a final sanity check that the instance is not mapped 

1602 # to some other cell already because of maybe some crazy 

1603 # clustered message queue weirdness. 

1604 if inst_mapping and inst_mapping.cell_mapping is not None: 

1605 LOG.error('When attempting to bury instance in cell0, the ' 

1606 'instance is already mapped to cell %s. Ignoring ' 

1607 'bury in cell0 attempt.', 

1608 inst_mapping.cell_mapping.identity, 

1609 instance=instance) 

1610 continue 

1611 

1612 with obj_target_cell(instance, cell0) as cctxt: 

1613 instance.create() 

1614 if inst_mapping: 

1615 inst_mapping.cell_mapping = cell0 

1616 inst_mapping.save() 

1617 

1618 # Record an instance action with a failed event. 

1619 self._create_instance_action_for_cell0( 

1620 cctxt, instance, exc) 

1621 

1622 # NOTE(mnaser): In order to properly clean-up volumes after 

1623 # being buried in cell0, we need to store BDMs. 

1624 if block_device_mapping: 

1625 self._create_block_device_mapping( 

1626 cell0, instance.flavor, instance.uuid, 

1627 block_device_mapping) 

1628 

1629 self._create_tags(cctxt, instance.uuid, tags) 

1630 

1631 # Use the context targeted to cell0 here since the instance is 

1632 # now in cell0. 

1633 self._set_vm_state_and_notify( 

1634 cctxt, instance.uuid, 'build_instances', updates, 

1635 exc, request_spec) 

1636 

1637 for build_request in build_requests: 

1638 try: 

1639 build_request.destroy() 

1640 except exception.BuildRequestNotFound: 

1641 # Instance was deleted before we finished scheduling 

1642 inst = instances_by_uuid[build_request.instance_uuid] 

1643 with obj_target_cell(inst, cell0): 

1644 inst.destroy() 

1645 

1646 def schedule_and_build_instances(self, context, build_requests, 

1647 request_specs, image, 

1648 admin_password, injected_files, 

1649 requested_networks, block_device_mapping, 

1650 tags=None): 

1651 # Add all the UUIDs for the instances 

1652 instance_uuids = [spec.instance_uuid for spec in request_specs] 

1653 try: 

1654 host_lists = self._schedule_instances(context, request_specs[0], 

1655 instance_uuids, return_alternates=True) 

1656 except Exception as exc: 

1657 LOG.exception('Failed to schedule instances') 

1658 self._bury_in_cell0(context, request_specs[0], exc, 

1659 build_requests=build_requests, 

1660 block_device_mapping=block_device_mapping, 

1661 tags=tags) 

1662 return 

1663 

1664 host_mapping_cache = {} 

1665 cell_mapping_cache = {} 

1666 instances = [] 

1667 host_az = {} # host=az cache to optimize multi-create 

1668 

1669 for (build_request, request_spec, host_list) in zip( 

1670 build_requests, request_specs, host_lists): 

1671 instance = build_request.get_new_instance(context) 

1672 # host_list is a list of one or more Selection objects, the first 

1673 # of which has been selected and its resources claimed. 

1674 host = host_list[0] 

1675 # Convert host from the scheduler into a cell record 

1676 if host.service_host not in host_mapping_cache: 

1677 try: 

1678 host_mapping = objects.HostMapping.get_by_host( 

1679 context, host.service_host) 

1680 host_mapping_cache[host.service_host] = host_mapping 

1681 except exception.HostMappingNotFound as exc: 

1682 LOG.error('No host-to-cell mapping found for selected ' 

1683 'host %(host)s. Setup is incomplete.', 

1684 {'host': host.service_host}) 

1685 self._bury_in_cell0( 

1686 context, request_spec, exc, 

1687 build_requests=[build_request], instances=[instance], 

1688 block_device_mapping=block_device_mapping, 

1689 tags=tags) 

1690 # This is a placeholder in case the quota recheck fails. 

1691 instances.append(None) 

1692 continue 

1693 else: 

1694 host_mapping = host_mapping_cache[host.service_host] 

1695 

1696 cell = host_mapping.cell_mapping 

1697 

1698 # Before we create the instance, let's make one final check that 

1699 # the build request is still around and wasn't deleted by the user 

1700 # already. 

1701 try: 

1702 objects.BuildRequest.get_by_instance_uuid( 

1703 context, instance.uuid) 

1704 except exception.BuildRequestNotFound: 

1705 # the build request is gone so we're done for this instance 

1706 LOG.debug('While scheduling instance, the build request ' 

1707 'was already deleted.', instance=instance) 

1708 # This is a placeholder in case the quota recheck fails. 

1709 instances.append(None) 

1710 # If the build request was deleted and the instance is not 

1711 # going to be created, there is no point in leaving an orphan

1712 # instance mapping so delete it. 

1713 try: 

1714 im = objects.InstanceMapping.get_by_instance_uuid( 

1715 context, instance.uuid) 

1716 im.destroy() 

1717 except exception.InstanceMappingNotFound: 

1718 pass 

1719 self.report_client.delete_allocation_for_instance( 

1720 context, instance.uuid, force=True) 

1721 continue 

1722 else: 

1723 if host.service_host not in host_az: 

1724 host_az[host.service_host] = ( 

1725 availability_zones.get_host_availability_zone( 

1726 context, host.service_host)) 

1727 instance.availability_zone = host_az[host.service_host] 

1728 with obj_target_cell(instance, cell): 

1729 instance.create() 

1730 instances.append(instance) 

1731 cell_mapping_cache[instance.uuid] = cell 

1732 

1733 # NOTE(melwitt): We recheck the quota after allocating the 

1734 # resources to prevent users from allocating more resources 

1735 # than their allowed quota in the event of a race. This is 

1736 # configurable because it can be expensive if strict quota 

1737 # limits are not required in a deployment. 

1738 try: 

1739 self._recheck_quota(context, instance.flavor, request_specs[0], 

1740 len(build_requests), project_id=instance.project_id, 

1741 user_id=instance.user_id 

1742 ) 

1743 except (exception.TooManyInstances, 

1744 limit_exceptions.ProjectOverLimit) as exc: 

1745 with excutils.save_and_reraise_exception(): 

1746 self._cleanup_build_artifacts( 

1747 context, exc, instances, build_requests, request_specs, 

1748 block_device_mapping, tags, cell_mapping_cache) 

1749 

1750 zipped = zip(build_requests, request_specs, host_lists, instances) 

1751 for (build_request, request_spec, host_list, instance) in zipped: 

1752 if instance is None: 

1753 # Skip placeholders that were buried in cell0 or had their 

1754 # build requests deleted by the user before instance create. 

1755 continue 

1756 cell = cell_mapping_cache[instance.uuid] 

1757 # host_list is a list of one or more Selection objects, the first 

1758 # of which has been selected and its resources claimed. 

1759 host = host_list.pop(0) 

1760 alts = [(alt.service_host, alt.nodename) for alt in host_list] 

1761 LOG.debug("Selected host: %s; Selected node: %s; Alternates: %s", 

1762 host.service_host, host.nodename, alts, instance=instance) 

1763 filter_props = request_spec.to_legacy_filter_properties_dict() 

1764 scheduler_utils.populate_retry(filter_props, instance.uuid) 

1765 scheduler_utils.populate_filter_properties(filter_props, 

1766 host) 

1767 

1768 # Now that we have a selected host (which has claimed resource 

1769 # allocations in the scheduler) for this instance, we may need to 

1770 # map allocations to resource providers in the request spec. 

1771 try: 

1772 scheduler_utils.fill_provider_mapping(request_spec, host) 

1773 except Exception as exc: 

1774 # If anything failed here we need to cleanup and bail out. 

1775 with excutils.save_and_reraise_exception(): 

1776 self._cleanup_build_artifacts( 

1777 context, exc, instances, build_requests, request_specs, 

1778 block_device_mapping, tags, cell_mapping_cache) 

1779 

1780 # TODO(melwitt): Maybe we should set_target_cell on the contexts 

1781 # once we map to a cell, and remove these separate with statements. 

1782 with obj_target_cell(instance, cell) as cctxt: 

1783 # send a state update notification for the initial create to 

1784 # show it going from non-existent to BUILDING 

1785 # This can lazy-load attributes on instance. 

1786 notifications.send_update_with_states(cctxt, instance, None, 

1787 vm_states.BUILDING, None, None, service="conductor") 

1788 objects.InstanceAction.action_start( 

1789 cctxt, instance.uuid, instance_actions.CREATE, 

1790 want_result=False) 

1791 instance_bdms = self._create_block_device_mapping( 

1792 cell, instance.flavor, instance.uuid, block_device_mapping) 

1793 instance_tags = self._create_tags(cctxt, instance.uuid, tags) 

1794 

1795 # TODO(Kevin Zheng): clean this up once instance.create() handles 

1796 # tags; we do this so the instance.create notification in 

1797 # build_and_run_instance in nova-compute doesn't lazy-load tags 

1798 instance.tags = instance_tags if instance_tags \ 

1799 else objects.TagList() 

1800 

1801 # Update mapping for instance. 

1802 self._map_instance_to_cell(context, instance, cell) 

1803 

1804 if not self._delete_build_request( 

1805 context, build_request, instance, cell, instance_bdms, 

1806 instance_tags): 

1807 # The build request was deleted before/during scheduling so 

1808 # the instance is gone and we don't have anything to build for 

1809 # this one. 

1810 continue 

1811 

1812 try: 

1813 accel_uuids = self._create_and_bind_arq_for_instance( 

1814 context, instance, host.nodename, request_spec, 

1815 requested_networks) 

1816 except Exception as exc: 

1817 with excutils.save_and_reraise_exception(): 

1818 self._cleanup_build_artifacts( 

1819 context, exc, instances, build_requests, request_specs, 

1820 block_device_mapping, tags, cell_mapping_cache) 

1821 

1822 # NOTE(danms): Compute RPC expects security group names or ids 

1823 # not objects, so convert this to a list of names until we can 

1824 # pass the objects. 

1825 legacy_secgroups = [s.identifier 

1826 for s in request_spec.security_groups] 

1827 with obj_target_cell(instance, cell) as cctxt: 

1828 self.compute_rpcapi.build_and_run_instance( 

1829 cctxt, instance=instance, image=image, 

1830 request_spec=request_spec, 

1831 filter_properties=filter_props, 

1832 admin_password=admin_password, 

1833 injected_files=injected_files, 

1834 requested_networks=requested_networks, 

1835 security_groups=legacy_secgroups, 

1836 block_device_mapping=instance_bdms, 

1837 host=host.service_host, node=host.nodename, 

1838 limits=host.limits, host_list=host_list, 

1839 accel_uuids=accel_uuids) 

1840 
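The quota handling in schedule_and_build_instances follows a count-then-recheck pattern: instances are created optimistically, usage is re-counted, and if a concurrent request pushed the project over its limit the loser cleans up its artifacts and re-raises. A generic sketch of the pattern (not the nova quota API):

    def recheck_quota(counted_usage, limit):
        # Called *after* the records are created, so counted_usage already
        # includes this request; on failure the caller rolls back what it
        # built (see _cleanup_build_artifacts above).
        if counted_usage > limit:
            raise RuntimeError('over quota after recheck')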

1841 def _create_and_bind_arqs( 

1842 self, cyclient, instance_uuid, extra_specs, 

1843 hostname, resource_provider_mapping): 

1844 """Create ARQs comes from extra specs, determine their RPs. 

1845 

1846 The binding is asynchronous; Cyborg will notify on completion. 

1847 The notification will be handled in the compute manager. 

1848 """ 

1849 arqs = [] 

1850 bindings = {} 

1851 dp_name = extra_specs.get('accel:device_profile') 

1852 

1853 # profiles from request spec: Create ARQ and binding 

1854 if not dp_name: 

1855 # empty arq list and binding info 

1856 return bindings 

1857 

1858 LOG.debug('Calling Cyborg to get ARQs. dp_name=%s instance=%s', 

1859 dp_name, instance_uuid) 

1860 arqs = cyclient.create_arqs_and_match_resource_providers( 

1861 dp_name, resource_provider_mapping) 

1862 LOG.debug('Got ARQs with resource provider mapping %s', arqs) 

1863 bindings = { 

1864 arq['uuid']: 

1865 {"hostname": hostname, 

1866 "device_rp_uuid": arq['device_rp_uuid'], 

1867 "instance_uuid": instance_uuid 

1868 } 

1869 for arq in arqs} 

1870 

1871 return bindings 

1872 
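For reference, the bindings mapping returned above (and by _create_arqs_for_ports below) keys each ARQ uuid to the data Cyborg needs for the asynchronous bind; the values here are purely hypothetical:

    bindings = {
        '9d3c87a1-0000-0000-0000-000000000000': {   # ARQ uuid (made up)
            'hostname': 'compute-1',
            'device_rp_uuid': 'a1b2c3d4-0000-0000-0000-000000000000',
            'instance_uuid': '42f0e1aa-0000-0000-0000-000000000000',
        },
    }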

1873 def _create_arqs_for_ports(self, cyclient, instance_uuid, 

1874 requested_networks, 

1875 hostname, resource_provider_mapping): 

1876 """Create ARQs for port with backend device profile. 

1877 

1878 The binding is asynchronous; Cyborg will notify on completion. 

1879 The notification will be handled in the compute manager. 

1880 """ 

1881 bindings = {} 

1882 

1883 for request_net in requested_networks: 

1884 if request_net.port_id and request_net.device_profile: 1884 ↛ 1883  line 1884 didn't jump to line 1883 because the condition on line 1884 was always true

1885 device_profile = request_net.device_profile 

1886 # the port doesn't support multiple devices 

1887 arqs = cyclient.create_arqs(device_profile) 

1888 if len(arqs) > 1: 

1889 raise exception.AcceleratorRequestOpFailed( 

1890 op=_('create'), 

1891 msg='the port does not support multiple devices.') 

1892 arq = arqs[0] 

1893 

1894 LOG.debug("Create ARQ %s for port %s of instance %s", 

1895 arq["uuid"], request_net.port_id, instance_uuid) 

1896 request_net.arq_uuid = arq["uuid"] 

1897 

1898 rp_uuid = cyclient.get_arq_device_rp_uuid( 

1899 arq, 

1900 resource_provider_mapping, 

1901 request_net.port_id) 

1902 

1903 arq_binding = {request_net.arq_uuid: 

1904 {"hostname": hostname, 

1905 "device_rp_uuid": rp_uuid, 

1906 "instance_uuid": instance_uuid} 

1907 } 

1908 LOG.debug("ARQ %s binding: %s", request_net.arq_uuid, 

1909 arq_binding) 

1910 bindings.update(arq_binding) 

1911 

1912 return bindings 

1913 

1914 @staticmethod 

1915 def _map_instance_to_cell(context, instance, cell): 

1916 """Update the instance mapping to point at the given cell. 

1917 

1918 During initial scheduling, once a host and cell are selected in which

1919 to build the instance, this method is used to update the instance

1920 mapping to point at that cell.

1921 

1922 :param context: nova auth RequestContext 

1923 :param instance: Instance object being built 

1924 :param cell: CellMapping representing the cell in which the instance 

1925 was created and is being built. 

1926 :returns: InstanceMapping object that was updated. 

1927 """ 

1928 inst_mapping = objects.InstanceMapping.get_by_instance_uuid( 

1929 context, instance.uuid) 

1930 # Perform a final sanity check that the instance is not mapped 

1931 # to some other cell already because of maybe some crazy 

1932 # clustered message queue weirdness. 

1933 if inst_mapping.cell_mapping is not None: 

1934 LOG.error('During scheduling instance is already mapped to ' 

1935 'another cell: %s. This should not happen and is an ' 

1936 'indication of bigger problems. If you see this you ' 

1937 'should report it to the nova team. Overwriting ' 

1938 'the mapping to point at cell %s.', 

1939 inst_mapping.cell_mapping.identity, cell.identity, 

1940 instance=instance) 

1941 inst_mapping.cell_mapping = cell 

1942 inst_mapping.save() 

1943 return inst_mapping 

1944 

1945 def _cleanup_build_artifacts(self, context, exc, instances, build_requests, 

1946 request_specs, block_device_mappings, tags, 

1947 cell_mapping_cache): 

1948 for (instance, build_request, request_spec) in zip( 

1949 instances, build_requests, request_specs): 

1950 # Skip placeholders that were buried in cell0 or had their 

1951 # build requests deleted by the user before instance create. 

1952 if instance is None: 1952 ↛ 1953  line 1952 didn't jump to line 1953 because the condition on line 1952 was never true

1953 continue 

1954 updates = {'vm_state': vm_states.ERROR, 'task_state': None} 

1955 cell = cell_mapping_cache[instance.uuid] 

1956 with try_target_cell(context, cell) as cctxt: 

1957 self._set_vm_state_and_notify(cctxt, instance.uuid, 

1958 'build_instances', updates, exc, 

1959 request_spec) 

1960 

1961 # In order to properly clean-up volumes when deleting a server in 

1962 # ERROR status with no host, we need to store BDMs in the same 

1963 # cell. 

1964 if block_device_mappings: 1964 ↛ 1972  line 1964 didn't jump to line 1972 because the condition on line 1964 was always true

1965 self._create_block_device_mapping( 

1966 cell, instance.flavor, instance.uuid, 

1967 block_device_mappings) 

1968 

1969 # Like BDMs, the server tags provided by the user when creating the 

1970 # server should be persisted in the same cell so they can be shown 

1971 # from the API. 

1972 if tags: 1972 ↛ 1983  line 1972 didn't jump to line 1983 because the condition on line 1972 was always true

1973 with nova_context.target_cell(context, cell) as cctxt: 

1974 self._create_tags(cctxt, instance.uuid, tags) 

1975 

1976 # NOTE(mdbooth): To avoid an incomplete instance record being 

1977 # returned by the API, the instance mapping must be 

1978 # created after the instance record is complete in 

1979 # the cell, and before the build request is 

1980 # destroyed. 

1981 # TODO(mnaser): The cell mapping should already be populated by 

1982 # this point to avoid setting it below here. 

1983 inst_mapping = objects.InstanceMapping.get_by_instance_uuid( 

1984 context, instance.uuid) 

1985 inst_mapping.cell_mapping = cell 

1986 inst_mapping.save() 

1987 

1988 # Be paranoid about artifacts being deleted underneath us. 

1989 try: 

1990 build_request.destroy() 

1991 except exception.BuildRequestNotFound: 

1992 pass 

1993 try: 

1994 request_spec.destroy() 

1995 except exception.RequestSpecNotFound: 

1996 pass 

1997 

1998 def _delete_build_request(self, context, build_request, instance, cell, 

1999 instance_bdms, instance_tags): 

2000 """Delete a build request after creating the instance in the cell. 

2001 

2002 This method handles cleaning up the instance in case the build request 

2003 is already deleted by the time we try to delete it. 

2004 

2005 :param context: the context of the request being handled 

2006 :type context: nova.context.RequestContext 

2007 :param build_request: the build request to delete 

2008 :type build_request: nova.objects.BuildRequest 

2009 :param instance: the instance created from the build_request 

2010 :type instance: nova.objects.Instance 

2011 :param cell: the cell in which the instance was created 

2012 :type cell: nova.objects.CellMapping 

2013 :param instance_bdms: list of block device mappings for the instance 

2014 :type instance_bdms: nova.objects.BlockDeviceMappingList 

2015 :param instance_tags: list of tags for the instance 

2016 :type instance_tags: nova.objects.TagList 

2017 :returns: True if the build request was successfully deleted, False if 

2018 the build request was already deleted and the instance is now gone. 

2019 """ 

2020 try: 

2021 build_request.destroy() 

2022 except exception.BuildRequestNotFound: 

2023 # This indicates an instance deletion request has been 

2024 # processed, and the build should halt here. Clean up the 

2025 # bdm, tags and instance record. 

2026 with obj_target_cell(instance, cell) as cctxt: 

2027 with compute_utils.notify_about_instance_delete( 

2028 self.notifier, cctxt, instance, 

2029 source=fields.NotificationSource.CONDUCTOR): 

2030 try: 

2031 instance.destroy() 

2032 except exception.InstanceNotFound: 

2033 pass 

2034 except exception.ObjectActionError: 

2035 # NOTE(melwitt): Instance became scheduled during 

2036 # the destroy, "host changed". Refresh and re-destroy. 

2037 try: 

2038 instance.refresh() 

2039 instance.destroy() 

2040 except exception.InstanceNotFound: 

2041 pass 

2042 for bdm in instance_bdms: 

2043 with obj_target_cell(bdm, cell): 

2044 try: 

2045 bdm.destroy() 

2046 except exception.ObjectActionError: 

2047 pass 

2048 if instance_tags: 2048 ↛ 2054  line 2048 didn't jump to line 2054 because the condition on line 2048 was always true

2049 with try_target_cell(context, cell) as target_ctxt: 

2050 try: 

2051 objects.TagList.destroy(target_ctxt, instance.uuid) 

2052 except exception.InstanceNotFound: 

2053 pass 

2054 return False 

2055 return True 

2056 

2057 def cache_images(self, context, aggregate, image_ids): 

2058 """Cache a set of images on the set of hosts in an aggregate. 

2059 

2060 :param context: The RequestContext 

2061 :param aggregate: The Aggregate object from the request to constrain 

2062 the host list 

2063 :param image_ids: The IDs of the images to cache

2064 """ 

2065 

2066 # TODO(mriedem): Consider including the list of images in the 

2067 # notification payload. 

2068 compute_utils.notify_about_aggregate_action( 

2069 context, aggregate, 

2070 fields.NotificationAction.IMAGE_CACHE, 

2071 fields.NotificationPhase.START) 

2072 

2073 clock = timeutils.StopWatch() 

2074 threads = CONF.image_cache.precache_concurrency 

2075 fetch_pool = eventlet.GreenPool(size=threads) 

2076 

2077 hosts_by_cell = {} 

2078 cells_by_uuid = {} 

2079 # TODO(danms): Make this a much more efficient bulk query 

2080 for hostname in aggregate.hosts: 

2081 hmap = objects.HostMapping.get_by_host(context, hostname) 

2082 cells_by_uuid.setdefault(hmap.cell_mapping.uuid, hmap.cell_mapping) 

2083 hosts_by_cell.setdefault(hmap.cell_mapping.uuid, []) 

2084 hosts_by_cell[hmap.cell_mapping.uuid].append(hostname) 

2085 

2086 LOG.info('Preparing to request pre-caching of image(s) %(image_ids)s ' 

2087 'on %(hosts)i hosts across %(cells)i cells.', 

2088 {'image_ids': ','.join(image_ids), 

2089 'hosts': len(aggregate.hosts), 

2090 'cells': len(hosts_by_cell)}) 

2091 clock.start() 

2092 

2093 stats = collections.defaultdict(lambda: (0, 0, 0, 0)) 

2094 failed_images = collections.defaultdict(int) 

2095 down_hosts = set() 

2096 host_stats = { 

2097 'completed': 0, 

2098 'total': len(aggregate.hosts), 

2099 } 

2100 

2101 def host_completed(context, host, result): 

2102 for image_id, status in result.items(): 

2103 cached, existing, error, unsupported = stats[image_id] 

2104 if status == 'error': 

2105 failed_images[image_id] += 1 

2106 error += 1 

2107 elif status == 'cached': 2107 ↛ 2108  line 2107 didn't jump to line 2108 because the condition on line 2107 was never true

2108 cached += 1 

2109 elif status == 'existing': 2109 ↛ 2110  line 2109 didn't jump to line 2110 because the condition on line 2109 was never true

2110 existing += 1 

2111 elif status == 'unsupported': 

2112 unsupported += 1 

2113 stats[image_id] = (cached, existing, error, unsupported) 

2114 

2115 host_stats['completed'] += 1 

2116 compute_utils.notify_about_aggregate_cache(context, aggregate, 

2117 host, result, 

2118 host_stats['completed'], 

2119 host_stats['total']) 

2120 

2121 def wrap_cache_images(ctxt, host, image_ids): 

2122 result = self.compute_rpcapi.cache_images( 

2123 ctxt, 

2124 host=host, 

2125 image_ids=image_ids) 

2126 host_completed(context, host, result) 

2127 

2128 def skipped_host(context, host, image_ids): 

2129 result = {image: 'skipped' for image in image_ids} 

2130 host_completed(context, host, result) 

2131 

2132 for cell_uuid, hosts in hosts_by_cell.items(): 

2133 cell = cells_by_uuid[cell_uuid] 

2134 with nova_context.target_cell(context, cell) as target_ctxt: 

2135 for host in hosts: 

2136 service = objects.Service.get_by_compute_host(target_ctxt, 

2137 host) 

2138 if not self.servicegroup_api.service_is_up(service): 

2139 down_hosts.add(host) 

2140 LOG.info( 

2141 'Skipping image pre-cache request to compute ' 

2142 '%(host)r because it is not up', 

2143 {'host': host}) 

2144 skipped_host(target_ctxt, host, image_ids) 

2145 continue 

2146 

2147 utils.pass_context(fetch_pool.spawn_n, wrap_cache_images, 

2148 target_ctxt, host, image_ids) 

2149 

2150 # Wait until all those things finish 

2151 fetch_pool.waitall() 

2152 

2153 overall_stats = {'cached': 0, 'existing': 0, 'error': 0, 

2154 'unsupported': 0} 

2155 for cached, existing, error, unsupported in stats.values(): 

2156 overall_stats['cached'] += cached 

2157 overall_stats['existing'] += existing 

2158 overall_stats['error'] += error 

2159 overall_stats['unsupported'] += unsupported 

2160 

2161 clock.stop() 

2162 LOG.info('Image pre-cache operation for image(s) %(image_ids)s ' 

2163 'completed in %(time).2f seconds; ' 

2164 '%(cached)i cached, %(existing)i existing, %(error)i errors, ' 

2165 '%(unsupported)i unsupported, %(skipped)i skipped (down) ' 

2166 'hosts', 

2167 {'image_ids': ','.join(image_ids), 

2168 'time': clock.elapsed(), 

2169 'cached': overall_stats['cached'], 

2170 'existing': overall_stats['existing'], 

2171 'error': overall_stats['error'], 

2172 'unsupported': overall_stats['unsupported'], 

2173 'skipped': len(down_hosts), 

2174 }) 

2175 # Log error'd images specifically at warning level 

2176 for image_id, fails in failed_images.items(): 

2177 LOG.warning('Image pre-cache operation for image %(image)s ' 

2178 'failed %(fails)i times', 

2179 {'image': image_id, 

2180 'fails': fails}) 

2181 

2182 compute_utils.notify_about_aggregate_action( 

2183 context, aggregate, 

2184 fields.NotificationAction.IMAGE_CACHE, 

2185 fields.NotificationPhase.END) 

2186 
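The per-image stats collected by host_completed above are 4-tuples of (cached, existing, error, unsupported) counts, and the final log line simply sums each column across images. A small sketch with made-up numbers:

    import collections

    stats = collections.defaultdict(lambda: (0, 0, 0, 0))
    stats['image-a'] = (2, 1, 0, 0)   # hypothetical per-image counts
    stats['image-b'] = (0, 3, 1, 0)

    cached, existing, error, unsupported = (
        sum(col) for col in zip(*stats.values()))
    # -> 2 cached, 4 existing, 1 error, 0 unsupported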

2187 @targets_cell 

2188 @wrap_instance_event(prefix='conductor') 

2189 def confirm_snapshot_based_resize(self, context, instance, migration): 

2190 """Executes the ConfirmResizeTask 

2191 

2192 :param context: nova auth request context targeted at the target cell 

2193 :param instance: Instance object in "resized" status from the target 

2194 cell 

2195 :param migration: Migration object from the target cell for the resize 

2196 operation expected to have status "confirming" 

2197 """ 

2198 task = cross_cell_migrate.ConfirmResizeTask( 

2199 context, instance, migration, self.notifier, self.compute_rpcapi) 

2200 task.execute() 

2201 

2202 @targets_cell 

2203 # NOTE(mriedem): Upon successful completion of RevertResizeTask the 

2204 # instance is hard-deleted, along with its instance action record(s), from 

2205 # the target cell database so EventReporter hits InstanceActionNotFound on 

2206 # __exit__. Pass graceful_exit=True to avoid an ugly traceback. 

2207 @wrap_instance_event(prefix='conductor', graceful_exit=True) 

2208 def revert_snapshot_based_resize(self, context, instance, migration): 

2209 """Executes the RevertResizeTask 

2210 

2211 :param context: nova auth request context targeted at the target cell 

2212 :param instance: Instance object in "resized" status from the target 

2213 cell 

2214 :param migration: Migration object from the target cell for the resize 

2215 operation expected to have status "reverting" 

2216 """ 

2217 task = cross_cell_migrate.RevertResizeTask( 

2218 context, instance, migration, self.notifier, self.compute_rpcapi) 

2219 task.execute()