Coverage for nova/conductor/manager.py: 89%
925 statements
coverage.py v7.6.12, created at 2025-04-24 11:16 +0000
1# Copyright 2013 IBM Corp.
2#
3# Licensed under the Apache License, Version 2.0 (the "License"); you may
4# not use this file except in compliance with the License. You may obtain
5# a copy of the License at
6#
7# http://www.apache.org/licenses/LICENSE-2.0
8#
9# Unless required by applicable law or agreed to in writing, software
10# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
11# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
12# License for the specific language governing permissions and limitations
13# under the License.
15"""Handles database requests from other nova services."""
17import collections
18import contextlib
19import copy
20import eventlet
21import functools
22import sys
23import typing as ty
25from keystoneauth1 import exceptions as ks_exc
26from oslo_config import cfg
27from oslo_db import exception as db_exc
28from oslo_limit import exception as limit_exceptions
29from oslo_log import log as logging
30import oslo_messaging as messaging
31from oslo_serialization import jsonutils
32from oslo_utils import excutils
33from oslo_utils import timeutils
34from oslo_utils import versionutils
36from nova.accelerator import cyborg
37from nova import availability_zones
38from nova.compute import instance_actions
39from nova.compute import rpcapi as compute_rpcapi
40from nova.compute import task_states
41from nova.compute import utils as compute_utils
42from nova.compute.utils import wrap_instance_event
43from nova.compute import vm_states
44from nova.conductor.tasks import cross_cell_migrate
45from nova.conductor.tasks import live_migrate
46from nova.conductor.tasks import migrate
47from nova import context as nova_context
48from nova import exception
49from nova.i18n import _
50from nova.image import glance
51from nova.limit import placement as placement_limits
52from nova.limit import utils as limit_utils
53from nova import manager
54from nova.network import neutron
55from nova import notifications
56from nova import objects
57from nova.objects import base as nova_object
58from nova.objects import fields
59from nova import profiler
60from nova import rpc
61from nova.scheduler.client import query
62from nova.scheduler.client import report
63from nova.scheduler import utils as scheduler_utils
64from nova import servicegroup
65from nova import utils
66from nova.volume import cinder
68LOG = logging.getLogger(__name__)
69CONF = cfg.CONF
72def targets_cell(fn):
73 """Wrap a method and automatically target the instance's cell.
75 This decorates a method with signature func(self, context, instance, ...)
76 and automatically targets the context with the instance's cell
77 mapping. It does this by looking up the InstanceMapping.
78 """
79 @functools.wraps(fn)
80 def wrapper(self, context, *args, **kwargs):
81 instance = kwargs.get('instance') or args[0]
82 try:
83 im = objects.InstanceMapping.get_by_instance_uuid(
84 context, instance.uuid)
85 except exception.InstanceMappingNotFound:
86 LOG.error('InstanceMapping not found, unable to target cell',
87 instance=instance)
88 except db_exc.CantStartEngineError:
89 # Check to see if we can ignore API DB connection failures
90 # because we might already be in the cell conductor.
91 with excutils.save_and_reraise_exception() as err_ctxt:
92 if CONF.api_database.connection is None:
93 err_ctxt.reraise = False
94 else:
95 LOG.debug('Targeting cell %(cell)s for conductor method %(meth)s',
96 {'cell': im.cell_mapping.identity,
97 'meth': fn.__name__})
98 # NOTE(danms): Target our context to the cell for the rest of
99 # this request, so that none of the subsequent code needs to
100 # care about it.
101 nova_context.set_target_cell(context, im.cell_mapping)
102 return fn(self, context, *args, **kwargs)
103 return wrapper
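# Illustrative usage sketch (not part of manager.py): a hypothetical
# conductor task decorated with targets_cell. Only the
# (self, context, instance, ...) signature described in the docstring
# above is assumed; the class and method names here are made up.
class _ExampleCellTasks:
    @targets_cell
    def poke_instance(self, context, instance):
        # By the time this body runs, `context` has been targeted at the
        # cell that owns `instance`, so DB reads/writes below hit that
        # cell's database.
        return instance.uuid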
106class ConductorManager(manager.Manager):
107 """Mission: Conduct things.
109 The methods in the base API for nova-conductor are various proxy operations
110 performed on behalf of the nova-compute service running on compute nodes.
111 Compute nodes are not allowed to directly access the database, so this set
112 of methods allows them to get specific work done without locally accessing
113 the database.
115 The nova-conductor service also exposes an API in the 'compute_task'
116 namespace. See the ComputeTaskManager class for details.
117 """
119 target = messaging.Target(version='3.0')
121 def __init__(self, *args, **kwargs):
122 super(ConductorManager, self).__init__(service_name='conductor',
123 *args, **kwargs)
124 self.compute_task_mgr = ComputeTaskManager()
125 self.additional_endpoints.append(self.compute_task_mgr)
127 # NOTE(hanlind): This can be removed in version 4.0 of the RPC API
128 def provider_fw_rule_get_all(self, context):
129 # NOTE(hanlind): Simulate an empty db result for compat reasons.
130 return []
132 def _object_dispatch(self, target, method, args, kwargs):
133 """Dispatch a call to an object method.
135 This ensures that object methods get called and any exception
136 that is raised gets wrapped in an ExpectedException for forwarding
137 back to the caller (without spamming the conductor logs).
138 """
139 try:
140 # NOTE(danms): Keep the getattr inside the try block since
141 # a missing method is really a client problem
142 return getattr(target, method)(*args, **kwargs)
143 except Exception:
144 raise messaging.ExpectedException()
146 def object_class_action_versions(self, context, objname, objmethod,
147 object_versions, args, kwargs):
148 objclass = nova_object.NovaObject.obj_class_from_name(
149 objname, object_versions[objname])
150 args = tuple([context] + list(args))
151 result = self._object_dispatch(objclass, objmethod, args, kwargs)
152 # NOTE(danms): The RPC layer will convert to primitives for us,
153 # but in this case, we need to honor the version the client is
154 # asking for, so we do it before returning here.
155 # NOTE(hanlind): Do not convert older than requested objects,
156 # see bug #1596119.
157 if isinstance(result, nova_object.NovaObject):
158 target_version = object_versions[objname]
159 requested_version = versionutils.convert_version_to_tuple(
160 target_version)
161 actual_version = versionutils.convert_version_to_tuple(
162 result.VERSION)
163 do_backport = requested_version < actual_version
164 other_major_version = requested_version[0] != actual_version[0]
165 if do_backport or other_major_version:
166 result = result.obj_to_primitive(
167 target_version=target_version,
168 version_manifest=object_versions)
169 return result
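# Worked example of the backport decision above, using made-up version
# numbers: a client that pinned an object at 1.5 while the local code
# produces 1.7 gets a backport because (1, 5) < (1, 7); equal or newer
# pins are returned unconverted unless the major versions differ.
_requested = versionutils.convert_version_to_tuple('1.5')   # (1, 5)
_actual = versionutils.convert_version_to_tuple('1.7')      # (1, 7)
assert _requested < _actual            # do_backport would be True
assert _requested[0] == _actual[0]     # same major, no forced conversion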
171 def object_action(self, context, objinst, objmethod, args, kwargs):
172 """Perform an action on an object."""
173 oldobj = objinst.obj_clone()
174 result = self._object_dispatch(objinst, objmethod, args, kwargs)
175 updates = dict()
176 # NOTE(danms): Diff the object with the one passed to us and
177 # generate a list of changes to forward back
178 for name, field in objinst.fields.items():
179 if not objinst.obj_attr_is_set(name): 179 ↛ 181 (line 179 didn't jump to line 181 because the condition on line 179 was never true)
180 # Avoid demand-loading anything
181 continue
182 if (not oldobj.obj_attr_is_set(name) or 182 ↛ 178 (line 182 didn't jump to line 178 because the condition on line 182 was always true)
183 getattr(oldobj, name) != getattr(objinst, name)):
184 updates[name] = field.to_primitive(objinst, name,
185 getattr(objinst, name))
186 # This is safe since a field named this would conflict with the
187 # method anyway
188 updates['obj_what_changed'] = objinst.obj_what_changed()
189 return updates, result
191 def object_backport_versions(self, context, objinst, object_versions):
192 target = object_versions[objinst.obj_name()]
193 LOG.debug('Backporting %(obj)s to %(ver)s with versions %(manifest)s',
194 {'obj': objinst.obj_name(),
195 'ver': target,
196 'manifest': ','.join(
197 ['%s=%s' % (name, ver)
198 for name, ver in object_versions.items()])})
199 return objinst.obj_to_primitive(target_version=target,
200 version_manifest=object_versions)
202 def reset(self):
203 objects.Service.clear_min_version_cache()
206@contextlib.contextmanager
207def try_target_cell(context, cell):
208 """If cell is not None call func with context.target_cell.
210 This is a method to help during the transition period. Currently
211 various mappings may not exist if a deployment has not migrated to
212 cellsv2. If there is no mapping call the func as normal, otherwise
213 call it in a target_cell context.
214 """
215 if cell: 215 ↛ 219 (line 215 didn't jump to line 219 because the condition on line 215 was always true)
216 with nova_context.target_cell(context, cell) as cell_context:
217 yield cell_context
218 else:
219 yield context
222@contextlib.contextmanager
223def obj_target_cell(obj, cell):
224 """Run with object's context set to a specific cell"""
225 with try_target_cell(obj._context, cell) as target:
226 with obj.obj_alternate_context(target):
227 yield target
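# Illustrative sketch (hypothetical helper, not part of this module):
# how obj_target_cell is typically used to run a save against a
# specific cell's database. `instance` and `cell_mapping` stand in for
# objects obtained from the API database.
def _example_save_in_cell(instance, cell_mapping):
    with obj_target_cell(instance, cell_mapping):
        # Inside the block the instance's context points at the given
        # cell, so the save lands in that cell's database.
        instance.save()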
230@profiler.trace_cls("rpc")
231class ComputeTaskManager:
232 """Namespace for compute methods.
234 This class presents an rpc API for nova-conductor under the 'compute_task'
235 namespace. The methods here are compute operations that are invoked
236 by the API service. These methods see the operation to completion, which
237 may involve coordinating activities on multiple compute nodes.
238 """
240 target = messaging.Target(namespace='compute_task', version='1.25')
242 def __init__(self):
243 self.compute_rpcapi = compute_rpcapi.ComputeAPI()
244 self.volume_api = cinder.API()
245 self.image_api = glance.API()
246 self.network_api = neutron.API()
247 self.servicegroup_api = servicegroup.API()
248 self.query_client = query.SchedulerQueryClient()
249 self.notifier = rpc.get_notifier('compute')
250 # Helps us record the host in EventReporter
251 self.host = CONF.host
253 try:
254 # Test our placement client during initialization
255 self.report_client
256 except (ks_exc.EndpointNotFound,
257 ks_exc.DiscoveryFailure,
258 ks_exc.RequestTimeout,
259 ks_exc.GatewayTimeout,
260 ks_exc.ConnectFailure) as e:
261 # Non-fatal, likely transient (although not definitely);
262 # continue startup but log the warning so that when things
263 # fail later, it will be clear why we can not do certain
264 # things.
265 LOG.warning('Unable to initialize placement client (%s); '
266 'Continuing with startup, but some operations '
267 'will not be possible.', e)
268 except (ks_exc.MissingAuthPlugin,
269 ks_exc.Unauthorized) as e:
270 # This is almost definitely fatal mis-configuration. The
271 # Unauthorized error might be transient, but it is
272 # probably reasonable to consider it fatal.
273 LOG.error('Fatal error initializing placement client; '
274 'config is incorrect or incomplete: %s', e)
275 raise
276 except Exception as e:
277 # Unknown/unexpected errors here are fatal
278 LOG.error('Fatal error initializing placement client: %s', e)
279 raise
281 @property
282 def report_client(self):
283 return report.report_client_singleton()
285 def reset(self):
286 LOG.info('Reloading compute RPC API')
287 compute_rpcapi.LAST_VERSION = None
288 self.compute_rpcapi = compute_rpcapi.ComputeAPI()
290 # TODO(tdurakov): remove `live` parameter here on compute task api RPC
291 # version bump to 2.x
292 # TODO(danms): remove the `reservations` parameter here on compute task api
293 # RPC version bump to 2.x
294 @messaging.expected_exceptions(
295 exception.NoValidHost,
296 exception.ComputeServiceUnavailable,
297 exception.ComputeHostNotFound,
298 exception.InvalidHypervisorType,
299 exception.InvalidCPUInfo,
300 exception.UnableToMigrateToSelf,
301 exception.DestinationHypervisorTooOld,
302 exception.InvalidLocalStorage,
303 exception.InvalidSharedStorage,
304 exception.HypervisorUnavailable,
305 exception.InstanceInvalidState,
306 exception.MigrationPreCheckError,
307 exception.UnsupportedPolicyException)
308 @targets_cell
309 @wrap_instance_event(prefix='conductor')
310 def migrate_server(self, context, instance, scheduler_hint, live, rebuild,
311 flavor, block_migration, disk_over_commit, reservations=None,
312 clean_shutdown=True, request_spec=None, host_list=None):
313 if instance and not isinstance(instance, nova_object.NovaObject): 313 ↛ 316 (line 313 didn't jump to line 316 because the condition on line 313 was never true)
314 # NOTE(danms): Until v2 of the RPC API, we need to tolerate
315 # old-world instance objects here
316 attrs = ['metadata', 'system_metadata', 'info_cache',
317 'security_groups']
318 instance = objects.Instance._from_db_object(
319 context, objects.Instance(), instance,
320 expected_attrs=attrs)
321 # NOTE: Remove this when we drop support for v1 of the RPC API
322 if flavor and not isinstance(flavor, objects.Flavor): 322 ↛ 325 (line 322 didn't jump to line 325 because the condition on line 322 was never true)
323 # Code downstream may expect extra_specs to be populated since it
324 # is receiving an object, so lookup the flavor to ensure this.
325 flavor = objects.Flavor.get_by_id(context, flavor['id'])
326 if live and not rebuild and not flavor:
327 self._live_migrate(context, instance, scheduler_hint,
328 block_migration, disk_over_commit, request_spec)
329 elif not live and not rebuild and flavor:
330 instance_uuid = instance.uuid
331 with compute_utils.EventReporter(context, 'cold_migrate',
332 self.host, instance_uuid):
333 self._cold_migrate(context, instance, flavor,
334 scheduler_hint['filter_properties'],
335 clean_shutdown, request_spec,
336 host_list)
337 else:
338 raise NotImplementedError()
340 @staticmethod
341 def _get_request_spec_for_cold_migrate(context, instance, flavor,
342 filter_properties, request_spec):
343 # NOTE(sbauza): If a reschedule occurs during prep_resize(), then
344 # it only provides the legacy filter_properties dict back to the
345 # conductor with no RequestSpec part of the payload for <Stein
346 # computes.
347 # TODO(mriedem): We can remove this compat code for no request spec
348 # coming to conductor in ComputeTaskAPI RPC API version 2.0
349 if not request_spec:
350 image_meta = utils.get_image_from_system_metadata(
351 instance.system_metadata)
352 # Make sure we hydrate a new RequestSpec object with the new flavor
353 # and not the nested one from the instance
354 request_spec = objects.RequestSpec.from_components(
355 context, instance.uuid, image_meta,
356 flavor, instance.numa_topology, instance.pci_requests,
357 filter_properties, None, instance.availability_zone,
358 project_id=instance.project_id, user_id=instance.user_id)
359 elif not isinstance(request_spec, objects.RequestSpec):
360 # Prior to compute RPC API 5.1 conductor would pass a legacy dict
361 # version of the request spec to compute and Stein compute
362 # could be sending that back to conductor on reschedule, so if we
363 # got a dict convert it to an object.
364 # TODO(mriedem): We can drop this compat code when we only support
365 # compute RPC API >=6.0.
366 request_spec = objects.RequestSpec.from_primitives(
367 context, request_spec, filter_properties)
368 # We don't have to set the new flavor on the request spec because
369 # if we got here it was due to a reschedule from the compute and
370 # the request spec would already have the new flavor in it from the
371 # else block below.
372 else:
373 # NOTE(sbauza): A resize means a new flavor, so we need to update the
374 # original RequestSpec object to make sure the scheduler verifies
375 # the right one and not the original flavor
376 request_spec.flavor = flavor
377 return request_spec
379 def _cold_migrate(self, context, instance, flavor, filter_properties,
380 clean_shutdown, request_spec, host_list):
381 request_spec = self._get_request_spec_for_cold_migrate(
382 context, instance, flavor, filter_properties, request_spec)
384 task = self._build_cold_migrate_task(context, instance, flavor,
385 request_spec, clean_shutdown, host_list)
386 try:
387 task.execute()
388 except exception.NoValidHost as ex:
389 vm_state = instance.vm_state
390 if not vm_state: 390 ↛ 391 (line 390 didn't jump to line 391 because the condition on line 390 was never true)
391 vm_state = vm_states.ACTIVE
392 updates = {'vm_state': vm_state, 'task_state': None}
393 self._set_vm_state_and_notify(context, instance.uuid,
394 'migrate_server',
395 updates, ex, request_spec)
397 # if the flavor IDs match, it's migrate; otherwise resize
398 if flavor.id == instance.instance_type_id:
399 msg = _("No valid host found for cold migrate")
400 else:
401 msg = _("No valid host found for resize")
402 raise exception.NoValidHost(reason=msg)
403 except exception.UnsupportedPolicyException as ex:
404 with excutils.save_and_reraise_exception():
405 vm_state = instance.vm_state
406 if not vm_state: 406 ↛ 407 (line 406 didn't jump to line 407 because the condition on line 406 was never true)
407 vm_state = vm_states.ACTIVE
408 updates = {'vm_state': vm_state, 'task_state': None}
409 self._set_vm_state_and_notify(context, instance.uuid,
410 'migrate_server',
411 updates, ex, request_spec)
412 except Exception as ex:
413 with excutils.save_and_reraise_exception():
414 # Refresh the instance so we don't overwrite vm_state changes
415 # set after we executed the task.
416 try:
417 instance.refresh()
418 # Passing vm_state is kind of silly but it's expected in
419 # set_vm_state_and_notify.
420 updates = {'vm_state': instance.vm_state,
421 'task_state': None}
422 self._set_vm_state_and_notify(context, instance.uuid,
423 'migrate_server',
424 updates, ex, request_spec)
425 except exception.InstanceNotFound:
426 # We can't send the notification because the instance is
427 # gone so just log it.
428 LOG.info('During %s the instance was deleted.',
429 'resize' if instance.instance_type_id != flavor.id
430 else 'cold migrate', instance=instance)
431 # NOTE(sbauza): Make sure we persist the new flavor in case we had
432 # a successful scheduler call if and only if nothing bad happened
433 if request_spec.obj_what_changed(): 433 ↛ exit (line 433 didn't return from function '_cold_migrate' because the condition on line 433 was always true)
434 request_spec.save()
436 def _set_vm_state_and_notify(self, context, instance_uuid, method, updates,
437 ex, request_spec):
438 scheduler_utils.set_vm_state_and_notify(
439 context, instance_uuid, 'compute_task', method, updates,
440 ex, request_spec)
442 def _cleanup_allocated_networks(
443 self, context, instance, requested_networks):
444 try:
445 # If we were told not to allocate networks let's save ourselves
446 # the trouble of calling the network API.
447 if not (requested_networks and requested_networks.no_allocate):
448 self.network_api.deallocate_for_instance(
449 context, instance, requested_networks=requested_networks)
450 except Exception:
451 LOG.exception('Failed to deallocate networks', instance=instance)
452 return
454 instance.system_metadata['network_allocated'] = 'False'
455 try:
456 instance.save()
457 except exception.InstanceNotFound:
458 # NOTE: It's possible that we're cleaning up the networks
459 # because the instance was deleted. If that's the case then this
460 # exception will be raised by instance.save()
461 pass
463 @targets_cell
464 @wrap_instance_event(prefix='conductor')
465 def live_migrate_instance(self, context, instance, scheduler_hint,
466 block_migration, disk_over_commit, request_spec):
467 self._live_migrate(context, instance, scheduler_hint,
468 block_migration, disk_over_commit, request_spec)
470 def _live_migrate(self, context, instance, scheduler_hint,
471 block_migration, disk_over_commit, request_spec):
472 destination = scheduler_hint.get("host")
474 def _set_vm_state(context, instance, ex, vm_state=None,
475 task_state=None):
476 request_spec = {'instance_properties': {
477 'uuid': instance.uuid, },
478 }
479 scheduler_utils.set_vm_state_and_notify(context,
480 instance.uuid,
481 'compute_task', 'migrate_server',
482 dict(vm_state=vm_state,
483 task_state=task_state,
484 expected_task_state=task_states.MIGRATING,),
485 ex, request_spec)
487 migration = objects.Migration(context=context.elevated())
488 migration.dest_compute = destination
489 migration.status = 'accepted'
490 migration.instance_uuid = instance.uuid
491 migration.source_compute = instance.host
492 migration.migration_type = fields.MigrationType.LIVE_MIGRATION
493 if instance.obj_attr_is_set('flavor'):
494 migration.old_instance_type_id = instance.flavor.id
495 migration.new_instance_type_id = instance.flavor.id
496 else:
497 migration.old_instance_type_id = instance.instance_type_id
498 migration.new_instance_type_id = instance.instance_type_id
499 migration.create()
501 task = self._build_live_migrate_task(context, instance, destination,
502 block_migration, disk_over_commit,
503 migration, request_spec)
504 try:
505 task.execute()
506 except (exception.NoValidHost,
507 exception.ComputeHostNotFound,
508 exception.ComputeServiceUnavailable,
509 exception.InvalidHypervisorType,
510 exception.InvalidCPUInfo,
511 exception.UnableToMigrateToSelf,
512 exception.DestinationHypervisorTooOld,
513 exception.InvalidLocalStorage,
514 exception.InvalidSharedStorage,
515 exception.HypervisorUnavailable,
516 exception.InstanceInvalidState,
517 exception.MigrationPreCheckError,
518 exception.MigrationSchedulerRPCError) as ex:
519 with excutils.save_and_reraise_exception():
520 _set_vm_state(context, instance, ex, instance.vm_state)
521 migration.status = 'error'
522 migration.save()
523 except Exception as ex:
524 LOG.error('Migration of instance %(instance_id)s to host'
525 ' %(dest)s unexpectedly failed.',
526 {'instance_id': instance.uuid, 'dest': destination},
527 exc_info=True)
528 # Reset the task state to None to indicate completion of
529 # the operation as it is done in case of known exceptions.
530 _set_vm_state(context, instance, ex, vm_states.ERROR,
531 task_state=None)
532 migration.status = 'error'
533 migration.save()
534 raise exception.MigrationError(reason=str(ex))
536 def _build_live_migrate_task(self, context, instance, destination,
537 block_migration, disk_over_commit, migration,
538 request_spec=None):
539 return live_migrate.LiveMigrationTask(context, instance,
540 destination, block_migration,
541 disk_over_commit, migration,
542 self.compute_rpcapi,
543 self.servicegroup_api,
544 self.query_client,
545 self.report_client,
546 request_spec)
548 def _build_cold_migrate_task(self, context, instance, flavor, request_spec,
549 clean_shutdown, host_list):
550 return migrate.MigrationTask(context, instance, flavor,
551 request_spec, clean_shutdown,
552 self.compute_rpcapi,
553 self.query_client, self.report_client,
554 host_list, self.network_api)
556 def _destroy_build_request(self, context, instance):
557 # The BuildRequest needs to be stored until the instance is mapped to
558 # an instance table. At that point it will never be used again and
559 # should be deleted.
560 build_request = objects.BuildRequest.get_by_instance_uuid(
561 context, instance.uuid)
562 # TODO(alaski): Sync API updates of the build_request to the
563 # instance before it is destroyed. Right now only locked_by can
564 # be updated before this is destroyed.
565 build_request.destroy()
567 def _populate_instance_mapping(self, context, instance, host):
568 try:
569 inst_mapping = objects.InstanceMapping.get_by_instance_uuid(
570 context, instance.uuid)
571 except exception.InstanceMappingNotFound:
572 # NOTE(alaski): If nova-api is up to date this exception should
573 # never be hit. But during an upgrade it's possible that an old
574 # nova-api didn't create an instance_mapping during this boot
575 # request.
576 LOG.debug('Instance was not mapped to a cell, likely due '
577 'to an older nova-api service running.',
578 instance=instance)
579 return None
580 else:
581 try:
582 host_mapping = objects.HostMapping.get_by_host(context,
583 host.service_host)
584 except exception.HostMappingNotFound:
585 # NOTE(alaski): For now this exception means that a
586 # deployment has not migrated to cellsv2 and we should
587 # remove the instance_mapping that has been created.
588 # Eventually this will indicate a failure to properly map a
589 # host to a cell and we may want to reschedule.
590 inst_mapping.destroy()
591 return None
592 else:
593 inst_mapping.cell_mapping = host_mapping.cell_mapping
594 inst_mapping.save()
595 return inst_mapping
597 def _validate_existing_attachment_ids(self, context, instance, bdms):
598 """Ensure any attachment ids referenced by the bdms exist.
600 New attachments will only be created if the attachment ids referenced
601 by the bdms no longer exist. This can happen when an instance is
602 rescheduled after a failure to spawn as cleanup code on the previous
603 host will delete attachments before rescheduling.
604 """
605 for bdm in bdms:
606 if bdm.is_volume and bdm.attachment_id: 606 ↛ 605 (line 606 didn't jump to line 605 because the condition on line 606 was always true)
607 try:
608 self.volume_api.attachment_get(context, bdm.attachment_id)
609 except exception.VolumeAttachmentNotFound:
610 attachment = self.volume_api.attachment_create(
611 context, bdm.volume_id, instance.uuid)
612 bdm.attachment_id = attachment['id']
613 bdm.save()
615 def _cleanup_when_reschedule_fails(
616 self, context, instance, exception, legacy_request_spec,
617 requested_networks):
618 """Set the instance state and clean up.
620 It is only used in case build_instances fails while rescheduling the
621 instance.
622 """
624 updates = {'vm_state': vm_states.ERROR,
625 'task_state': None}
626 self._set_vm_state_and_notify(
627 context, instance.uuid, 'build_instances', updates, exception,
628 legacy_request_spec)
629 self._cleanup_allocated_networks(
630 context, instance, requested_networks)
632 arq_uuids = None
633 # arqs have not been bound to a port/instance yet
634 if requested_networks:
635 arq_uuids = [req.arq_uuid
636 for req in requested_networks if req.arq_uuid]
637 compute_utils.delete_arqs_if_needed(context, instance, arq_uuids)
639 # NOTE(danms): This is never cell-targeted because it is only used for
640 # n-cpu reschedules which go to the cell conductor and thus are always
641 # cell-specific.
642 def build_instances(self, context, instances, image, filter_properties,
643 admin_password, injected_files, requested_networks,
644 security_groups, block_device_mapping=None, legacy_bdm=True,
645 request_spec=None, host_lists=None):
646 # TODO(ndipanov): Remove block_device_mapping and legacy_bdm in version
647 # 2.0 of the RPC API.
648 # TODO(danms): Remove this in version 2.0 of the RPC API
649 if (requested_networks and 649 ↛ 652 (line 649 didn't jump to line 652 because the condition on line 649 was never true)
650 not isinstance(requested_networks,
651 objects.NetworkRequestList)):
652 requested_networks = objects.NetworkRequestList.from_tuples(
653 requested_networks)
654 # TODO(melwitt): Remove this in version 2.0 of the RPC API
655 flavor = filter_properties.get('instance_type')
656 if flavor and not isinstance(flavor, objects.Flavor): 656 ↛ 659 (line 656 didn't jump to line 659 because the condition on line 656 was never true)
657 # Code downstream may expect extra_specs to be populated since it
658 # is receiving an object, so lookup the flavor to ensure this.
659 flavor = objects.Flavor.get_by_id(context, flavor['id'])
660 filter_properties = dict(filter_properties, instance_type=flavor)
662 # Older computes will not send a request_spec during reschedules so we
663 # need to check and build our own if one is not provided.
664 if request_spec is None:
665 legacy_request_spec = scheduler_utils.build_request_spec(
666 image, instances)
667 else:
668 # TODO(mriedem): This is annoying but to populate the local
669 # request spec below using the filter_properties, we have to pass
670 # in a primitive version of the request spec. Yes it's inefficient
671 # and we can remove it once the populate_retry and
672 # populate_filter_properties utility methods are converted to
673 # work on a RequestSpec object rather than filter_properties.
674 # NOTE(gibi): we have to keep a reference to the original
675 # RequestSpec object passed to this function as we lose information
676 # during the below legacy conversion
677 legacy_request_spec = request_spec.to_legacy_request_spec_dict()
679 # 'host_lists' will be None during a reschedule from a pre-Queens
680 # compute. In all other cases, it will be a list of lists, though the
681 # lists may be empty if there are no more hosts left in a rescheduling
682 # situation.
683 is_reschedule = host_lists is not None
684 try:
685 # check retry policy. Rather ugly use of instances[0]...
686 # but if we've exceeded max retries... then we really only
687 # have a single instance.
688 # TODO(sbauza): Provide directly the RequestSpec object
689 # when populate_retry() accepts it
690 scheduler_utils.populate_retry(
691 filter_properties, instances[0].uuid)
692 instance_uuids = [instance.uuid for instance in instances]
693 spec_obj = objects.RequestSpec.from_primitives(
694 context, legacy_request_spec, filter_properties)
695 LOG.debug("Rescheduling: %s", is_reschedule)
696 if is_reschedule:
697 # Make sure that we have a host, as we may have exhausted all
698 # our alternates
699 if not host_lists[0]: 699 ↛ 702 (line 699 didn't jump to line 702 because the condition on line 699 was never true)
700 # We have an empty list of hosts, so this instance has
701 # failed to build.
702 msg = ("Exhausted all hosts available for retrying build "
703 "failures for instance %(instance_uuid)s." %
704 {"instance_uuid": instances[0].uuid})
705 raise exception.MaxRetriesExceeded(reason=msg)
706 else:
707 # This is not a reschedule, so we need to call the scheduler to
708 # get appropriate hosts for the request.
709 # NOTE(gibi): We only call the scheduler if we are rescheduling
710 # from a really old compute. In that case we do not support
711 # externally-defined resource requests, like port QoS. So no
712 # requested_resources are set on the RequestSpec here.
713 host_lists = self._schedule_instances(context, spec_obj,
714 instance_uuids, return_alternates=True)
715 except Exception as exc:
716 # NOTE(mriedem): If we're rescheduling from a failed build on a
717 # compute, "retry" will be set and num_attempts will be >1 because
718 # populate_retry above will increment it. If the server build was
719 # forced onto a host/node or [scheduler]/max_attempts=1, "retry"
720 # won't be in filter_properties and we won't get here because
721 # nova-compute will just abort the build since reschedules are
722 # disabled in those cases.
723 num_attempts = filter_properties.get(
724 'retry', {}).get('num_attempts', 1)
725 for instance in instances:
726 # If num_attempts > 1, we're in a reschedule and probably
727 # either hit NoValidHost or MaxRetriesExceeded. Either way,
728 # the build request should already be gone and we probably
729 # can't reach the API DB from the cell conductor.
730 if num_attempts <= 1:
731 try:
732 # If the BuildRequest stays around then instance
733 # show/lists will pull from it rather than the errored
734 # instance.
735 self._destroy_build_request(context, instance)
736 except exception.BuildRequestNotFound:
737 pass
738 self._cleanup_when_reschedule_fails(
739 context, instance, exc, legacy_request_spec,
740 requested_networks)
741 return
743 elevated = context.elevated()
744 for (instance, host_list) in zip(instances, host_lists):
745 host = host_list.pop(0)
746 if is_reschedule:
747 # If this runs in the superconductor, the first instance will
748 # already have its resources claimed in placement. If this is a
749 # retry, though, this is running in the cell conductor, and we
750 # need to claim first to ensure that the alternate host still
751 # has its resources available. Note that there are schedulers
752 # that don't support Placement, so we must assume that the host is
753 # still available.
754 host_available = False
755 while host and not host_available:
756 if host.allocation_request: 756 ↛ 759 (line 756 didn't jump to line 759 because the condition on line 756 was always true)
757 alloc_req = jsonutils.loads(host.allocation_request)
758 else:
759 alloc_req = None
760 if alloc_req: 760 ↛ 784 (line 760 didn't jump to line 784 because the condition on line 760 was always true)
761 try:
762 host_available = scheduler_utils.claim_resources(
763 elevated, self.report_client, spec_obj,
764 instance.uuid, alloc_req,
765 host.allocation_request_version)
766 if request_spec and host_available:
767 # NOTE(gibi): redo the request group - resource
768 # provider mapping as the above claim call
769 # moves the allocation of the instance to
770 # another host
771 scheduler_utils.fill_provider_mapping(
772 request_spec, host)
773 except Exception as exc:
774 self._cleanup_when_reschedule_fails(
775 context, instance, exc, legacy_request_spec,
776 requested_networks)
777 return
778 else:
779 # Some deployments use different schedulers that do not
780 # use Placement, so they will not have an
781 # allocation_request to claim with. For those cases,
782 # there is no concept of claiming, so just assume that
783 # the host is valid.
784 host_available = True
785 if not host_available:
786 # Insufficient resources remain on that host, so
787 # discard it and try the next.
788 host = host_list.pop(0) if host_list else None
789 if not host_available:
790 # No more available hosts for retrying the build.
791 msg = ("Exhausted all hosts available for retrying build "
792 "failures for instance %(instance_uuid)s." %
793 {"instance_uuid": instance.uuid})
794 exc = exception.MaxRetriesExceeded(reason=msg)
795 self._cleanup_when_reschedule_fails(
796 context, instance, exc, legacy_request_spec,
797 requested_networks)
798 return
800 # The availability_zone field was added in v1.1 of the Selection
801 # object so make sure to handle the case where it is missing.
802 if 'availability_zone' in host: 802 ↛ 803 (line 802 didn't jump to line 803 because the condition on line 802 was never true)
803 instance.availability_zone = host.availability_zone
804 else:
805 try:
806 instance.availability_zone = (
807 availability_zones.get_host_availability_zone(context,
808 host.service_host))
809 except Exception as exc:
810 # Put the instance into ERROR state, set task_state to
811 # None, inject a fault, etc.
812 self._cleanup_when_reschedule_fails(
813 context, instance, exc, legacy_request_spec,
814 requested_networks)
815 continue
817 try:
818 # NOTE(danms): This saves the az change above, refreshes our
819 # instance, and tells us if it has been deleted underneath us
820 instance.save()
821 except (exception.InstanceNotFound,
822 exception.InstanceInfoCacheNotFound):
823 LOG.debug('Instance deleted during build', instance=instance)
824 continue
825 local_filter_props = copy.deepcopy(filter_properties)
826 scheduler_utils.populate_filter_properties(local_filter_props,
827 host)
828 # Populate the request_spec with the local_filter_props information
829 # like retries and limits. Note that at this point the request_spec
830 # could have come from a compute via reschedule and it would
831 # already have some things set, like scheduler_hints.
832 local_reqspec = objects.RequestSpec.from_primitives(
833 context, legacy_request_spec, local_filter_props)
835 # NOTE(gibi): at this point the request spec already got converted
836 # to a legacy dict and then back to an object so we lost the non
837 # legacy part of the spec. Re-populate the requested_resources
838 # field based on the original request spec object passed to this
839 # function.
840 if request_spec:
841 local_reqspec.requested_resources = (
842 request_spec.requested_resources)
844 # The block_device_mapping passed from the api doesn't contain
845 # instance specific information
846 bdms = objects.BlockDeviceMappingList.get_by_instance_uuid(
847 context, instance.uuid)
849 # This is populated in scheduler_utils.populate_retry
850 num_attempts = local_filter_props.get('retry',
851 {}).get('num_attempts', 1)
852 if num_attempts <= 1:
853 # If this is a reschedule the instance is already mapped to
854 # this cell and the BuildRequest is already deleted so ignore
855 # the logic below.
856 inst_mapping = self._populate_instance_mapping(context,
857 instance,
858 host)
859 try:
860 self._destroy_build_request(context, instance)
861 except exception.BuildRequestNotFound:
862 # This indicates an instance delete has been requested in
863 # the API. Stop the build, cleanup the instance_mapping and
864 # potentially the block_device_mappings
865 # TODO(alaski): Handle block_device_mapping cleanup
866 if inst_mapping:
867 inst_mapping.destroy()
868 return
869 else:
870 # NOTE(lyarwood): If this is a reschedule then recreate any
871 # attachments that were previously removed when cleaning up
872 # after failures to spawn etc.
873 self._validate_existing_attachment_ids(context, instance, bdms)
875 alts = [(alt.service_host, alt.nodename) for alt in host_list]
876 LOG.debug("Selected host: %s; Selected node: %s; Alternates: %s",
877 host.service_host, host.nodename, alts, instance=instance)
879 try:
880 accel_uuids = self._create_and_bind_arq_for_instance(
881 context, instance, host.nodename, local_reqspec,
882 requested_networks)
883 except Exception as exc:
884 LOG.exception('Failed to reschedule. Reason: %s', exc)
885 self._cleanup_when_reschedule_fails(
886 context, instance, exc, legacy_request_spec,
887 requested_networks)
888 continue
890 self.compute_rpcapi.build_and_run_instance(context,
891 instance=instance, host=host.service_host, image=image,
892 request_spec=local_reqspec,
893 filter_properties=local_filter_props,
894 admin_password=admin_password,
895 injected_files=injected_files,
896 requested_networks=requested_networks,
897 security_groups=security_groups,
898 block_device_mapping=bdms, node=host.nodename,
899 limits=host.limits, host_list=host_list,
900 accel_uuids=accel_uuids)
902 def _create_and_bind_arq_for_instance(
903 self, context, instance, hostname,
904 request_spec, requested_networks=None):
905 try:
906 resource_provider_mapping = (
907 request_spec.get_request_group_mapping())
908 # Using nodename instead of hostname. See:
909 # http://lists.openstack.org/pipermail/openstack-discuss/2019-November/011044.html # noqa
910 cyclient = cyborg.get_client(context)
911 bindings = {}
912 port_bindings = {}
914 # Create ARQs that come from extra specs.
915 bindings = self._create_and_bind_arqs(
916 cyclient, instance.uuid, instance.flavor.extra_specs,
917 hostname, resource_provider_mapping)
919 if requested_networks:
920 # Create ARQs that come from the port device profile
921 port_bindings = self._create_arqs_for_ports(
922 cyclient, instance.uuid, requested_networks,
923 hostname, resource_provider_mapping)
925 # Initiate Cyborg binding asynchronously
926 bindings.update(port_bindings)
927 if bindings:
928 cyclient.bind_arqs(bindings)
930 return list(bindings.keys())
932 except exception.AcceleratorRequestBindingFailed as exc:
933 # If anything failed here we need to cleanup and bail out.
934 cyclient = cyborg.get_client(context)
935 cyclient.delete_arqs_by_uuid(exc.arqs)
936 raise
938 def _schedule_instances(self, context, request_spec,
939 instance_uuids=None, return_alternates=False):
940 scheduler_utils.setup_instance_group(context, request_spec)
941 with timeutils.StopWatch() as timer:
942 host_lists = self.query_client.select_destinations(
943 context, request_spec, instance_uuids, return_objects=True,
944 return_alternates=return_alternates)
945 LOG.debug('Took %0.2f seconds to select destinations for %s '
946 'instance(s).', timer.elapsed(), len(instance_uuids))
947 return host_lists
949 @staticmethod
950 def _restrict_request_spec_to_cell(context, instance, request_spec):
951 """Sets RequestSpec.requested_destination.cell for the move operation
953 Move operations, e.g. evacuate and unshelve, must be restricted to the
954 cell in which the instance already exists, so this method is used to
955 target the RequestSpec, which is sent to the scheduler via the
956 _schedule_instances method, to the instance's current cell.
958 :param context: nova auth RequestContext
959 """
960 instance_mapping = \
961 objects.InstanceMapping.get_by_instance_uuid(
962 context, instance.uuid)
963 LOG.debug('Requesting cell %(cell)s during scheduling',
964 {'cell': instance_mapping.cell_mapping.identity},
965 instance=instance)
966 if ('requested_destination' in request_spec and
967 request_spec.requested_destination):
968 request_spec.requested_destination.cell = (
969 instance_mapping.cell_mapping)
970 else:
971 request_spec.requested_destination = (
972 objects.Destination(
973 cell=instance_mapping.cell_mapping))
975 def _recheck_quota(
976 self,
977 context: nova_context.RequestContext,
978 flavor: 'objects.Flavor',
979 request_spec: 'objects.RequestSpec',
980 orig_num_req: int,
981 project_id: ty.Optional[str] = None,
982 user_id: ty.Optional[str] = None
983 ) -> None:
984 # A quota "recheck" is a quota check that is performed *after* quota
985 # limited resources are consumed. It is meant to address race
986 # conditions where a request that was not over quota at the start of
987 # the request (before resources were allocated) becomes over quota
988 # after resources (like database rows or placement allocations) are
989 # created. An example of this would be a large number of requests for
990 # the same resource for the same project sent simultaneously.
991 if CONF.quota.recheck_quota:
992 # The orig_num_req is the number of instances requested, which is
993 # the delta that was quota checked before resources were allocated.
994 # This is only used for the exception message if the recheck fails
995 # for lack of enough quota.
996 compute_utils.check_num_instances_quota(
997 context, flavor, 0, 0, project_id=project_id,
998 user_id=user_id, orig_num_req=orig_num_req)
999 placement_limits.enforce_num_instances_and_flavor(
1000 context, project_id, flavor, request_spec.is_bfv, 0, 0)
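# Minimal sketch of the "recheck" idea described above, with made-up
# numbers and a plain helper standing in for the real quota checks: the
# initial check uses the requested delta, the recheck uses a delta of
# zero once the resources exist, which catches races between requests
# that each passed the initial check.
def _example_over_limit(current_usage, delta, limit):
    return current_usage + delta > limit

# usage=8, limit=10: two concurrent requests for 2 instances each can
# both pass the initial check (8 + 2 <= 10), but once both have created
# their rows usage is 12, so the recheck with delta=0 flags the overrun.
assert not _example_over_limit(8, 2, 10)    # initial check passes
assert _example_over_limit(12, 0, 10)       # recheck detects the race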
1002 # TODO(mriedem): Make request_spec required in ComputeTaskAPI RPC v2.0.
1003 @targets_cell
1004 def unshelve_instance(self, context, instance, request_spec=None):
1005 sys_meta = instance.system_metadata
1007 def safe_image_show(ctx, image_id):
1008 if image_id: 1008 ↛ 1011 (line 1008 didn't jump to line 1011 because the condition on line 1008 was always true)
1009 return self.image_api.get(ctx, image_id, show_deleted=False)
1010 else:
1011 raise exception.ImageNotFound(image_id='')
1013 if instance.vm_state == vm_states.SHELVED:
1014 instance.task_state = task_states.POWERING_ON
1015 instance.save(expected_task_state=task_states.UNSHELVING)
1016 self.compute_rpcapi.start_instance(context, instance)
1017 elif instance.vm_state == vm_states.SHELVED_OFFLOADED: 1017 ↛ 1155 (line 1017 didn't jump to line 1155 because the condition on line 1017 was always true)
1018 image = None
1019 image_id = sys_meta.get('shelved_image_id')
1020 # No need to check for image if image_id is None as
1021 # "shelved_image_id" key is not set for volume backed
1022 # instance during the shelve process
1023 if image_id:
1024 with compute_utils.EventReporter(
1025 context, 'get_image_info', self.host, instance.uuid):
1026 try:
1027 image = safe_image_show(context, image_id)
1028 except exception.ImageNotFound as error:
1029 instance.vm_state = vm_states.ERROR
1030 instance.save()
1032 reason = _('Unshelve attempted but the image %s '
1033 'cannot be found.') % image_id
1035 LOG.error(reason, instance=instance)
1036 compute_utils.add_instance_fault_from_exc(
1037 context, instance, error, sys.exc_info(),
1038 fault_message=reason)
1039 raise exception.UnshelveException(
1040 instance_id=instance.uuid, reason=reason)
1042 try:
1043 with compute_utils.EventReporter(context, 'schedule_instances',
1044 self.host, instance.uuid):
1045 # NOTE(sbauza): Force_hosts/nodes needs to be reset
1046 # if we want to make sure that the next destination
1047 # is not forced to be the original host
1048 request_spec.reset_forced_destinations()
1049 # TODO(sbauza): Provide directly the RequestSpec object
1050 # when populate_filter_properties accepts it
1051 filter_properties = request_spec.\
1052 to_legacy_filter_properties_dict()
1053 res_req, req_lvl_params = (
1054 self.network_api.get_requested_resource_for_instance(
1055 context, instance.uuid)
1056 )
1057 extra_specs = request_spec.flavor.extra_specs
1058 device_profile = extra_specs.get('accel:device_profile')
1059 res_req.extend(
1060 cyborg.get_device_profile_request_groups(
1061 context, device_profile) if device_profile else [])
1062 # NOTE(gibi): When other modules want to handle similar
1063 # non-nova resources then here we have to collect all
1064 # the external resource requests in a single list and
1065 # add them to the RequestSpec.
1066 request_spec.requested_resources = res_req
1067 request_spec.request_level_params = req_lvl_params
1069 # NOTE(gibi): as PCI devices are tracked in placement we
1070 # need to generate request groups from InstancePCIRequests.
1071 # This will append new RequestGroup objects to the
1072 # request_spec.requested_resources list if needed
1073 request_spec.generate_request_groups_from_pci_requests()
1075 # NOTE(cfriesen): Ensure that we restrict the scheduler to
1076 # the cell specified by the instance mapping.
1077 self._restrict_request_spec_to_cell(
1078 context, instance, request_spec)
1080 request_spec.ensure_project_and_user_id(instance)
1081 request_spec.ensure_network_information(instance)
1082 compute_utils.heal_reqspec_is_bfv(
1083 context, request_spec, instance)
1084 host_lists = self._schedule_instances(context,
1085 request_spec, [instance.uuid],
1086 return_alternates=False)
1088 # NOTE(melwitt): We recheck the quota after allocating the
1089 # resources in placement, to prevent users from allocating
1090 # more resources than their allowed quota in the event of a
1091 # race. This is configurable because it can be expensive if
1092 # strict quota limits are not required in a deployment.
1093 try:
1094 # Quota should only be checked for unshelve if
1095 # resources are being counted in placement. Legacy
1096 # quotas continue to consume resources while
1097 # SHELVED_OFFLOADED and will not allocate any new
1098 # resources during unshelve.
1099 if (CONF.quota.count_usage_from_placement or
1100 limit_utils.use_unified_limits()):
1101 self._recheck_quota(
1102 context, instance.flavor, request_spec, 0,
1103 project_id=instance.project_id,
1104 user_id=instance.user_id)
1105 except (exception.TooManyInstances,
1106 limit_exceptions.ProjectOverLimit):
1107 with excutils.save_and_reraise_exception():
1108 self.report_client.delete_allocation_for_instance(
1109 context, instance.uuid, force=True)
1111 host_list = host_lists[0]
1112 selection = host_list[0]
1113 scheduler_utils.populate_filter_properties(
1114 filter_properties, selection)
1115 (host, node) = (selection.service_host, selection.nodename)
1116 LOG.debug(
1117 "Scheduler selected host: %s, node:%s",
1118 host,
1119 node,
1120 instance=instance
1121 )
1122 instance.availability_zone = (
1123 availability_zones.get_host_availability_zone(
1124 context, host))
1126 scheduler_utils.fill_provider_mapping(
1127 request_spec, selection)
1129 # NOTE(brinzhang): For the unshelve operation we should
1130 # re-create and re-bind the arqs for the instance.
1131 accel_uuids = self._create_and_bind_arq_for_instance(
1132 context, instance, node, request_spec)
1133 self.compute_rpcapi.unshelve_instance(
1134 context, instance, host, request_spec, image=image,
1135 filter_properties=filter_properties, node=node,
1136 accel_uuids=accel_uuids)
1137 except (exception.NoValidHost,
1138 exception.UnsupportedPolicyException):
1139 instance.task_state = None
1140 instance.save()
1141 LOG.warning("No valid host found for unshelve instance",
1142 instance=instance)
1143 return
1144 except Exception as exc:
1145 if isinstance(exc, exception.AcceleratorRequestBindingFailed):
1146 cyclient = cyborg.get_client(context)
1147 cyclient.delete_arqs_by_uuid(exc.arqs)
1148 LOG.exception('Failed to unshelve. Reason: %s', exc)
1149 with excutils.save_and_reraise_exception():
1150 instance.task_state = None
1151 instance.save()
1152 LOG.error("Unshelve attempted but an error "
1153 "has occurred", instance=instance)
1154 else:
1155 LOG.error('Unshelve attempted but vm_state not SHELVED or '
1156 'SHELVED_OFFLOADED', instance=instance)
1157 instance.vm_state = vm_states.ERROR
1158 instance.save()
1159 return
1161 def _allocate_for_evacuate_dest_host(self, context, instance, host,
1162 request_spec=None):
1163 # The user is forcing the destination host and bypassing the
1164 # scheduler. We need to copy the source compute node
1165 # allocations in Placement to the destination compute node.
1166 # Normally select_destinations() in the scheduler would do this
1167 # for us, but when forcing the target host we don't call the
1168 # scheduler.
1169 source_node = None # This is used for error handling below.
1170 try:
1171 source_node = objects.ComputeNode.get_by_host_and_nodename(
1172 context, instance.host, instance.node)
1173 dest_node = (
1174 objects.ComputeNode.get_first_node_by_host_for_old_compat(
1175 context, host, use_slave=True))
1176 except exception.ComputeHostNotFound as ex:
1177 with excutils.save_and_reraise_exception():
1178 self._set_vm_state_and_notify(
1179 context, instance.uuid, 'rebuild_server',
1180 {'vm_state': instance.vm_state,
1181 'task_state': None}, ex, request_spec)
1182 if source_node:
1183 LOG.warning('Specified host %s for evacuate was not '
1184 'found.', host, instance=instance)
1185 else:
1186 LOG.warning('Source host %s and node %s for evacuate was '
1187 'not found.', instance.host, instance.node,
1188 instance=instance)
1190 try:
1191 scheduler_utils.claim_resources_on_destination(
1192 context, self.report_client, instance, source_node, dest_node)
1193 except exception.NoValidHost as ex:
1194 with excutils.save_and_reraise_exception():
1195 self._set_vm_state_and_notify(
1196 context, instance.uuid, 'rebuild_server',
1197 {'vm_state': instance.vm_state,
1198 'task_state': None}, ex, request_spec)
1199 LOG.warning('Specified host %s for evacuate is '
1200 'invalid.', host, instance=instance)
1202 # TODO(mriedem): Make request_spec required in ComputeTaskAPI RPC v2.0.
1203 @targets_cell
1204 def rebuild_instance(self, context, instance, orig_image_ref, image_ref,
1205 injected_files, new_pass, orig_sys_metadata,
1206 bdms, recreate, on_shared_storage,
1207 preserve_ephemeral=False, host=None,
1208 request_spec=None, reimage_boot_volume=False,
1209 target_state=None):
1210 # recreate=True means the instance is being evacuated from a failed
1211 # host to a new destination host. The 'recreate' variable name is
1212 # confusing, so rename it to evacuate here at the top, which is simpler
1213 # than renaming a parameter in an RPC versioned method.
1214 evacuate = recreate
1216 # NOTE(efried): It would be nice if this were two separate events, one
1217 # for 'rebuild' and one for 'evacuate', but this is part of the API
1218 # now, so it would be nontrivial to change.
1219 with compute_utils.EventReporter(context, 'rebuild_server',
1220 self.host, instance.uuid):
1221 node = limits = None
1223 try:
1224 migration = objects.Migration.get_by_instance_and_status(
1225 context, instance.uuid, 'accepted')
1226 except exception.MigrationNotFoundByStatus:
1227 LOG.debug("No migration record for the rebuild/evacuate "
1228 "request.", instance=instance)
1229 migration = None
1231 # The host variable is passed in two cases:
1232 # 1. rebuild - the instance.host is passed to rebuild on the
1233 # same host and bypass the scheduler *unless* a new image
1234 # was specified
1235 # 2. evacuate with specified host and force=True - the specified
1236 # host is passed and is meant to bypass the scheduler.
1237 # NOTE(mriedem): This could be a lot more straight-forward if we
1238 # had separate methods for rebuild and evacuate...
1239 if host:
1240 # We only create a new allocation on the specified host if
1241 # we're doing an evacuate since that is a move operation.
1242 if host != instance.host: 1242 ↛ 1245 (line 1242 didn't jump to line 1245 because the condition on line 1242 was never true)
1243 # If a destination host is forced for evacuate, create
1244 # allocations against it in Placement.
1245 try:
1246 self._allocate_for_evacuate_dest_host(
1247 context, instance, host, request_spec)
1248 except exception.AllocationUpdateFailed as ex:
1249 with excutils.save_and_reraise_exception():
1250 if migration:
1251 migration.status = 'error'
1252 migration.save()
1253 # NOTE(efried): It would be nice if this were two
1254 # separate events, one for 'rebuild' and one for
1255 # 'evacuate', but this is part of the API now, so
1256 # it would be nontrivial to change.
1257 self._set_vm_state_and_notify(
1258 context,
1259 instance.uuid,
1260 'rebuild_server',
1261 {'vm_state': vm_states.ERROR,
1262 'task_state': None}, ex, request_spec)
1263 LOG.warning('Rebuild failed: %s',
1264 str(ex), instance=instance)
1265 except exception.NoValidHost:
1266 with excutils.save_and_reraise_exception():
1267 if migration:
1268 migration.status = 'error'
1269 migration.save()
1270 else:
1271 # At this point, the user is either:
1272 #
1273 # 1. Doing a rebuild on the same host (not evacuate) and
1274 # specified a new image.
1275 # 2. Evacuating and specified a host but are not forcing it.
1276 #
1277 # In either case, the API passes host=None but sets up the
1278 # RequestSpec.requested_destination field for the specified
1279 # host.
1280 if evacuate:
1281 # NOTE(sbauza): Augment the RequestSpec object by excluding
1282 # the source host to avoid the scheduler picking it
1283 request_spec.ignore_hosts = [instance.host]
1284 # NOTE(sbauza): Force_hosts/nodes needs to be reset
1285 # if we want to make sure that the next destination
1286 # is not forced to be the original host
1287 request_spec.reset_forced_destinations()
1288 res_req, req_lvl_params = (
1289 self.network_api.get_requested_resource_for_instance(
1290 context, instance.uuid)
1291 )
1292 extra_specs = request_spec.flavor.extra_specs
1293 device_profile = extra_specs.get('accel:device_profile')
1294 res_req.extend(
1295 cyborg.get_device_profile_request_groups(
1296 context, device_profile)
1297 if device_profile else [])
1298 # NOTE(gibi): When other modules want to handle similar
1299 # non-nova resources then here we have to collect all
1300 # the external resource requests in a single list and
1301 # add them to the RequestSpec.
1302 request_spec.requested_resources = res_req
1303 request_spec.request_level_params = req_lvl_params
1305 # NOTE(gibi): as PCI devices are tracked in placement we
1306 # need to generate request groups from InstancePCIRequests.
1307 # This will append new RequestGroup objects to the
1308 # request_spec.requested_resources list if needed
1309 request_spec.generate_request_groups_from_pci_requests()
1311 try:
1312 # if this is a rebuild of the instance on the same host with a
1313 # new image.
1314 if not evacuate and orig_image_ref != image_ref:
1315 self._validate_image_traits_for_rebuild(context,
1316 instance,
1317 image_ref)
1318 self._restrict_request_spec_to_cell(
1319 context, instance, request_spec)
1320 request_spec.ensure_project_and_user_id(instance)
1321 request_spec.ensure_network_information(instance)
1322 compute_utils.heal_reqspec_is_bfv(
1323 context, request_spec, instance)
1325 host_lists = self._schedule_instances(context,
1326 request_spec, [instance.uuid],
1327 return_alternates=False)
1328 host_list = host_lists[0]
1329 selection = host_list[0]
1330 host, node, limits = (selection.service_host,
1331 selection.nodename, selection.limits)
1333 if recreate:
1334 scheduler_utils.fill_provider_mapping(
1335 request_spec, selection)
1337 except (exception.NoValidHost,
1338 exception.UnsupportedPolicyException,
1339 exception.AllocationUpdateFailed,
1340 # the next two can come from fill_provider_mapping and
1341 # signal a software error.
1342 NotImplementedError,
1343 ValueError) as ex:
1344 if migration:
1345 migration.status = 'error'
1346 migration.save()
1347 # Rollback the image_ref if a new one was provided (this
1348 # only happens in the rebuild case, not evacuate).
1349 if orig_image_ref and orig_image_ref != image_ref: 1349 ↛ 1352 (line 1349 didn't jump to line 1352 because the condition on line 1349 was always true)
1350 instance.image_ref = orig_image_ref
1351 instance.save()
1352 with excutils.save_and_reraise_exception():
1353 # NOTE(efried): It would be nice if this were two
1354 # separate events, one for 'rebuild' and one for
1355 # 'evacuate', but this is part of the API now, so it
1356 # would be nontrivial to change.
1357 self._set_vm_state_and_notify(context, instance.uuid,
1358 'rebuild_server',
1359 {'vm_state': vm_states.ERROR,
1360 'task_state': None}, ex, request_spec)
1361 LOG.warning('Rebuild failed: %s',
1362 str(ex), instance=instance)
1364 compute_utils.notify_about_instance_usage(
1365 self.notifier, context, instance, "rebuild.scheduled")
1366 compute_utils.notify_about_instance_rebuild(
1367 context, instance, host,
1368 action=fields.NotificationAction.REBUILD_SCHEDULED,
1369 source=fields.NotificationSource.CONDUCTOR)
1371 instance.availability_zone = (
1372 availability_zones.get_host_availability_zone(
1373 context, host))
1374 accel_uuids = []
1375 try:
1376 if instance.flavor.extra_specs.get('accel:device_profile'):
1377 cyclient = cyborg.get_client(context)
1378 if evacuate:
1379 # NOTE(brinzhang): For evacuate operation we should
1380 # delete the bound arqs, then re-create and bind the
1381 # arqs for the instance.
1382 cyclient.delete_arqs_for_instance(instance.uuid)
1383 accel_uuids = self._create_and_bind_arq_for_instance(
1384 context, instance, node, request_spec)
1385 else:
1386 accel_uuids = cyclient.get_arq_uuids_for_instance(
1387 instance)
1388 except Exception as exc:
1389 if isinstance(exc, exception.AcceleratorRequestBindingFailed):
1390 cyclient = cyborg.get_client(context)
1391 cyclient.delete_arqs_by_uuid(exc.arqs)
1392 LOG.exception('Failed to rebuild. Reason: %s', exc)
1393 raise exc
1395 self.compute_rpcapi.rebuild_instance(
1396 context,
1397 instance=instance,
1398 new_pass=new_pass,
1399 injected_files=injected_files,
1400 image_ref=image_ref,
1401 orig_image_ref=orig_image_ref,
1402 orig_sys_metadata=orig_sys_metadata,
1403 bdms=bdms,
1404 recreate=evacuate,
1405 on_shared_storage=on_shared_storage,
1406 preserve_ephemeral=preserve_ephemeral,
1407 migration=migration,
1408 host=host,
1409 node=node,
1410 limits=limits,
1411 request_spec=request_spec,
1412 accel_uuids=accel_uuids,
1413 reimage_boot_volume=reimage_boot_volume,
1414 target_state=target_state)
1416 def _validate_image_traits_for_rebuild(self, context, instance, image_ref):
1417 """Validates that the traits specified in the image can be satisfied
1418 by the providers of the current allocations for the instance during
1419 rebuild of the instance. If the traits cannot be
1420 satisfied, fails the action by raising a NoValidHost exception.
1422 :raises: NoValidHost exception in case the traits on the providers
1423 of the allocated resources for the instance do not match
1424 the required traits on the image.
1425 """
1426 image_meta = objects.ImageMeta.from_image_ref(
1427 context, self.image_api, image_ref)
1428 if ('properties' not in image_meta or
1429 'traits_required' not in image_meta.properties or not
1430 image_meta.properties.traits_required):
1431 return
1433 image_traits = set(image_meta.properties.traits_required)
1435 # Check whether any of the image traits are forbidden by the flavor
1436 # traits; if so, raise an exception.
1437 extra_specs = instance.flavor.extra_specs
1438 forbidden_flavor_traits = set()
1439 for key, val in extra_specs.items():
1440 if key.startswith('trait'):
1441 # get the actual key.
1442 prefix, parsed_key = key.split(':', 1)
1443 if val == 'forbidden':
1444 forbidden_flavor_traits.add(parsed_key)
1446 forbidden_traits = image_traits & forbidden_flavor_traits
1448 if forbidden_traits:
1449 raise exception.NoValidHost(
1450 reason=_("Image traits are part of forbidden "
1451 "traits in flavor associated with the server. "
1452 "Either specify a different image during rebuild "
1453 "or create a new server with the specified image "
1454 "and a compatible flavor."))
1456 # If image traits are present, then validate against allocations.
1457 allocations = self.report_client.get_allocations_for_consumer(
1458 context, instance.uuid)
1459 instance_rp_uuids = list(allocations)
1461 # Get provider tree for the instance. We use the uuid of the host
1462 # on which the instance is rebuilding to get the provider tree.
1463 compute_node = objects.ComputeNode.get_by_host_and_nodename(
1464 context, instance.host, instance.node)
1466 # TODO(karimull): Call with a read-only version, when available.
1467 instance_rp_tree = (
1468 self.report_client.get_provider_tree_and_ensure_root(
1469 context, compute_node.uuid))
1471 traits_in_instance_rps = set()
1473 for rp_uuid in instance_rp_uuids:
1474 traits_in_instance_rps.update(
1475 instance_rp_tree.data(rp_uuid).traits)
1477 missing_traits = image_traits - traits_in_instance_rps
1479 if missing_traits:
1480 raise exception.NoValidHost(
1481 reason=_("Image traits cannot be "
1482 "satisfied by the current resource providers. "
1483 "Either specify a different image during rebuild "
1484 "or create a new server with the specified image."))
1486 # TODO(avolkov): move method to bdm
1487 @staticmethod
1488 def _volume_size(flavor, bdm):
1489 size = bdm.get('volume_size')
1490 # NOTE (ndipanov): inherit flavor size only for swap and ephemeral
1491 if (size is None and bdm.get('source_type') == 'blank' and
1492 bdm.get('destination_type') == 'local'):
1493 if bdm.get('guest_format') == 'swap':
1494 size = flavor.get('swap', 0)
1495 else:
1496 size = flavor.get('ephemeral_gb', 0)
1497 return size
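# [Illustrative sketch, editor addition, not part of the nova source] The
# size-inheritance rule above, restated with plain dicts standing in for the
# flavor and BDM objects; the values are invented for illustration:
#
#     flavor = {'swap': 1024, 'ephemeral_gb': 10}
#     bdm = {'volume_size': None, 'source_type': 'blank',
#            'destination_type': 'local', 'guest_format': 'swap'}
#     size = bdm.get('volume_size')
#     if (size is None and bdm.get('source_type') == 'blank' and
#             bdm.get('destination_type') == 'local'):
#         if bdm.get('guest_format') == 'swap':
#             size = flavor.get('swap', 0)          # -> 1024
#         else:
#             size = flavor.get('ephemeral_gb', 0)
#     assert size == 1024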
1499 def _create_block_device_mapping(self, cell, flavor, instance_uuid,
1500 block_device_mapping):
1501 """Create the BlockDeviceMapping objects in the db.
1503 This method makes a copy of the list in order to avoid using the same
1504 id field in case this is called for multiple instances.
1505 """
1506 LOG.debug("block_device_mapping %s", list(block_device_mapping),
1507 instance_uuid=instance_uuid)
1508 instance_block_device_mapping = copy.deepcopy(block_device_mapping)
1509 for bdm in instance_block_device_mapping:
1510 bdm.volume_size = self._volume_size(flavor, bdm)
1511 bdm.instance_uuid = instance_uuid
1512 with obj_target_cell(bdm, cell):
1513 bdm.update_or_create()
1514 return instance_block_device_mapping
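# [Illustrative sketch, editor addition, not part of the nova source] Why the
# deepcopy above matters: if the same BDM list were reused for several
# instances without copying, each pass would overwrite the shared objects'
# instance_uuid (and their db ids on create). A tiny stand-alone illustration
# with dicts rather than BlockDeviceMapping objects:
#
#     import copy
#     template = [{'volume_size': None, 'instance_uuid': None}]
#     per_instance = copy.deepcopy(template)
#     per_instance[0]['instance_uuid'] = 'uuid-1'
#     assert template[0]['instance_uuid'] is None   # template stays pristine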
1516 def _create_tags(self, context, instance_uuid, tags):
1517 """Create the Tags objects in the db."""
1518 if tags:
1519 tag_list = [tag.tag for tag in tags]
1520 instance_tags = objects.TagList.create(
1521 context, instance_uuid, tag_list)
1522 return instance_tags
1523 else:
1524 return tags
1526 def _create_instance_action_for_cell0(self, context, instance, exc):
1527 """Create a failed "create" instance action for the instance in cell0.
1529 :param context: nova auth RequestContext targeted at cell0
1530 :param instance: Instance object being buried in cell0
1531 :param exc: Exception that occurred which resulted in burial
1532 """
1533 # First create the action record.
1534 objects.InstanceAction.action_start(
1535 context, instance.uuid, instance_actions.CREATE, want_result=False)
1536 # Now create an event for that action record.
1537 event_name = 'conductor_schedule_and_build_instances'
1538 objects.InstanceActionEvent.event_start(
1539 context, instance.uuid, event_name, want_result=False,
1540 host=self.host)
1541 # And finish the event with the exception. Note that we expect this
1542 # method to be called from _bury_in_cell0, which is called from within
1543 # an exception handler, so sys.exc_info should return values; if not,
1544 # it's not the end of the world - this is best effort.
1545 objects.InstanceActionEvent.event_finish_with_failure(
1546 context, instance.uuid, event_name, exc_val=exc,
1547 exc_tb=sys.exc_info()[2], want_result=False)
1549 def _bury_in_cell0(self, context, request_spec, exc,
1550 build_requests=None, instances=None,
1551 block_device_mapping=None,
1552 tags=None):
1553 """Ensure all provided build_requests and instances end up in cell0.
1555 Cell0 is the fake cell we schedule dead instances to when we can't
1556 schedule them somewhere real. Requests that don't yet have instances
1557 will get a new instance, created in cell0. Instances that have not yet
1558 been created will be created in cell0. All build requests are destroyed
1559 after we're done. Failure to delete a build request will trigger the
1560 instance deletion, just like the happy path in
1561 schedule_and_build_instances() below.
1562 """
1563 try:
1564 cell0 = objects.CellMapping.get_by_uuid(
1565 context, objects.CellMapping.CELL0_UUID)
1566 except exception.CellMappingNotFound:
1567 # Not yet set up for cellsv2. Instances will need to be written
1568 # to the configured database. This will become a deployment
1569 # error in Ocata.
1570 LOG.error('No cell mapping found for cell0 while '
1571 'trying to record scheduling failure. '
1572 'Setup is incomplete.')
1573 return
1575 build_requests = build_requests or []
1576 instances = instances or []
1577 instances_by_uuid = {inst.uuid: inst for inst in instances}
1578 for build_request in build_requests:
1579 if build_request.instance_uuid not in instances_by_uuid:
1580 # This is an instance object with no matching db entry.
1581 instance = build_request.get_new_instance(context)
1582 instances_by_uuid[instance.uuid] = instance
1584 updates = {'vm_state': vm_states.ERROR, 'task_state': None}
1585 for instance in instances_by_uuid.values():
1587 inst_mapping = None
1588 try:
1589 # We don't need the cell0-targeted context here because the
1590 # instance mapping is in the API DB.
1591 inst_mapping = \
1592 objects.InstanceMapping.get_by_instance_uuid(
1593 context, instance.uuid)
1594 except exception.InstanceMappingNotFound:
1595 # The API created the instance mapping record so it should
1596 # definitely be here. Log an error but continue to create the
1597 # instance in the cell0 database.
1598 LOG.error('While burying instance in cell0, no instance '
1599 'mapping was found.', instance=instance)
1601 # Perform a final sanity check that the instance is not mapped
1602 # to some other cell already because of maybe some crazy
1603 # clustered message queue weirdness.
1604 if inst_mapping and inst_mapping.cell_mapping is not None:
1605 LOG.error('When attempting to bury instance in cell0, the '
1606 'instance is already mapped to cell %s. Ignoring '
1607 'bury in cell0 attempt.',
1608 inst_mapping.cell_mapping.identity,
1609 instance=instance)
1610 continue
1612 with obj_target_cell(instance, cell0) as cctxt:
1613 instance.create()
1614 if inst_mapping:
1615 inst_mapping.cell_mapping = cell0
1616 inst_mapping.save()
1618 # Record an instance action with a failed event.
1619 self._create_instance_action_for_cell0(
1620 cctxt, instance, exc)
1622 # NOTE(mnaser): In order to properly clean up volumes after
1623 # being buried in cell0, we need to store BDMs.
1624 if block_device_mapping:
1625 self._create_block_device_mapping(
1626 cell0, instance.flavor, instance.uuid,
1627 block_device_mapping)
1629 self._create_tags(cctxt, instance.uuid, tags)
1631 # Use the context targeted to cell0 here since the instance is
1632 # now in cell0.
1633 self._set_vm_state_and_notify(
1634 cctxt, instance.uuid, 'build_instances', updates,
1635 exc, request_spec)
1637 for build_request in build_requests:
1638 try:
1639 build_request.destroy()
1640 except exception.BuildRequestNotFound:
1641 # Instance was deleted before we finished scheduling
1642 inst = instances_by_uuid[build_request.instance_uuid]
1643 with obj_target_cell(inst, cell0):
1644 inst.destroy()
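# [Illustrative sketch, editor addition, not part of the nova source] The
# first half of _bury_in_cell0 merges build requests and already-created
# instances into one dict keyed by uuid, so every request ends up with exactly
# one instance to bury. A hedged stand-in using SimpleNamespace instead of
# nova objects (uuids are fake):
#
#     from types import SimpleNamespace
#     instances = [SimpleNamespace(uuid='a')]
#     build_requests = [SimpleNamespace(instance_uuid='a'),
#                       SimpleNamespace(instance_uuid='b')]
#     instances_by_uuid = {inst.uuid: inst for inst in instances}
#     for br in build_requests:
#         if br.instance_uuid not in instances_by_uuid:
#             # the real code calls build_request.get_new_instance(context)
#             instances_by_uuid[br.instance_uuid] = SimpleNamespace(
#                 uuid=br.instance_uuid)
#     assert set(instances_by_uuid) == {'a', 'b'}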
1646 def schedule_and_build_instances(self, context, build_requests,
1647 request_specs, image,
1648 admin_password, injected_files,
1649 requested_networks, block_device_mapping,
1650 tags=None):
1651 # Add all the UUIDs for the instances
1652 instance_uuids = [spec.instance_uuid for spec in request_specs]
1653 try:
1654 host_lists = self._schedule_instances(context, request_specs[0],
1655 instance_uuids, return_alternates=True)
1656 except Exception as exc:
1657 LOG.exception('Failed to schedule instances')
1658 self._bury_in_cell0(context, request_specs[0], exc,
1659 build_requests=build_requests,
1660 block_device_mapping=block_device_mapping,
1661 tags=tags)
1662 return
1664 host_mapping_cache = {}
1665 cell_mapping_cache = {}
1666 instances = []
1667 host_az = {} # host=az cache to optimize multi-create
1669 for (build_request, request_spec, host_list) in zip(
1670 build_requests, request_specs, host_lists):
1671 instance = build_request.get_new_instance(context)
1672 # host_list is a list of one or more Selection objects, the first
1673 # of which has been selected and its resources claimed.
1674 host = host_list[0]
1675 # Convert host from the scheduler into a cell record
1676 if host.service_host not in host_mapping_cache:
1677 try:
1678 host_mapping = objects.HostMapping.get_by_host(
1679 context, host.service_host)
1680 host_mapping_cache[host.service_host] = host_mapping
1681 except exception.HostMappingNotFound as exc:
1682 LOG.error('No host-to-cell mapping found for selected '
1683 'host %(host)s. Setup is incomplete.',
1684 {'host': host.service_host})
1685 self._bury_in_cell0(
1686 context, request_spec, exc,
1687 build_requests=[build_request], instances=[instance],
1688 block_device_mapping=block_device_mapping,
1689 tags=tags)
1690 # This is a placeholder in case the quota recheck fails.
1691 instances.append(None)
1692 continue
1693 else:
1694 host_mapping = host_mapping_cache[host.service_host]
1696 cell = host_mapping.cell_mapping
1698 # Before we create the instance, let's make one final check that
1699 # the build request is still around and wasn't deleted by the user
1700 # already.
1701 try:
1702 objects.BuildRequest.get_by_instance_uuid(
1703 context, instance.uuid)
1704 except exception.BuildRequestNotFound:
1705 # the build request is gone so we're done for this instance
1706 LOG.debug('While scheduling instance, the build request '
1707 'was already deleted.', instance=instance)
1708 # This is a placeholder in case the quota recheck fails.
1709 instances.append(None)
1710 # If the build request was deleted and the instance is not
1711 # going to be created, there is no point in leaving an orphan
1712 # instance mapping, so delete it.
1713 try:
1714 im = objects.InstanceMapping.get_by_instance_uuid(
1715 context, instance.uuid)
1716 im.destroy()
1717 except exception.InstanceMappingNotFound:
1718 pass
1719 self.report_client.delete_allocation_for_instance(
1720 context, instance.uuid, force=True)
1721 continue
1722 else:
1723 if host.service_host not in host_az:
1724 host_az[host.service_host] = (
1725 availability_zones.get_host_availability_zone(
1726 context, host.service_host))
1727 instance.availability_zone = host_az[host.service_host]
1728 with obj_target_cell(instance, cell):
1729 instance.create()
1730 instances.append(instance)
1731 cell_mapping_cache[instance.uuid] = cell
1733 # NOTE(melwitt): We recheck the quota after allocating the
1734 # resources to prevent users from allocating more resources
1735 # than their allowed quota in the event of a race. This is
1736 # configurable because it can be expensive if strict quota
1737 # limits are not required in a deployment.
1738 try:
1739 self._recheck_quota(context, instance.flavor, request_specs[0],
1740 len(build_requests), project_id=instance.project_id,
1741 user_id=instance.user_id
1742 )
1743 except (exception.TooManyInstances,
1744 limit_exceptions.ProjectOverLimit) as exc:
1745 with excutils.save_and_reraise_exception():
1746 self._cleanup_build_artifacts(
1747 context, exc, instances, build_requests, request_specs,
1748 block_device_mapping, tags, cell_mapping_cache)
1750 zipped = zip(build_requests, request_specs, host_lists, instances)
1751 for (build_request, request_spec, host_list, instance) in zipped:
1752 if instance is None:
1753 # Skip placeholders that were buried in cell0 or had their
1754 # build requests deleted by the user before instance create.
1755 continue
1756 cell = cell_mapping_cache[instance.uuid]
1757 # host_list is a list of one or more Selection objects, the first
1758 # of which has been selected and its resources claimed.
1759 host = host_list.pop(0)
1760 alts = [(alt.service_host, alt.nodename) for alt in host_list]
1761 LOG.debug("Selected host: %s; Selected node: %s; Alternates: %s",
1762 host.service_host, host.nodename, alts, instance=instance)
1763 filter_props = request_spec.to_legacy_filter_properties_dict()
1764 scheduler_utils.populate_retry(filter_props, instance.uuid)
1765 scheduler_utils.populate_filter_properties(filter_props,
1766 host)
1768 # Now that we have a selected host (which has claimed resource
1769 # allocations in the scheduler) for this instance, we may need to
1770 # map allocations to resource providers in the request spec.
1771 try:
1772 scheduler_utils.fill_provider_mapping(request_spec, host)
1773 except Exception as exc:
1774 # If anything failed here we need to cleanup and bail out.
1775 with excutils.save_and_reraise_exception():
1776 self._cleanup_build_artifacts(
1777 context, exc, instances, build_requests, request_specs,
1778 block_device_mapping, tags, cell_mapping_cache)
1780 # TODO(melwitt): Maybe we should set_target_cell on the contexts
1781 # once we map to a cell, and remove these separate with statements.
1782 with obj_target_cell(instance, cell) as cctxt:
1783 # send a state update notification for the initial create to
1784 # show it going from non-existent to BUILDING
1785 # This can lazy-load attributes on instance.
1786 notifications.send_update_with_states(cctxt, instance, None,
1787 vm_states.BUILDING, None, None, service="conductor")
1788 objects.InstanceAction.action_start(
1789 cctxt, instance.uuid, instance_actions.CREATE,
1790 want_result=False)
1791 instance_bdms = self._create_block_device_mapping(
1792 cell, instance.flavor, instance.uuid, block_device_mapping)
1793 instance_tags = self._create_tags(cctxt, instance.uuid, tags)
1795 # TODO(Kevin Zheng): clean this up once instance.create() handles
1796 # tags; we do this so the instance.create notification in
1797 # build_and_run_instance in nova-compute doesn't lazy-load tags
1798 instance.tags = instance_tags if instance_tags \
1799 else objects.TagList()
1801 # Update mapping for instance.
1802 self._map_instance_to_cell(context, instance, cell)
1804 if not self._delete_build_request(
1805 context, build_request, instance, cell, instance_bdms,
1806 instance_tags):
1807 # The build request was deleted before/during scheduling so
1808 # the instance is gone and we don't have anything to build for
1809 # this one.
1810 continue
1812 try:
1813 accel_uuids = self._create_and_bind_arq_for_instance(
1814 context, instance, host.nodename, request_spec,
1815 requested_networks)
1816 except Exception as exc:
1817 with excutils.save_and_reraise_exception():
1818 self._cleanup_build_artifacts(
1819 context, exc, instances, build_requests, request_specs,
1820 block_device_mapping, tags, cell_mapping_cache)
1822 # NOTE(danms): Compute RPC expects security group names or ids
1823 # not objects, so convert this to a list of names until we can
1824 # pass the objects.
1825 legacy_secgroups = [s.identifier
1826 for s in request_spec.security_groups]
1827 with obj_target_cell(instance, cell) as cctxt:
1828 self.compute_rpcapi.build_and_run_instance(
1829 cctxt, instance=instance, image=image,
1830 request_spec=request_spec,
1831 filter_properties=filter_props,
1832 admin_password=admin_password,
1833 injected_files=injected_files,
1834 requested_networks=requested_networks,
1835 security_groups=legacy_secgroups,
1836 block_device_mapping=instance_bdms,
1837 host=host.service_host, node=host.nodename,
1838 limits=host.limits, host_list=host_list,
1839 accel_uuids=accel_uuids)
1841 def _create_and_bind_arqs(
1842 self, cyclient, instance_uuid, extra_specs,
1843 hostname, resource_provider_mapping):
1844 """Create ARQs comes from extra specs, determine their RPs.
1846 The binding is asynchronous; Cyborg will notify on completion.
1847 The notification will be handled in the compute manager.
1848 """
1849 arqs = []
1850 bindings = {}
1851 dp_name = extra_specs.get('accel:device_profile')
1853 # profiles from request spec: Create ARQ and binding
1854 if not dp_name:
1855 # empty arq list and binding info
1856 return bindings
1858 LOG.debug('Calling Cyborg to get ARQs. dp_name=%s instance=%s',
1859 dp_name, instance_uuid)
1860 arqs = cyclient.create_arqs_and_match_resource_providers(
1861 dp_name, resource_provider_mapping)
1862 LOG.debug('Got ARQs with resource provider mapping %s', arqs)
1863 bindings = {
1864 arq['uuid']:
1865 {"hostname": hostname,
1866 "device_rp_uuid": arq['device_rp_uuid'],
1867 "instance_uuid": instance_uuid
1868 }
1869 for arq in arqs}
1871 return bindings
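# [Illustrative sketch, editor addition, not part of the nova source] Shape of
# the bindings dict returned above: each ARQ uuid maps to the host, device
# resource provider and instance it should be bound to. The literals below
# are invented placeholders, not real uuids:
#
#     arqs = [{'uuid': 'arq-1', 'device_rp_uuid': 'rp-1'}]
#     hostname, instance_uuid = 'compute-0', 'inst-1'
#     bindings = {
#         arq['uuid']: {'hostname': hostname,
#                       'device_rp_uuid': arq['device_rp_uuid'],
#                       'instance_uuid': instance_uuid}
#         for arq in arqs}
#     # -> {'arq-1': {'hostname': 'compute-0', 'device_rp_uuid': 'rp-1',
#     #               'instance_uuid': 'inst-1'}}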
1873 def _create_arqs_for_ports(self, cyclient, instance_uuid,
1874 requested_networks,
1875 hostname, resource_provider_mapping):
1876 """Create ARQs for port with backend device profile.
1878 The binding is asynchronous; Cyborg will notify on completion.
1879 The notification will be handled in the compute manager.
1880 """
1881 bindings = {}
1883 for request_net in requested_networks:
1884 if request_net.port_id and request_net.device_profile:
1885 device_profile = request_net.device_profile
1886 # the port doesn't support multiple devices
1887 arqs = cyclient.create_arqs(device_profile)
1888 if len(arqs) > 1:
1889 raise exception.AcceleratorRequestOpFailed(
1890 op=_('create'),
1891 msg='the port does not support multiple devices.')
1892 arq = arqs[0]
1894 LOG.debug("Create ARQ %s for port %s of instance %s",
1895 arq["uuid"], request_net.port_id, instance_uuid)
1896 request_net.arq_uuid = arq["uuid"]
1898 rp_uuid = cyclient.get_arq_device_rp_uuid(
1899 arq,
1900 resource_provider_mapping,
1901 request_net.port_id)
1903 arq_binding = {request_net.arq_uuid:
1904 {"hostname": hostname,
1905 "device_rp_uuid": rp_uuid,
1906 "instance_uuid": instance_uuid}
1907 }
1908 LOG.debug("ARQ %s binding: %s", request_net.arq_uuid,
1909 arq_binding)
1910 bindings.update(arq_binding)
1912 return bindings
1914 @staticmethod
1915 def _map_instance_to_cell(context, instance, cell):
1916 """Update the instance mapping to point at the given cell.
1918 During initial scheduling, once a host and cell are selected in which
1919 to build the instance, this method is used to update the instance
1920 mapping to point at that cell.
1922 :param context: nova auth RequestContext
1923 :param instance: Instance object being built
1924 :param cell: CellMapping representing the cell in which the instance
1925 was created and is being built.
1926 :returns: InstanceMapping object that was updated.
1927 """
1928 inst_mapping = objects.InstanceMapping.get_by_instance_uuid(
1929 context, instance.uuid)
1930 # Perform a final sanity check that the instance is not mapped
1931 # to some other cell already because of maybe some crazy
1932 # clustered message queue weirdness.
1933 if inst_mapping.cell_mapping is not None:
1934 LOG.error('During scheduling instance is already mapped to '
1935 'another cell: %s. This should not happen and is an '
1936 'indication of bigger problems. If you see this you '
1937 'should report it to the nova team. Overwriting '
1938 'the mapping to point at cell %s.',
1939 inst_mapping.cell_mapping.identity, cell.identity,
1940 instance=instance)
1941 inst_mapping.cell_mapping = cell
1942 inst_mapping.save()
1943 return inst_mapping
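# [Illustrative sketch, editor addition, not part of the nova source] The
# mapping update above is effectively "point the API-DB record for this
# instance at the chosen cell, warning if it was already pointed somewhere
# else". A toy stand-in using a dict as the 'API DB' (names are hypothetical):
#
#     instance_mappings = {'inst-1': None}     # uuid -> cell identity
#     def map_instance_to_cell(uuid, cell):
#         if instance_mappings[uuid] is not None:
#             print('already mapped to %s, overwriting' %
#                   instance_mappings[uuid])
#         instance_mappings[uuid] = cell
#     map_instance_to_cell('inst-1', 'cell1')
#     assert instance_mappings['inst-1'] == 'cell1'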
1945 def _cleanup_build_artifacts(self, context, exc, instances, build_requests,
1946 request_specs, block_device_mappings, tags,
1947 cell_mapping_cache):
1948 for (instance, build_request, request_spec) in zip(
1949 instances, build_requests, request_specs):
1950 # Skip placeholders that were buried in cell0 or had their
1951 # build requests deleted by the user before instance create.
1952 if instance is None:
1953 continue
1954 updates = {'vm_state': vm_states.ERROR, 'task_state': None}
1955 cell = cell_mapping_cache[instance.uuid]
1956 with try_target_cell(context, cell) as cctxt:
1957 self._set_vm_state_and_notify(cctxt, instance.uuid,
1958 'build_instances', updates, exc,
1959 request_spec)
1961 # In order to properly clean up volumes when deleting a server in
1962 # ERROR status with no host, we need to store BDMs in the same
1963 # cell.
1964 if block_device_mappings:
1965 self._create_block_device_mapping(
1966 cell, instance.flavor, instance.uuid,
1967 block_device_mappings)
1969 # Like BDMs, the server tags provided by the user when creating the
1970 # server should be persisted in the same cell so they can be shown
1971 # from the API.
1972 if tags:
1973 with nova_context.target_cell(context, cell) as cctxt:
1974 self._create_tags(cctxt, instance.uuid, tags)
1976 # NOTE(mdbooth): To avoid an incomplete instance record being
1977 # returned by the API, the instance mapping must be
1978 # created after the instance record is complete in
1979 # the cell, and before the build request is
1980 # destroyed.
1981 # TODO(mnaser): The cell mapping should already be populated by
1982 # this point to avoid setting it below here.
1983 inst_mapping = objects.InstanceMapping.get_by_instance_uuid(
1984 context, instance.uuid)
1985 inst_mapping.cell_mapping = cell
1986 inst_mapping.save()
1988 # Be paranoid about artifacts being deleted underneath us.
1989 try:
1990 build_request.destroy()
1991 except exception.BuildRequestNotFound:
1992 pass
1993 try:
1994 request_spec.destroy()
1995 except exception.RequestSpecNotFound:
1996 pass
1998 def _delete_build_request(self, context, build_request, instance, cell,
1999 instance_bdms, instance_tags):
2000 """Delete a build request after creating the instance in the cell.
2002 This method handles cleaning up the instance in case the build request
2003 is already deleted by the time we try to delete it.
2005 :param context: the context of the request being handled
2006 :type context: nova.context.RequestContext
2007 :param build_request: the build request to delete
2008 :type build_request: nova.objects.BuildRequest
2009 :param instance: the instance created from the build_request
2010 :type instance: nova.objects.Instance
2011 :param cell: the cell in which the instance was created
2012 :type cell: nova.objects.CellMapping
2013 :param instance_bdms: list of block device mappings for the instance
2014 :type instance_bdms: nova.objects.BlockDeviceMappingList
2015 :param instance_tags: list of tags for the instance
2016 :type instance_tags: nova.objects.TagList
2017 :returns: True if the build request was successfully deleted, False if
2018 the build request was already deleted and the instance is now gone.
2019 """
2020 try:
2021 build_request.destroy()
2022 except exception.BuildRequestNotFound:
2023 # This indicates an instance deletion request has been
2024 # processed, and the build should halt here. Clean up the
2025 # bdm, tags and instance record.
2026 with obj_target_cell(instance, cell) as cctxt:
2027 with compute_utils.notify_about_instance_delete(
2028 self.notifier, cctxt, instance,
2029 source=fields.NotificationSource.CONDUCTOR):
2030 try:
2031 instance.destroy()
2032 except exception.InstanceNotFound:
2033 pass
2034 except exception.ObjectActionError:
2035 # NOTE(melwitt): Instance became scheduled during
2036 # the destroy, "host changed". Refresh and re-destroy.
2037 try:
2038 instance.refresh()
2039 instance.destroy()
2040 except exception.InstanceNotFound:
2041 pass
2042 for bdm in instance_bdms:
2043 with obj_target_cell(bdm, cell):
2044 try:
2045 bdm.destroy()
2046 except exception.ObjectActionError:
2047 pass
2048 if instance_tags:
2049 with try_target_cell(context, cell) as target_ctxt:
2050 try:
2051 objects.TagList.destroy(target_ctxt, instance.uuid)
2052 except exception.InstanceNotFound:
2053 pass
2054 return False
2055 return True
2057 def cache_images(self, context, aggregate, image_ids):
2058 """Cache a set of images on the set of hosts in an aggregate.
2060 :param context: The RequestContext
2061 :param aggregate: The Aggregate object from the request to constrain
2062 the host list
2063 :param image_ids: The IDs of the images to cache
2064 """
2066 # TODO(mriedem): Consider including the list of images in the
2067 # notification payload.
2068 compute_utils.notify_about_aggregate_action(
2069 context, aggregate,
2070 fields.NotificationAction.IMAGE_CACHE,
2071 fields.NotificationPhase.START)
2073 clock = timeutils.StopWatch()
2074 threads = CONF.image_cache.precache_concurrency
2075 fetch_pool = eventlet.GreenPool(size=threads)
2077 hosts_by_cell = {}
2078 cells_by_uuid = {}
2079 # TODO(danms): Make this a much more efficient bulk query
2080 for hostname in aggregate.hosts:
2081 hmap = objects.HostMapping.get_by_host(context, hostname)
2082 cells_by_uuid.setdefault(hmap.cell_mapping.uuid, hmap.cell_mapping)
2083 hosts_by_cell.setdefault(hmap.cell_mapping.uuid, [])
2084 hosts_by_cell[hmap.cell_mapping.uuid].append(hostname)
2086 LOG.info('Preparing to request pre-caching of image(s) %(image_ids)s '
2087 'on %(hosts)i hosts across %(cells)i cells.',
2088 {'image_ids': ','.join(image_ids),
2089 'hosts': len(aggregate.hosts),
2090 'cells': len(hosts_by_cell)})
2091 clock.start()
2093 stats = collections.defaultdict(lambda: (0, 0, 0, 0))
2094 failed_images = collections.defaultdict(int)
2095 down_hosts = set()
2096 host_stats = {
2097 'completed': 0,
2098 'total': len(aggregate.hosts),
2099 }
2101 def host_completed(context, host, result):
2102 for image_id, status in result.items():
2103 cached, existing, error, unsupported = stats[image_id]
2104 if status == 'error':
2105 failed_images[image_id] += 1
2106 error += 1
2107 elif status == 'cached':
2108 cached += 1
2109 elif status == 'existing':
2110 existing += 1
2111 elif status == 'unsupported':
2112 unsupported += 1
2113 stats[image_id] = (cached, existing, error, unsupported)
2115 host_stats['completed'] += 1
2116 compute_utils.notify_about_aggregate_cache(context, aggregate,
2117 host, result,
2118 host_stats['completed'],
2119 host_stats['total'])
2121 def wrap_cache_images(ctxt, host, image_ids):
2122 result = self.compute_rpcapi.cache_images(
2123 ctxt,
2124 host=host,
2125 image_ids=image_ids)
2126 host_completed(context, host, result)
2128 def skipped_host(context, host, image_ids):
2129 result = {image: 'skipped' for image in image_ids}
2130 host_completed(context, host, result)
2132 for cell_uuid, hosts in hosts_by_cell.items():
2133 cell = cells_by_uuid[cell_uuid]
2134 with nova_context.target_cell(context, cell) as target_ctxt:
2135 for host in hosts:
2136 service = objects.Service.get_by_compute_host(target_ctxt,
2137 host)
2138 if not self.servicegroup_api.service_is_up(service):
2139 down_hosts.add(host)
2140 LOG.info(
2141 'Skipping image pre-cache request to compute '
2142 '%(host)r because it is not up',
2143 {'host': host})
2144 skipped_host(target_ctxt, host, image_ids)
2145 continue
2147 utils.pass_context(fetch_pool.spawn_n, wrap_cache_images,
2148 target_ctxt, host, image_ids)
2150 # Wait until all those things finish
2151 fetch_pool.waitall()
2153 overall_stats = {'cached': 0, 'existing': 0, 'error': 0,
2154 'unsupported': 0}
2155 for cached, existing, error, unsupported in stats.values():
2156 overall_stats['cached'] += cached
2157 overall_stats['existing'] += existing
2158 overall_stats['error'] += error
2159 overall_stats['unsupported'] += unsupported
2161 clock.stop()
2162 LOG.info('Image pre-cache operation for image(s) %(image_ids)s '
2163 'completed in %(time).2f seconds; '
2164 '%(cached)i cached, %(existing)i existing, %(error)i errors, '
2165 '%(unsupported)i unsupported, %(skipped)i skipped (down) '
2166 'hosts',
2167 {'image_ids': ','.join(image_ids),
2168 'time': clock.elapsed(),
2169 'cached': overall_stats['cached'],
2170 'existing': overall_stats['existing'],
2171 'error': overall_stats['error'],
2172 'unsupported': overall_stats['unsupported'],
2173 'skipped': len(down_hosts),
2174 })
2175 # Log error'd images specifically at warning level
2176 for image_id, fails in failed_images.items():
2177 LOG.warning('Image pre-cache operation for image %(image)s '
2178 'failed %(fails)i times',
2179 {'image': image_id,
2180 'fails': fails})
2182 compute_utils.notify_about_aggregate_action(
2183 context, aggregate,
2184 fields.NotificationAction.IMAGE_CACHE,
2185 fields.NotificationPhase.END)
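# [Illustrative sketch, editor addition, not part of the nova source] A
# condensed, standalone version of the per-image statistics aggregation used
# by cache_images above; the host results are invented and there is no RPC or
# green-thread fan-out here, only the counting logic:
#
#     import collections
#     stats = collections.defaultdict(lambda: (0, 0, 0, 0))
#     host_results = [{'img-1': 'cached'}, {'img-1': 'existing'},
#                     {'img-1': 'error'}]
#     for result in host_results:
#         for image_id, status in result.items():
#             cached, existing, error, unsupported = stats[image_id]
#             if status == 'cached':
#                 cached += 1
#             elif status == 'existing':
#                 existing += 1
#             elif status == 'error':
#                 error += 1
#             elif status == 'unsupported':
#                 unsupported += 1
#             stats[image_id] = (cached, existing, error, unsupported)
#     assert stats['img-1'] == (1, 1, 1, 0)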
2187 @targets_cell
2188 @wrap_instance_event(prefix='conductor')
2189 def confirm_snapshot_based_resize(self, context, instance, migration):
2190 """Executes the ConfirmResizeTask
2192 :param context: nova auth request context targeted at the target cell
2193 :param instance: Instance object in "resized" status from the target
2194 cell
2195 :param migration: Migration object from the target cell for the resize
2196 operation expected to have status "confirming"
2197 """
2198 task = cross_cell_migrate.ConfirmResizeTask(
2199 context, instance, migration, self.notifier, self.compute_rpcapi)
2200 task.execute()
2202 @targets_cell
2203 # NOTE(mriedem): Upon successful completion of RevertResizeTask the
2204 # instance is hard-deleted, along with its instance action record(s), from
2205 # the target cell database so EventReporter hits InstanceActionNotFound on
2206 # __exit__. Pass graceful_exit=True to avoid an ugly traceback.
2207 @wrap_instance_event(prefix='conductor', graceful_exit=True)
2208 def revert_snapshot_based_resize(self, context, instance, migration):
2209 """Executes the RevertResizeTask
2211 :param context: nova auth request context targeted at the target cell
2212 :param instance: Instance object in "resized" status from the target
2213 cell
2214 :param migration: Migration object from the target cell for the resize
2215 operation expected to have status "reverting"
2216 """
2217 task = cross_cell_migrate.RevertResizeTask(
2218 context, instance, migration, self.notifier, self.compute_rpcapi)
2219 task.execute()