Coverage for nova/conductor/tasks/migrate.py: 90%
151 statements
coverage.py v7.6.12, created at 2025-04-24 11:16 +0000
  1  # Licensed under the Apache License, Version 2.0 (the "License"); you may
  2  # not use this file except in compliance with the License. You may obtain
  3  # a copy of the License at
  4  #
  5  #    http://www.apache.org/licenses/LICENSE-2.0
  6  #
  7  # Unless required by applicable law or agreed to in writing, software
  8  # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  9  # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 10  # License for the specific language governing permissions and limitations
 11  # under the License.

 13  from oslo_log import log as logging
 14  from oslo_serialization import jsonutils

 16  from nova import availability_zones
 17  from nova.compute import utils as compute_utils
 18  from nova.conductor.tasks import base
 19  from nova.conductor.tasks import cross_cell_migrate
 20  from nova import exception
 21  from nova.i18n import _
 22  from nova import objects
 23  from nova.scheduler.client import report
 24  from nova.scheduler import utils as scheduler_utils

 26  LOG = logging.getLogger(__name__)

 29  def replace_allocation_with_migration(context, instance, migration):
 30      """Replace instance's allocation with one for a migration.

 32      :raises: keystoneauth1.exceptions.base.ClientException on failure to
 33               communicate with the placement API
 34      :raises: ConsumerAllocationRetrievalFailed if reading the current
 35               allocation from placement fails
 36      :raises: ComputeHostNotFound if the host of the instance is not found in
 37               the database
 38      :raises: AllocationMoveFailed if moving the allocation from the
 39               instance.uuid to the migration.uuid fails due to parallel
 40               placement operation on the instance consumer
 41      :raises: NoValidHost if placement rejects the update for other reasons
 42               (e.g. not enough resources)
 43      :returns: (source_compute_node, migration_allocation)
 44      """
 45      try:
 46          source_cn = objects.ComputeNode.get_by_host_and_nodename(
 47              context, instance.host, instance.node)
 48      except exception.ComputeHostNotFound:
 49          LOG.error('Unable to find record for source '
 50                    'node %(node)s on %(host)s',
 51                    {'host': instance.host, 'node': instance.node},
 52                    instance=instance)
 53          # A generic error like this will just error out the migration
 54          # and do any rollback required
 55          raise

 57      reportclient = report.report_client_singleton()

 59      orig_alloc = reportclient.get_allocs_for_consumer(
 60          context, instance.uuid)['allocations']
 61      root_alloc = orig_alloc.get(source_cn.uuid, {}).get('resources', {})
 62      if not root_alloc:
 63          # TODO(stephenfin): This was a valid code path when there was support
 64          # for multiple schedulers, but it should probably be an error now
 65          LOG.debug(
 66              'Unable to find existing allocations for instance on '
 67              'source compute node: %s',
 68              source_cn.uuid, instance=instance)
 69          return None, None

 71      # FIXME(gibi): This method is flawed in that it does not handle allocations
 72      # against sharing providers in any special way. This leads to duplicate
 73      # allocations against the sharing provider during migration.
 74      success = reportclient.move_allocations(context, instance.uuid,
 75                                              migration.uuid)
 76      if not success:
 77          LOG.error('Unable to replace resource claim on source '
 78                    'host %(host)s node %(node)s for instance',
 79                    {'host': instance.host,
 80                     'node': instance.node},
 81                    instance=instance)
 82          # Mimic the "no space" error that could have come from the
 83          # scheduler. Once we have an atomic replace operation, this
 84          # would be a severe error.
 85          raise exception.NoValidHost(
 86              reason=_('Unable to replace instance claim on source'))
 87      else:
 88          LOG.debug('Created allocations for migration %(mig)s on %(rp)s',
 89                    {'mig': migration.uuid, 'rp': source_cn.uuid})

 91      return source_cn, orig_alloc

 94  def revert_allocation_for_migration(context, source_cn, instance, migration):
 95      """Revert an allocation made for a migration back to the instance."""

 97      reportclient = report.report_client_singleton()

 99      # FIXME(gibi): This method is flawed in that it does not handle allocations
100      # against sharing providers in any special way. This leads to duplicate
101      # allocations against the sharing provider during migration.
102      success = reportclient.move_allocations(context, migration.uuid,
103                                              instance.uuid)
104      if not success:
105          LOG.error('Unable to replace resource claim on source '
106                    'host %(host)s node %(node)s for instance',
107                    {'host': instance.host,
108                     'node': instance.node},
109                    instance=instance)
110      else:
111          LOG.debug('Created allocations for instance %(inst)s on %(rp)s',
112                    {'inst': instance.uuid, 'rp': source_cn.uuid})
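
# Illustrative sketch (not part of nova/conductor/tasks/migrate.py, so it
# carries no line numbers or coverage data). The two helpers above are a
# matched pair around placement's move_allocations(), which re-keys the same
# resource allocations from one consumer UUID to another. A toy stand-in over
# plain dicts, using hypothetical names, might look like this:

def _move_allocations_sketch(allocs_by_consumer, src_uuid, dst_uuid):
    """Re-key allocations from src_uuid to dst_uuid (toy model only)."""
    moved = allocs_by_consumer.pop(src_uuid, None)
    if not moved:
        # In this toy model an empty source is treated as a failed move.
        return False
    allocs_by_consumer[dst_uuid] = moved
    return True

# replace_allocation_with_migration() performs the swap from instance.uuid to
# migration.uuid before scheduling; revert_allocation_for_migration() swaps it
# back if the migration has to be rolled back.
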
115  class MigrationTask(base.TaskBase):
116      def __init__(self, context, instance, flavor,
117                   request_spec, clean_shutdown, compute_rpcapi,
118                   query_client, report_client, host_list, network_api):
119          super(MigrationTask, self).__init__(context, instance)
120          self.clean_shutdown = clean_shutdown
121          self.request_spec = request_spec
122          self.flavor = flavor

124          self.compute_rpcapi = compute_rpcapi
125          self.query_client = query_client
126          self.reportclient = report_client
127          self.host_list = host_list
128          self.network_api = network_api

130          # Persist things from the happy path so we don't have to look
131          # them up if we need to roll back
132          self._migration = None
133          self._held_allocations = None
134          self._source_cn = None

136      def _preallocate_migration(self):
137          # If this is a rescheduled migration, don't create a new record.
138          migration_type = ("resize" if self.instance.flavor.id != self.flavor.id
139                            else "migration")
140          filters = {"instance_uuid": self.instance.uuid,
141                     "migration_type": migration_type,
142                     "status": "pre-migrating"}
143          migrations = objects.MigrationList.get_by_filters(self.context,
144                                                            filters).objects
145          if migrations:    [145 ↛ 146: the condition on line 145 was never true]
146              migration = migrations[0]
147          else:
148              migration = objects.Migration(context=self.context.elevated())
149              migration.old_instance_type_id = self.instance.flavor.id
150              migration.new_instance_type_id = self.flavor.id
151              migration.status = 'pre-migrating'
152              migration.instance_uuid = self.instance.uuid
153              migration.source_compute = self.instance.host
154              migration.source_node = self.instance.node
155              migration.migration_type = migration_type
156              migration.create()

158          self._migration = migration

160          self._source_cn, self._held_allocations = (
161              replace_allocation_with_migration(self.context,
162                                                self.instance,
163                                                self._migration))

165          return migration

167      def _set_requested_destination_cell(self, legacy_props):
168          instance_mapping = objects.InstanceMapping.get_by_instance_uuid(
169              self.context, self.instance.uuid)
170          if not ('requested_destination' in self.request_spec and
171                  self.request_spec.requested_destination):
172              self.request_spec.requested_destination = objects.Destination()
173          targeted = 'host' in self.request_spec.requested_destination
174          # NOTE(mriedem): If the user is allowed to perform a cross-cell resize
175          # then add the current cell to the request spec as "preferred" so the
176          # scheduler will (by default) weigh hosts within the current cell over
177          # hosts in another cell, all other things being equal. If the user is
178          # not allowed to perform cross-cell resize, then we limit the request
179          # spec and tell the scheduler to only look at hosts in the current
180          # cell.
181          cross_cell_allowed = (
182              self.request_spec.requested_destination.allow_cross_cell_move)
183          if targeted and cross_cell_allowed:
184              # If a target host is specified it might be in another cell so
185              # we cannot restrict the cell in this case. We would not prefer
186              # the source cell in that case either since we know where the
187              # user wants it to go. We just let the scheduler figure it out.
188              self.request_spec.requested_destination.cell = None
189          else:
190              self.request_spec.requested_destination.cell = (
191                  instance_mapping.cell_mapping)

193          # NOTE(takashin): In the case that the target host is specified,
194          # if the migration is failed, it is not necessary to retry
195          # the cold migration to the same host. So make sure that
196          # reschedule will not occur.
197          if targeted:
198              legacy_props.pop('retry', None)
199              self.request_spec.retry = None

201          # Log our plan before calling the scheduler.
202          if cross_cell_allowed and targeted:
203              LOG.debug('Not restricting cell for targeted cold migration.',
204                        instance=self.instance)
205          elif cross_cell_allowed:
206              LOG.debug('Allowing migration from cell %(cell)s',
207                        {'cell': instance_mapping.cell_mapping.identity},
208                        instance=self.instance)
209          else:
210              LOG.debug('Restricting to cell %(cell)s while migrating',
211                        {'cell': instance_mapping.cell_mapping.identity},
212                        instance=self.instance)

214      def _is_selected_host_in_source_cell(self, selection):
215          """Checks if the given Selection is in the same cell as the instance

217          :param selection: Selection object returned from the scheduler
218              ``select_destinations`` method.
219          :returns: True if the host Selection is in the same cell as the
220              instance, False otherwise.
221          """
222          # Note that the context is already targeted to the current cell in
223          # which the instance exists.
224          same_cell = selection.cell_uuid == self.context.cell_uuid
225          if not same_cell:
226              LOG.debug('Selected target host %s is in cell %s and instance is '
227                        'in cell: %s', selection.service_host,
228                        selection.cell_uuid, self.context.cell_uuid,
229                        instance=self.instance)
230          return same_cell

232      def _execute(self):
233          # NOTE(sbauza): Force_hosts/nodes needs to be reset if we want to make
234          # sure that the next destination is not forced to be the original host.
235          # This needs to be done before the populate_retry call otherwise
236          # retries will be disabled if the server was created with a forced
237          # host/node.
238          self.request_spec.reset_forced_destinations()

240          # TODO(sbauza): Remove once all the scheduler.utils methods accept a
241          # RequestSpec object in the signature.
242          legacy_props = self.request_spec.to_legacy_filter_properties_dict()
243          scheduler_utils.setup_instance_group(self.context, self.request_spec)
244          # If a target host is set in a requested destination,
245          # 'populate_retry' need not be executed.
246          if not ('requested_destination' in self.request_spec and
247                  self.request_spec.requested_destination and
248                  'host' in self.request_spec.requested_destination):
249              scheduler_utils.populate_retry(legacy_props,
250                                             self.instance.uuid)

252          port_res_req, req_lvl_params = (
253              self.network_api.get_requested_resource_for_instance(
254                  self.context, self.instance.uuid)
255          )
256          # NOTE(gibi): When cyborg or other module wants to handle similar
257          # non-nova resources then here we have to collect all the external
258          # resource requests in a single list and add them to the RequestSpec.
259          self.request_spec.requested_resources = port_res_req
260          self.request_spec.request_level_params = req_lvl_params
261          # NOTE(gibi): as PCI devices is tracked in placement we need to
262          # generate request groups from InstancePCIRequests. This will append
263          # new RequestGroup objects to the request_spec.requested_resources list
264          # if needed
265          self.request_spec.generate_request_groups_from_pci_requests()

267          self._set_requested_destination_cell(legacy_props)

269          # Once _preallocate_migration() is done, the source node allocation is
270          # moved from the instance consumer to the migration record consumer,
271          # and the instance consumer doesn't have any allocations. If this is
272          # the first time through here (not a reschedule), select_destinations
273          # below will allocate resources on the selected destination node for
274          # the instance consumer. If we're rescheduling, host_list is not None
275          # and we'll call claim_resources for the instance and the selected
276          # alternate. If we exhaust our alternates and raise MaxRetriesExceeded,
277          # the rollback() method should revert the allocation swaparoo and move
278          # the source node allocation from the migration record back to the
279          # instance record.
280          migration = self._preallocate_migration()

282          self.request_spec.ensure_project_and_user_id(self.instance)
283          self.request_spec.ensure_network_information(self.instance)
284          compute_utils.heal_reqspec_is_bfv(
285              self.context, self.request_spec, self.instance)
286          # On an initial call to migrate, 'self.host_list' will be None, so we
287          # have to call the scheduler to get a list of acceptable hosts to
288          # migrate to. That list will consist of a selected host, along with
289          # zero or more alternates. On a reschedule, though, the alternates will
290          # be passed to this object and stored in 'self.host_list', so we can
291          # pop the first alternate from the list to use for the destination, and
292          # pass the remaining alternates to the compute.
293          if self.host_list is None:    [293 ↛ 310: the condition on line 293 was always true]
294              selection = self._schedule()
295              if not self._is_selected_host_in_source_cell(selection):
296                  # If the selected host is in another cell, we need to execute
297                  # another task to do the cross-cell migration.
298                  LOG.info('Executing cross-cell resize task starting with '
299                           'target host: %s', selection.service_host,
300                           instance=self.instance)
301                  task = cross_cell_migrate.CrossCellMigrationTask(
302                      self.context, self.instance, self.flavor,
303                      self.request_spec, self._migration, self.compute_rpcapi,
304                      selection, self.host_list)
305                  task.execute()
306                  return
307          else:
308              # This is a reschedule that will use the supplied alternate hosts
309              # in the host_list as destinations.
310              selection = self._reschedule()

312          scheduler_utils.populate_filter_properties(legacy_props, selection)

314          (host, node) = (selection.service_host, selection.nodename)

316          # The availability_zone field was added in v1.1 of the Selection
317          # object so make sure to handle the case where it is missing.
318          if 'availability_zone' in selection:    [318 ↛ 319: the condition on line 318 was never true]
319              self.instance.availability_zone = selection.availability_zone
320          else:
321              self.instance.availability_zone = (
322                  availability_zones.get_host_availability_zone(
323                      self.context, host))

325          LOG.debug("Calling prep_resize with selected host: %s; "
326                    "Selected node: %s; Alternates: %s", host, node,
327                    self.host_list, instance=self.instance)
328          # RPC cast to the destination host to start the migration process.
329          self.compute_rpcapi.prep_resize(
330              # NOTE(mriedem): Using request_spec.image here is potentially
331              # dangerous if it is not kept up to date (i.e. rebuild/unshelve);
332              # seems like the sane thing to do would be to pass the current
333              # instance.image_meta since that is what MoveClaim will use for
334              # any NUMA topology claims on the destination host...
335              self.context, self.instance, self.request_spec.image,
336              self.flavor, host, migration,
337              request_spec=self.request_spec, filter_properties=legacy_props,
338              node=node, clean_shutdown=self.clean_shutdown,
339              host_list=self.host_list)

341      def _schedule(self):
342          selection_lists = self.query_client.select_destinations(
343              self.context, self.request_spec, [self.instance.uuid],
344              return_objects=True, return_alternates=True)
345          # Since there is only ever one instance to migrate per call, we
346          # just need the first returned element.
347          selection_list = selection_lists[0]

349          # Scheduler allocated resources on the first host so try that first
350          selection, self.host_list = selection_list[0], selection_list[1:]

352          scheduler_utils.fill_provider_mapping(self.request_spec, selection)
353          return selection

355      def _reschedule(self):
356          # Since the resources on these alternates may have been consumed and
357          # might not be able to support the migrated instance, we need to first
358          # claim the resources to verify the host still has sufficient
359          # available resources.
360          elevated = self.context.elevated()
361          host_available = False
362          selection = None
363          while self.host_list and not host_available:
364              selection = self.host_list.pop(0)
365              if selection.allocation_request:    [365 ↛ 368: the condition on line 365 was always true]
366                  alloc_req = jsonutils.loads(selection.allocation_request)
367              else:
368                  alloc_req = None
369              if alloc_req:    [369 ↛ 385: the condition on line 369 was always true]
370                  # If this call succeeds, the resources on the destination
371                  # host will be claimed by the instance.
372                  host_available = scheduler_utils.claim_resources(
373                      elevated, self.reportclient, self.request_spec,
374                      self.instance.uuid, alloc_req,
375                      selection.allocation_request_version)
376                  if host_available:
377                      scheduler_utils.fill_provider_mapping(
378                          self.request_spec, selection)
379              else:
380                  # Some deployments use different schedulers that do not
381                  # use Placement, so they will not have an
382                  # allocation_request to claim with. For those cases,
383                  # there is no concept of claiming, so just assume that
384                  # the host is valid.
385                  host_available = True
386          # There are no more available hosts. Raise a MaxRetriesExceeded
387          # exception in that case.
388          if not host_available:
389              reason = ("Exhausted all hosts available for retrying build "
390                        "failures for instance %(instance_uuid)s." %
391                        {"instance_uuid": self.instance.uuid})
392              raise exception.MaxRetriesExceeded(reason=reason)
393          return selection

395      def rollback(self, ex):
396          if self._migration:    [396 ↛ 400: the condition on line 396 was always true]
397              self._migration.status = 'error'
398              self._migration.save()

400          if not self._held_allocations:    [400 ↛ 401: the condition on line 400 was never true]
401              return

403          # NOTE(danms): We created new-style migration-based
404          # allocations for the instance, but failed before we kicked
405          # off the migration in the compute. Normally the latter would
406          # do that cleanup but we never got that far, so do it here and
407          # now.

409          revert_allocation_for_migration(self.context, self._source_cn,
410                                          self.instance, self._migration)
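
# Illustrative sketch (not part of this module, so it carries no line numbers
# or coverage data). rollback() above is invoked by the task driver in
# nova.conductor.tasks.base: the flow is roughly "run _execute(); on any
# exception, call rollback() and re-raise". A minimal standalone approximation
# of that driver pattern, with hypothetical names, is:

class _TaskSketch:
    """Toy version of the execute/rollback contract used by MigrationTask."""

    def execute(self):
        try:
            return self._execute()
        except Exception as ex:
            # Give the task a chance to undo partial work, e.g. move the
            # source-node allocation back from the migration consumer to
            # the instance consumer, before propagating the failure.
            self.rollback(ex)
            raise

    def _execute(self):
        raise NotImplementedError()

    def rollback(self, ex):
        raise NotImplementedError()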