Coverage for nova/conductor/tasks/migrate.py: 90%
151 statements
coverage.py v7.6.12, created at 2025-04-24 11:16 +0000
  1  # Licensed under the Apache License, Version 2.0 (the "License"); you may
  2  # not use this file except in compliance with the License. You may obtain
  3  # a copy of the License at
  4  #
  5  #    http://www.apache.org/licenses/LICENSE-2.0
  6  #
  7  # Unless required by applicable law or agreed to in writing, software
  8  # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
  9  # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
 10  # License for the specific language governing permissions and limitations
 11  # under the License.

 13  from oslo_log import log as logging
 14  from oslo_serialization import jsonutils

 16  from nova import availability_zones
 17  from nova.compute import utils as compute_utils
 18  from nova.conductor.tasks import base
 19  from nova.conductor.tasks import cross_cell_migrate
 20  from nova import exception
 21  from nova.i18n import _
 22  from nova import objects
 23  from nova.scheduler.client import report
 24  from nova.scheduler import utils as scheduler_utils

 26  LOG = logging.getLogger(__name__)

 29  def replace_allocation_with_migration(context, instance, migration):
 30      """Replace instance's allocation with one for a migration.

 32      :raises: keystoneauth1.exceptions.base.ClientException on failure to
 33               communicate with the placement API
 34      :raises: ConsumerAllocationRetrievalFailed if reading the current
 35               allocation from placement fails
 36      :raises: ComputeHostNotFound if the host of the instance is not found in
 37               the database
 38      :raises: AllocationMoveFailed if moving the allocation from the
 39               instance.uuid to the migration.uuid fails due to parallel
 40               placement operation on the instance consumer
 41      :raises: NoValidHost if placement rejects the update for other reasons
 42               (e.g. not enough resources)
 43      :returns: (source_compute_node, migration_allocation)
 44      """
 45      try:
 46          source_cn = objects.ComputeNode.get_by_host_and_nodename(
 47              context, instance.host, instance.node)
 48      except exception.ComputeHostNotFound:
 49          LOG.error('Unable to find record for source '
 50                    'node %(node)s on %(host)s',
 51                    {'host': instance.host, 'node': instance.node},
 52                    instance=instance)
 53          # A generic error like this will just error out the migration
 54          # and do any rollback required
 55          raise

 57      reportclient = report.report_client_singleton()

 59      orig_alloc = reportclient.get_allocs_for_consumer(
 60          context, instance.uuid)['allocations']
 61      root_alloc = orig_alloc.get(source_cn.uuid, {}).get('resources', {})
 62      if not root_alloc:
 63          # TODO(stephenfin): This was a valid code path when there was support
 64          # for multiple schedulers, but it should probably be an error now
 65          LOG.debug(
 66              'Unable to find existing allocations for instance on '
 67              'source compute node: %s',
 68              source_cn.uuid, instance=instance)
 69          return None, None

 71      # FIXME(gibi): This method is flawed in that it does not handle allocations
 72      # against sharing providers in any special way. This leads to duplicate
 73      # allocations against the sharing provider during migration.
 74      success = reportclient.move_allocations(context, instance.uuid,
 75                                              migration.uuid)
 76      if not success:
 77          LOG.error('Unable to replace resource claim on source '
 78                    'host %(host)s node %(node)s for instance',
 79                    {'host': instance.host,
 80                     'node': instance.node},
 81                    instance=instance)
 82          # Mimic the "no space" error that could have come from the
 83          # scheduler. Once we have an atomic replace operation, this
 84          # would be a severe error.
 85          raise exception.NoValidHost(
 86              reason=_('Unable to replace instance claim on source'))
 87      else:
 88          LOG.debug('Created allocations for migration %(mig)s on %(rp)s',
 89                    {'mig': migration.uuid, 'rp': source_cn.uuid})

 91      return source_cn, orig_alloc

 94  def revert_allocation_for_migration(context, source_cn, instance, migration):
 95      """Revert an allocation made for a migration back to the instance."""

 97      reportclient = report.report_client_singleton()

 99      # FIXME(gibi): This method is flawed in that it does not handle allocations
100      # against sharing providers in any special way. This leads to duplicate
101      # allocations against the sharing provider during migration.
102      success = reportclient.move_allocations(context, migration.uuid,
103                                              instance.uuid)
104      if not success:
105          LOG.error('Unable to replace resource claim on source '
106                    'host %(host)s node %(node)s for instance',
107                    {'host': instance.host,
108                     'node': instance.node},
109                    instance=instance)
110      else:
111          LOG.debug('Created allocations for instance %(inst)s on %(rp)s',
112                    {'inst': instance.uuid, 'rp': source_cn.uuid})
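
# Illustrative sketch (not part of nova/conductor/tasks/migrate.py, so it
# carries no line numbers or coverage data). The two helpers above are a
# matched pair around placement's move_allocations(), which re-keys the same
# resource allocations from one consumer UUID to another. A toy stand-in over
# plain dicts, using hypothetical names, might look like this:

def _move_allocations_sketch(allocs_by_consumer, src_uuid, dst_uuid):
    """Re-key allocations from src_uuid to dst_uuid (toy model only)."""
    moved = allocs_by_consumer.pop(src_uuid, None)
    if not moved:
        # In this toy model an empty source is treated as a failed move.
        return False
    allocs_by_consumer[dst_uuid] = moved
    return True

# replace_allocation_with_migration() performs the swap from instance.uuid to
# migration.uuid before scheduling; revert_allocation_for_migration() swaps it
# back if the migration has to be rolled back.
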
115  class MigrationTask(base.TaskBase):
116      def __init__(self, context, instance, flavor,
117                   request_spec, clean_shutdown, compute_rpcapi,
118                   query_client, report_client, host_list, network_api):
119          super(MigrationTask, self).__init__(context, instance)
120          self.clean_shutdown = clean_shutdown
121          self.request_spec = request_spec
122          self.flavor = flavor

124          self.compute_rpcapi = compute_rpcapi
125          self.query_client = query_client
126          self.reportclient = report_client
127          self.host_list = host_list
128          self.network_api = network_api

130          # Persist things from the happy path so we don't have to look
131          # them up if we need to roll back
132          self._migration = None
133          self._held_allocations = None
134          self._source_cn = None

136      def _preallocate_migration(self):
137          # If this is a rescheduled migration, don't create a new record.
138          migration_type = ("resize" if self.instance.flavor.id != self.flavor.id
139                            else "migration")
140          filters = {"instance_uuid": self.instance.uuid,
141                     "migration_type": migration_type,
142                     "status": "pre-migrating"}
143          migrations = objects.MigrationList.get_by_filters(self.context,
144                                                            filters).objects
145          if migrations:    [145 ↛ 146: the condition on line 145 was never true]
146              migration = migrations[0]
147          else:
148              migration = objects.Migration(context=self.context.elevated())
149              migration.old_instance_type_id = self.instance.flavor.id
150              migration.new_instance_type_id = self.flavor.id
151              migration.status = 'pre-migrating'
152              migration.instance_uuid = self.instance.uuid
153              migration.source_compute = self.instance.host
154              migration.source_node = self.instance.node
155              migration.migration_type = migration_type
156              migration.create()

158          self._migration = migration

160          self._source_cn, self._held_allocations = (
161              replace_allocation_with_migration(self.context,
162                                                self.instance,
163                                                self._migration))

165          return migration

167      def _set_requested_destination_cell(self, legacy_props):
168          instance_mapping = objects.InstanceMapping.get_by_instance_uuid(
169              self.context, self.instance.uuid)
170          if not ('requested_destination' in self.request_spec and
171                  self.request_spec.requested_destination):
172              self.request_spec.requested_destination = objects.Destination()
173          targeted = 'host' in self.request_spec.requested_destination
174          # NOTE(mriedem): If the user is allowed to perform a cross-cell resize
175          # then add the current cell to the request spec as "preferred" so the
176          # scheduler will (by default) weigh hosts within the current cell over
177          # hosts in another cell, all other things being equal. If the user is
178          # not allowed to perform cross-cell resize, then we limit the request
179          # spec and tell the scheduler to only look at hosts in the current
180          # cell.
181          cross_cell_allowed = (
182              self.request_spec.requested_destination.allow_cross_cell_move)
183          if targeted and cross_cell_allowed:
184              # If a target host is specified it might be in another cell so
185              # we cannot restrict the cell in this case. We would not prefer
186              # the source cell in that case either since we know where the
187              # user wants it to go. We just let the scheduler figure it out.
188              self.request_spec.requested_destination.cell = None
189          else:
190              self.request_spec.requested_destination.cell = (
191                  instance_mapping.cell_mapping)

193          # NOTE(takashin): In the case that the target host is specified,
194          # if the migration is failed, it is not necessary to retry
195          # the cold migration to the same host. So make sure that
196          # reschedule will not occur.
197          if targeted:
198              legacy_props.pop('retry', None)
199              self.request_spec.retry = None

201          # Log our plan before calling the scheduler.
202          if cross_cell_allowed and targeted:
203              LOG.debug('Not restricting cell for targeted cold migration.',
204                        instance=self.instance)
205          elif cross_cell_allowed:
206              LOG.debug('Allowing migration from cell %(cell)s',
207                        {'cell': instance_mapping.cell_mapping.identity},
208                        instance=self.instance)
209          else:
210              LOG.debug('Restricting to cell %(cell)s while migrating',
211                        {'cell': instance_mapping.cell_mapping.identity},
212                        instance=self.instance)

214      def _is_selected_host_in_source_cell(self, selection):
215          """Checks if the given Selection is in the same cell as the instance

217          :param selection: Selection object returned from the scheduler
218              ``select_destinations`` method.
219          :returns: True if the host Selection is in the same cell as the
220              instance, False otherwise.
221          """
222          # Note that the context is already targeted to the current cell in
223          # which the instance exists.
224          same_cell = selection.cell_uuid == self.context.cell_uuid
225          if not same_cell:
226              LOG.debug('Selected target host %s is in cell %s and instance is '
227                        'in cell: %s', selection.service_host,
228                        selection.cell_uuid, self.context.cell_uuid,
229                        instance=self.instance)
230          return same_cell

232      def _execute(self):
233          # NOTE(sbauza): Force_hosts/nodes needs to be reset if we want to make
234          # sure that the next destination is not forced to be the original host.
235          # This needs to be done before the populate_retry call otherwise
236          # retries will be disabled if the server was created with a forced
237          # host/node.
238          self.request_spec.reset_forced_destinations()

240          # TODO(sbauza): Remove once all the scheduler.utils methods accept a
241          # RequestSpec object in the signature.
242          legacy_props = self.request_spec.to_legacy_filter_properties_dict()
243          scheduler_utils.setup_instance_group(self.context, self.request_spec)
244          # If a target host is set in a requested destination,
245          # 'populate_retry' need not be executed.
246          if not ('requested_destination' in self.request_spec and
247                  self.request_spec.requested_destination and
248                  'host' in self.request_spec.requested_destination):
249              scheduler_utils.populate_retry(legacy_props,
250                                             self.instance.uuid)

252          port_res_req, req_lvl_params = (
253              self.network_api.get_requested_resource_for_instance(
254                  self.context, self.instance.uuid)
255          )
256          # NOTE(gibi): When cyborg or other module wants to handle similar
257          # non-nova resources then here we have to collect all the external
258          # resource requests in a single list and add them to the RequestSpec.
259          self.request_spec.requested_resources = port_res_req
260          self.request_spec.request_level_params = req_lvl_params
261          # NOTE(gibi): as PCI devices is tracked in placement we need to
262          # generate request groups from InstancePCIRequests. This will append
263          # new RequestGroup objects to the request_spec.requested_resources list
264          # if needed
265          self.request_spec.generate_request_groups_from_pci_requests()

267          self._set_requested_destination_cell(legacy_props)

269          # Once _preallocate_migration() is done, the source node allocation is
270          # moved from the instance consumer to the migration record consumer,
271          # and the instance consumer doesn't have any allocations. If this is
272          # the first time through here (not a reschedule), select_destinations
273          # below will allocate resources on the selected destination node for
274          # the instance consumer. If we're rescheduling, host_list is not None
275          # and we'll call claim_resources for the instance and the selected
276          # alternate. If we exhaust our alternates and raise MaxRetriesExceeded,
277          # the rollback() method should revert the allocation swaparoo and move
278          # the source node allocation from the migration record back to the
279          # instance record.
280          migration = self._preallocate_migration()

282          self.request_spec.ensure_project_and_user_id(self.instance)
283          self.request_spec.ensure_network_information(self.instance)
284          compute_utils.heal_reqspec_is_bfv(
285              self.context, self.request_spec, self.instance)
286          # On an initial call to migrate, 'self.host_list' will be None, so we
287          # have to call the scheduler to get a list of acceptable hosts to
288          # migrate to. That list will consist of a selected host, along with
289          # zero or more alternates. On a reschedule, though, the alternates will
290          # be passed to this object and stored in 'self.host_list', so we can
291          # pop the first alternate from the list to use for the destination, and
292          # pass the remaining alternates to the compute.
293          if self.host_list is None:    [293 ↛ 310: the condition on line 293 was always true]
294              selection = self._schedule()
295              if not self._is_selected_host_in_source_cell(selection):
296                  # If the selected host is in another cell, we need to execute
297                  # another task to do the cross-cell migration.
298                  LOG.info('Executing cross-cell resize task starting with '
299                           'target host: %s', selection.service_host,
300                           instance=self.instance)
301                  task = cross_cell_migrate.CrossCellMigrationTask(
302                      self.context, self.instance, self.flavor,
303                      self.request_spec, self._migration, self.compute_rpcapi,
304                      selection, self.host_list)
305                  task.execute()
306                  return
307          else:
308              # This is a reschedule that will use the supplied alternate hosts
309              # in the host_list as destinations.
310              selection = self._reschedule()

312          scheduler_utils.populate_filter_properties(legacy_props, selection)

314          (host, node) = (selection.service_host, selection.nodename)

316          # The availability_zone field was added in v1.1 of the Selection
317          # object so make sure to handle the case where it is missing.
318          if 'availability_zone' in selection:    [318 ↛ 319: the condition on line 318 was never true]
319              self.instance.availability_zone = selection.availability_zone
320          else:
321              self.instance.availability_zone = (
322                  availability_zones.get_host_availability_zone(
323                      self.context, host))

325          LOG.debug("Calling prep_resize with selected host: %s; "
326                    "Selected node: %s; Alternates: %s", host, node,
327                    self.host_list, instance=self.instance)
328          # RPC cast to the destination host to start the migration process.
329          self.compute_rpcapi.prep_resize(
330              # NOTE(mriedem): Using request_spec.image here is potentially
331              # dangerous if it is not kept up to date (i.e. rebuild/unshelve);
332              # seems like the sane thing to do would be to pass the current
333              # instance.image_meta since that is what MoveClaim will use for
334              # any NUMA topology claims on the destination host...
335              self.context, self.instance, self.request_spec.image,
336              self.flavor, host, migration,
337              request_spec=self.request_spec, filter_properties=legacy_props,
338              node=node, clean_shutdown=self.clean_shutdown,
339              host_list=self.host_list)

341      def _schedule(self):
342          selection_lists = self.query_client.select_destinations(
343              self.context, self.request_spec, [self.instance.uuid],
344              return_objects=True, return_alternates=True)
345          # Since there is only ever one instance to migrate per call, we
346          # just need the first returned element.
347          selection_list = selection_lists[0]

349          # Scheduler allocated resources on the first host so try that first
350          selection, self.host_list = selection_list[0], selection_list[1:]

352          scheduler_utils.fill_provider_mapping(self.request_spec, selection)
353          return selection

355      def _reschedule(self):
356          # Since the resources on these alternates may have been consumed and
357          # might not be able to support the migrated instance, we need to first
358          # claim the resources to verify the host still has sufficient
359          # available resources.
360          elevated = self.context.elevated()
361          host_available = False
362          selection = None
363          while self.host_list and not host_available:
364              selection = self.host_list.pop(0)
365              if selection.allocation_request:    [365 ↛ 368: the condition on line 365 was always true]
366                  alloc_req = jsonutils.loads(selection.allocation_request)
367              else:
368                  alloc_req = None
369              if alloc_req:    [369 ↛ 385: the condition on line 369 was always true]
370                  # If this call succeeds, the resources on the destination
371                  # host will be claimed by the instance.
372                  host_available = scheduler_utils.claim_resources(
373                      elevated, self.reportclient, self.request_spec,
374                      self.instance.uuid, alloc_req,
375                      selection.allocation_request_version)
376                  if host_available:
377                      scheduler_utils.fill_provider_mapping(
378                          self.request_spec, selection)
379              else:
380                  # Some deployments use different schedulers that do not
381                  # use Placement, so they will not have an
382                  # allocation_request to claim with. For those cases,
383                  # there is no concept of claiming, so just assume that
384                  # the host is valid.
385                  host_available = True
386          # There are no more available hosts. Raise a MaxRetriesExceeded
387          # exception in that case.
388          if not host_available:
389              reason = ("Exhausted all hosts available for retrying build "
390                        "failures for instance %(instance_uuid)s." %
391                        {"instance_uuid": self.instance.uuid})
392              raise exception.MaxRetriesExceeded(reason=reason)
393          return selection

395      def rollback(self, ex):
396          if self._migration:    [396 ↛ 400: the condition on line 396 was always true]
397              self._migration.status = 'error'
398              self._migration.save()

400          if not self._held_allocations:    [400 ↛ 401: the condition on line 400 was never true]
401              return

403          # NOTE(danms): We created new-style migration-based
404          # allocations for the instance, but failed before we kicked
405          # off the migration in the compute. Normally the latter would
406          # do that cleanup but we never got that far, so do it here and
407          # now.

409          revert_allocation_for_migration(self.context, self._source_cn,
410                                          self.instance, self._migration)
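
# Illustrative sketch (not part of this module, so it carries no line numbers
# or coverage data). rollback() above is invoked by the task driver in
# nova.conductor.tasks.base: the flow is roughly "run _execute(); on any
# exception, call rollback() and re-raise". A minimal standalone approximation
# of that driver pattern, with hypothetical names, is:

class _TaskSketch:
    """Toy version of the execute/rollback contract used by MigrationTask."""

    def execute(self):
        try:
            return self._execute()
        except Exception as ex:
            # Give the task a chance to undo partial work, e.g. move the
            # source-node allocation back from the migration consumer to
            # the instance consumer, before propagating the failure.
            self.rollback(ex)
            raise

    def _execute(self):
        raise NotImplementedError()

    def rollback(self, ex):
        raise NotImplementedError()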