Coverage for nova/conductor/tasks/migrate.py: 90%

151 statements  

coverage.py v7.6.12, created at 2025-04-24 11:16 +0000

1 # Licensed under the Apache License, Version 2.0 (the "License"); you may

2 # not use this file except in compliance with the License. You may obtain

3 # a copy of the License at

4 #

5 # http://www.apache.org/licenses/LICENSE-2.0

6 #

7 # Unless required by applicable law or agreed to in writing, software

8 # distributed under the License is distributed on an "AS IS" BASIS, WITHOUT

9 # WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the

10 # License for the specific language governing permissions and limitations

11 # under the License.

12

13 from oslo_log import log as logging

14 from oslo_serialization import jsonutils

15

16 from nova import availability_zones

17 from nova.compute import utils as compute_utils

18 from nova.conductor.tasks import base

19 from nova.conductor.tasks import cross_cell_migrate

20 from nova import exception

21 from nova.i18n import _

22 from nova import objects

23 from nova.scheduler.client import report

24 from nova.scheduler import utils as scheduler_utils

25

26 LOG = logging.getLogger(__name__)

27 

28 

29 def replace_allocation_with_migration(context, instance, migration): 

30 """Replace instance's allocation with one for a migration. 

31 

32 :raises: keystoneauth1.exceptions.base.ClientException on failure to 

33 communicate with the placement API 

34 :raises: ConsumerAllocationRetrievalFailed if reading the current 

35 allocation from placement fails 

36 :raises: ComputeHostNotFound if the host of the instance is not found in 

37 the database 

38 :raises: AllocationMoveFailed if moving the allocation from the 

39 instance.uuid to the migration.uuid fails due to parallel 

40 placement operation on the instance consumer 

41 :raises: NoValidHost if placement rejects the update for other reasons 

42 (e.g. not enough resources) 

43 :returns: (source_compute_node, migration_allocation) 

44 """ 

45 try: 

46 source_cn = objects.ComputeNode.get_by_host_and_nodename( 

47 context, instance.host, instance.node) 

48 except exception.ComputeHostNotFound: 

49 LOG.error('Unable to find record for source ' 

50 'node %(node)s on %(host)s', 

51 {'host': instance.host, 'node': instance.node}, 

52 instance=instance) 

53 # A generic error like this will just error out the migration 

54 # and do any rollback required 

55 raise 

56 

57 reportclient = report.report_client_singleton() 

58 

59 orig_alloc = reportclient.get_allocs_for_consumer( 

60 context, instance.uuid)['allocations'] 

61 root_alloc = orig_alloc.get(source_cn.uuid, {}).get('resources', {}) 

62 if not root_alloc: 

63 # TODO(stephenfin): This was a valid code path when there was support 

64 # for multiple schedulers, but it should probably be an error now 

65 LOG.debug( 

66 'Unable to find existing allocations for instance on ' 

67 'source compute node: %s', 

68 source_cn.uuid, instance=instance) 

69 return None, None 

70 

71 # FIXME(gibi): This method is flawed in that it does not handle allocations 

72 # against sharing providers in any special way. This leads to duplicate 

73 # allocations against the sharing provider during migration. 

74 success = reportclient.move_allocations(context, instance.uuid, 

75 migration.uuid) 

76 if not success: 

77 LOG.error('Unable to replace resource claim on source ' 

78 'host %(host)s node %(node)s for instance', 

79 {'host': instance.host, 

80 'node': instance.node}, 

81 instance=instance) 

82 # Mimic the "no space" error that could have come from the 

83 # scheduler. Once we have an atomic replace operation, this 

84 # would be a severe error. 

85 raise exception.NoValidHost( 

86 reason=_('Unable to replace instance claim on source')) 

87 else: 

88 LOG.debug('Created allocations for migration %(mig)s on %(rp)s', 

89 {'mig': migration.uuid, 'rp': source_cn.uuid}) 

90 

91 return source_cn, orig_alloc 

92 

93 

94 def revert_allocation_for_migration(context, source_cn, instance, migration): 

95 """Revert an allocation made for a migration back to the instance.""" 

96 

97 reportclient = report.report_client_singleton() 

98 

99 # FIXME(gibi): This method is flawed in that it does not handle allocations 

100 # against sharing providers in any special way. This leads to duplicate 

101 # allocations against the sharing provider during migration. 

102 success = reportclient.move_allocations(context, migration.uuid, 

103 instance.uuid) 

104 if not success: 

105 LOG.error('Unable to replace resource claim on source ' 

106 'host %(host)s node %(node)s for instance', 

107 {'host': instance.host, 

108 'node': instance.node}, 

109 instance=instance) 

110 else: 

111 LOG.debug('Created allocations for instance %(inst)s on %(rp)s', 

112 {'inst': instance.uuid, 'rp': source_cn.uuid}) 

113 

114 
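The two module-level helpers above are used as a pair: replace_allocation_with_migration() moves the source-node allocation from the instance consumer to the migration consumer before the move starts, and revert_allocation_for_migration() moves it back if the migration never gets that far. A minimal sketch of that pairing, assuming a targeted request context plus Instance and Migration objects are already in hand (the wrapper name and the do_move callable are illustrative, not part of migrate.py):

    # Editor's sketch, not part of migrate.py; ctxt, instance, migration
    # and do_move are assumed to be supplied by the caller.
    from nova.conductor.tasks import migrate

    def swap_allocations_with_rollback(ctxt, instance, migration, do_move):
        # May raise ComputeHostNotFound, NoValidHost, etc. (see the helper's
        # docstring); returns (None, None) if no source allocation was found.
        source_cn, held_allocs = migrate.replace_allocation_with_migration(
            ctxt, instance, migration)
        try:
            do_move()
        except Exception:
            # Only revert if an allocation was actually moved.
            if held_allocs:
                migrate.revert_allocation_for_migration(
                    ctxt, source_cn, instance, migration)
            raise

MigrationTask below follows the same shape: _preallocate_migration() performs the swap and rollback() performs the revert.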

115 class MigrationTask(base.TaskBase): 

116 def __init__(self, context, instance, flavor, 

117 request_spec, clean_shutdown, compute_rpcapi, 

118 query_client, report_client, host_list, network_api): 

119 super(MigrationTask, self).__init__(context, instance) 

120 self.clean_shutdown = clean_shutdown 

121 self.request_spec = request_spec 

122 self.flavor = flavor 

123 

124 self.compute_rpcapi = compute_rpcapi 

125 self.query_client = query_client 

126 self.reportclient = report_client 

127 self.host_list = host_list 

128 self.network_api = network_api 

129 

130 # Persist things from the happy path so we don't have to look 

131 # them up if we need to roll back 

132 self._migration = None 

133 self._held_allocations = None 

134 self._source_cn = None 

135 

136 def _preallocate_migration(self): 

137 # If this is a rescheduled migration, don't create a new record. 

138 migration_type = ("resize" if self.instance.flavor.id != self.flavor.id 

139 else "migration") 

140 filters = {"instance_uuid": self.instance.uuid, 

141 "migration_type": migration_type, 

142 "status": "pre-migrating"} 

143 migrations = objects.MigrationList.get_by_filters(self.context, 

144 filters).objects 

145 if migrations:    145 ↛ 146 (line 145 didn't jump to line 146 because the condition on line 145 was never true)

146 migration = migrations[0] 

147 else: 

148 migration = objects.Migration(context=self.context.elevated()) 

149 migration.old_instance_type_id = self.instance.flavor.id 

150 migration.new_instance_type_id = self.flavor.id 

151 migration.status = 'pre-migrating' 

152 migration.instance_uuid = self.instance.uuid 

153 migration.source_compute = self.instance.host 

154 migration.source_node = self.instance.node 

155 migration.migration_type = migration_type 

156 migration.create() 

157 

158 self._migration = migration 

159 

160 self._source_cn, self._held_allocations = ( 

161 replace_allocation_with_migration(self.context, 

162 self.instance, 

163 self._migration)) 

164 

165 return migration 

166 

167 def _set_requested_destination_cell(self, legacy_props): 

168 instance_mapping = objects.InstanceMapping.get_by_instance_uuid( 

169 self.context, self.instance.uuid) 

170 if not ('requested_destination' in self.request_spec and 

171 self.request_spec.requested_destination): 

172 self.request_spec.requested_destination = objects.Destination() 

173 targeted = 'host' in self.request_spec.requested_destination 

174 # NOTE(mriedem): If the user is allowed to perform a cross-cell resize 

175 # then add the current cell to the request spec as "preferred" so the 

176 # scheduler will (by default) weigh hosts within the current cell over 

177 # hosts in another cell, all other things being equal. If the user is 

178 # not allowed to perform cross-cell resize, then we limit the request 

179 # spec and tell the scheduler to only look at hosts in the current 

180 # cell. 

181 cross_cell_allowed = ( 

182 self.request_spec.requested_destination.allow_cross_cell_move) 

183 if targeted and cross_cell_allowed: 

184 # If a target host is specified it might be in another cell so 

185 # we cannot restrict the cell in this case. We would not prefer 

186 # the source cell in that case either since we know where the 

187 # user wants it to go. We just let the scheduler figure it out. 

188 self.request_spec.requested_destination.cell = None 

189 else: 

190 self.request_spec.requested_destination.cell = ( 

191 instance_mapping.cell_mapping) 

192 

193 # NOTE(takashin): In the case that the target host is specified, 

194 # if the migration fails, it is not necessary to retry 

195 # the cold migration to the same host. So make sure that 

196 # reschedule will not occur. 

197 if targeted: 

198 legacy_props.pop('retry', None) 

199 self.request_spec.retry = None 

200 

201 # Log our plan before calling the scheduler. 

202 if cross_cell_allowed and targeted: 

203 LOG.debug('Not restricting cell for targeted cold migration.', 

204 instance=self.instance) 

205 elif cross_cell_allowed: 

206 LOG.debug('Allowing migration from cell %(cell)s', 

207 {'cell': instance_mapping.cell_mapping.identity}, 

208 instance=self.instance) 

209 else: 

210 LOG.debug('Restricting to cell %(cell)s while migrating', 

211 {'cell': instance_mapping.cell_mapping.identity}, 

212 instance=self.instance) 

213 

214 def _is_selected_host_in_source_cell(self, selection): 

215 """Checks if the given Selection is in the same cell as the instance 

216 

217 :param selection: Selection object returned from the scheduler 

218 ``select_destinations`` method. 

219 :returns: True if the host Selection is in the same cell as the 

220 instance, False otherwise. 

221 """ 

222 # Note that the context is already targeted to the current cell in 

223 # which the instance exists. 

224 same_cell = selection.cell_uuid == self.context.cell_uuid 

225 if not same_cell: 

226 LOG.debug('Selected target host %s is in cell %s and instance is ' 

227 'in cell: %s', selection.service_host, 

228 selection.cell_uuid, self.context.cell_uuid, 

229 instance=self.instance) 

230 return same_cell 

231 

232 def _execute(self): 

233 # NOTE(sbauza): Force_hosts/nodes needs to be reset if we want to make 

234 # sure that the next destination is not forced to be the original host. 

235 # This needs to be done before the populate_retry call otherwise 

236 # retries will be disabled if the server was created with a forced 

237 # host/node. 

238 self.request_spec.reset_forced_destinations() 

239 

240 # TODO(sbauza): Remove once all the scheduler.utils methods accept a 

241 # RequestSpec object in the signature. 

242 legacy_props = self.request_spec.to_legacy_filter_properties_dict() 

243 scheduler_utils.setup_instance_group(self.context, self.request_spec) 

244 # If a target host is set in a requested destination, 

245 # 'populate_retry' need not be executed. 

246 if not ('requested_destination' in self.request_spec and 

247 self.request_spec.requested_destination and 

248 'host' in self.request_spec.requested_destination): 

249 scheduler_utils.populate_retry(legacy_props, 

250 self.instance.uuid) 

251 

252 port_res_req, req_lvl_params = ( 

253 self.network_api.get_requested_resource_for_instance( 

254 self.context, self.instance.uuid) 

255 ) 

256 # NOTE(gibi): When cyborg or other module wants to handle similar 

257 # non-nova resources then here we have to collect all the external 

258 # resource requests in a single list and add them to the RequestSpec. 

259 self.request_spec.requested_resources = port_res_req 

260 self.request_spec.request_level_params = req_lvl_params 

261 # NOTE(gibi): as PCI devices are tracked in placement we need to 

262 # generate request groups from InstancePCIRequests. This will append 

263 # new RequestGroup objects to the request_spec.requested_resources list 

264 # if needed 

265 self.request_spec.generate_request_groups_from_pci_requests() 

266 

267 self._set_requested_destination_cell(legacy_props) 

268 

269 # Once _preallocate_migration() is done, the source node allocation is 

270 # moved from the instance consumer to the migration record consumer, 

271 # and the instance consumer doesn't have any allocations. If this is 

272 # the first time through here (not a reschedule), select_destinations 

273 # below will allocate resources on the selected destination node for 

274 # the instance consumer. If we're rescheduling, host_list is not None 

275 # and we'll call claim_resources for the instance and the selected 

276 # alternate. If we exhaust our alternates and raise MaxRetriesExceeded, 

277 # the rollback() method should revert the allocation swaparoo and move 

278 # the source node allocation from the migration record back to the 

279 # instance record. 

280 migration = self._preallocate_migration() 

281 

282 self.request_spec.ensure_project_and_user_id(self.instance) 

283 self.request_spec.ensure_network_information(self.instance) 

284 compute_utils.heal_reqspec_is_bfv( 

285 self.context, self.request_spec, self.instance) 

286 # On an initial call to migrate, 'self.host_list' will be None, so we 

287 # have to call the scheduler to get a list of acceptable hosts to 

288 # migrate to. That list will consist of a selected host, along with 

289 # zero or more alternates. On a reschedule, though, the alternates will 

290 # be passed to this object and stored in 'self.host_list', so we can 

291 # pop the first alternate from the list to use for the destination, and 

292 # pass the remaining alternates to the compute. 

293 if self.host_list is None:    293 ↛ 310 (line 293 didn't jump to line 310 because the condition on line 293 was always true)

294 selection = self._schedule() 

295 if not self._is_selected_host_in_source_cell(selection): 

296 # If the selected host is in another cell, we need to execute 

297 # another task to do the cross-cell migration. 

298 LOG.info('Executing cross-cell resize task starting with ' 

299 'target host: %s', selection.service_host, 

300 instance=self.instance) 

301 task = cross_cell_migrate.CrossCellMigrationTask( 

302 self.context, self.instance, self.flavor, 

303 self.request_spec, self._migration, self.compute_rpcapi, 

304 selection, self.host_list) 

305 task.execute() 

306 return 

307 else: 

308 # This is a reschedule that will use the supplied alternate hosts 

309 # in the host_list as destinations. 

310 selection = self._reschedule() 

311 

312 scheduler_utils.populate_filter_properties(legacy_props, selection) 

313 

314 (host, node) = (selection.service_host, selection.nodename) 

315 

316 # The availability_zone field was added in v1.1 of the Selection 

317 # object so make sure to handle the case where it is missing. 

318 if 'availability_zone' in selection:    318 ↛ 319 (line 318 didn't jump to line 319 because the condition on line 318 was never true)

319 self.instance.availability_zone = selection.availability_zone 

320 else: 

321 self.instance.availability_zone = ( 

322 availability_zones.get_host_availability_zone( 

323 self.context, host)) 

324 

325 LOG.debug("Calling prep_resize with selected host: %s; " 

326 "Selected node: %s; Alternates: %s", host, node, 

327 self.host_list, instance=self.instance) 

328 # RPC cast to the destination host to start the migration process. 

329 self.compute_rpcapi.prep_resize( 

330 # NOTE(mriedem): Using request_spec.image here is potentially 

331 # dangerous if it is not kept up to date (i.e. rebuild/unshelve); 

332 # seems like the sane thing to do would be to pass the current 

333 # instance.image_meta since that is what MoveClaim will use for 

334 # any NUMA topology claims on the destination host... 

335 self.context, self.instance, self.request_spec.image, 

336 self.flavor, host, migration, 

337 request_spec=self.request_spec, filter_properties=legacy_props, 

338 node=node, clean_shutdown=self.clean_shutdown, 

339 host_list=self.host_list) 

340 

341 def _schedule(self): 

342 selection_lists = self.query_client.select_destinations( 

343 self.context, self.request_spec, [self.instance.uuid], 

344 return_objects=True, return_alternates=True) 

345 # Since there is only ever one instance to migrate per call, we 

346 # just need the first returned element. 

347 selection_list = selection_lists[0] 

348 

349 # Scheduler allocated resources on the first host so try that first 

350 selection, self.host_list = selection_list[0], selection_list[1:] 

351 

352 scheduler_utils.fill_provider_mapping(self.request_spec, selection) 

353 return selection 

354 

355 def _reschedule(self): 

356 # Since the resources on these alternates may have been consumed and 

357 # might not be able to support the migrated instance, we need to first 

358 # claim the resources to verify the host still has sufficient 

359 # available resources. 

360 elevated = self.context.elevated() 

361 host_available = False 

362 selection = None 

363 while self.host_list and not host_available: 

364 selection = self.host_list.pop(0) 

365 if selection.allocation_request:    365 ↛ 368 (line 365 didn't jump to line 368 because the condition on line 365 was always true)

366 alloc_req = jsonutils.loads(selection.allocation_request) 

367 else: 

368 alloc_req = None 

369 if alloc_req:    369 ↛ 385 (line 369 didn't jump to line 385 because the condition on line 369 was always true)

370 # If this call succeeds, the resources on the destination 

371 # host will be claimed by the instance. 

372 host_available = scheduler_utils.claim_resources( 

373 elevated, self.reportclient, self.request_spec, 

374 self.instance.uuid, alloc_req, 

375 selection.allocation_request_version) 

376 if host_available: 

377 scheduler_utils.fill_provider_mapping( 

378 self.request_spec, selection) 

379 else: 

380 # Some deployments use different schedulers that do not 

381 # use Placement, so they will not have an 

382 # allocation_request to claim with. For those cases, 

383 # there is no concept of claiming, so just assume that 

384 # the host is valid. 

385 host_available = True 

386 # There are no more available hosts. Raise a MaxRetriesExceeded 

387 # exception in that case. 

388 if not host_available: 

389 reason = ("Exhausted all hosts available for retrying build " 

390 "failures for instance %(instance_uuid)s." % 

391 {"instance_uuid": self.instance.uuid}) 

392 raise exception.MaxRetriesExceeded(reason=reason) 

393 return selection 

394 

395 def rollback(self, ex): 

396 if self._migration:    396 ↛ 400 (line 396 didn't jump to line 400 because the condition on line 396 was always true)

397 self._migration.status = 'error' 

398 self._migration.save() 

399 

400 if not self._held_allocations:    400 ↛ 401 (line 400 didn't jump to line 401 because the condition on line 400 was never true)

401 return 

402 

403 # NOTE(danms): We created new-style migration-based 

404 # allocations for the instance, but failed before we kicked 

405 # off the migration in the compute. Normally the latter would 

406 # do that cleanup but we never got that far, so do it here and 

407 # now. 

408 

409 revert_allocation_for_migration(self.context, self._source_cn, 

410 self.instance, self._migration)
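
For orientation, here is a rough, illustrative sketch of how a conductor-side caller could drive this task; the wrapper function is hypothetical, and the context, instance, flavor, request spec and client objects are assumed to already exist. Per base.TaskBase, execute() runs _execute() and calls rollback(ex) if it raises, which marks the migration as 'error' and returns any held allocation to the instance consumer, as seen above.

    # Editor's sketch, not part of migrate.py.
    from nova.conductor.tasks import migrate

    def run_cold_migration_task(context, instance, flavor, request_spec,
                                compute_rpcapi, query_client, report_client,
                                network_api):
        task = migrate.MigrationTask(
            context, instance, flavor, request_spec,
            clean_shutdown=True,
            compute_rpcapi=compute_rpcapi,
            query_client=query_client,
            report_client=report_client,
            host_list=None,   # None on the first attempt; a reschedule
                              # passes the remaining alternates instead
            network_api=network_api,
        )
        task.execute()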