Coverage for nova/pci/stats.py: 95%

357 statements  

coverage.py v7.6.12, created at 2025-04-24 11:16 +0000

1# Copyright (c) 2013 Intel, Inc. 

2# Copyright (c) 2013 OpenStack Foundation 

3# All Rights Reserved. 

4# 

5# Licensed under the Apache License, Version 2.0 (the "License"); you may 

6# not use this file except in compliance with the License. You may obtain 

7# a copy of the License at 

8# 

9# http://www.apache.org/licenses/LICENSE-2.0 

10# 

11# Unless required by applicable law or agreed to in writing, software 

12# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT 

13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the 

14# License for the specific language governing permissions and limitations 

15# under the License. 

16import collections 

17import copy 

18import typing as ty 

19 

20from oslo_config import cfg 

21from oslo_log import log as logging 

22from oslo_utils import strutils 

23 

24from nova import exception 

25from nova import objects 

26from nova.objects import fields 

27from nova.objects import pci_device_pool 

28from nova.pci.request import PCI_REMOTE_MANAGED_TAG 

29from nova.pci import utils 

30from nova.pci import whitelist 

31 

32CONF = cfg.CONF 

33LOG = logging.getLogger(__name__) 

34 

35 

36# TODO(stephenfin): We might want to use TypedDict here. Refer to 

37# https://mypy.readthedocs.io/en/latest/kinds_of_types.html#typeddict for 

38# more information. 

39Pool = ty.Dict[str, ty.Any] 

40 

41 
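# An illustrative sketch (not part of this module) of what the TypedDict
# mentioned in the TODO above could look like, limited to the keys this
# module itself reads and writes; tag key names below are inferred from how
# they are accessed later in the file, and arbitrary device_spec tags would
# still require total=False, which is one reason the plain dict alias is kept:
#
#     class Pool(ty.TypedDict, total=False):
#         vendor_id: str
#         product_id: str
#         numa_node: ty.Optional[int]
#         dev_type: str
#         address: str
#         rp_uuid: str
#         parent_ifname: str
#         remote_managed: str
#         live_migratable: str
#         count: int
#         devices: ty.List['objects.PciDevice']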

42class PciDeviceStats(object): 

43 

44 """PCI devices summary information. 

45 

46 According to the PCI SR-IOV spec, a PCI physical function can have up to 

47 256 PCI virtual functions, thus the number of assignable PCI functions in 

48 a cloud can be big. The scheduler needs to know all device availability 

49 information in order to determine which compute hosts can support a PCI 

50 request. Passing individual virtual device information to the scheduler 

51 does not scale, so we provide summary information. 

52 

53 Usually the virtual functions provided by a host PCI device have the same 

54 value for most properties, like vendor_id, product_id and class type. 

55 The PCI stats class summarizes this information for the scheduler. 

56 

57 The pci stats information is maintained exclusively by the compute node 

58 resource tracker and updated in the database. The scheduler fetches the 

59 information and selects the compute node accordingly. If a compute 

60 node is selected, the resource tracker allocates the devices to the 

61 instance and updates the pci stats information. 

62 

63 This summary information will be helpful for cloud management also. 

64 """ 

65 
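# For illustration only (made-up values): four equivalent VFs exposed by a
# single PF would be summarised as one pool entry resembling
#     {'vendor_id': '8086', 'product_id': '154c', 'numa_node': 0,
#      'dev_type': 'type-VF', 'address': '0000:81:00.0',
#      'count': 4, 'devices': [<4 PciDevice objects>]}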

66 pool_keys = ['product_id', 'vendor_id', 'numa_node', 'dev_type'] 

67 # these can be specified in the [pci]device_spec and can be requested via 

68 # the PCI alias, but they are matched by the placement 

69 # allocation_candidates query, so we can ignore them during pool creation 

70 # and during filtering here 

71 ignored_spec_tags = ignored_pool_tags = ['resource_class', 'traits'] 

72 # this is a metadata key in the spec that is matched 

73 # specially in _filter_pools_based_on_placement_allocation. So we can 

74 # ignore it in the general matching logic. 

75 ignored_spec_tags += ['rp_uuids'] 

76 # this is a metadata key in the pool that is matched 

77 # specially in _filter_pools_based_on_placement_allocation. So we can 

78 # ignore it in the general matching logic. 

79 ignored_pool_tags += ['rp_uuid'] 

80 

81 def __init__( 

82 self, 

83 numa_topology: 'objects.NUMATopology', 

84 stats: ty.Optional['objects.PCIDevicePoolList'] = None, 

85 dev_filter: ty.Optional[whitelist.Whitelist] = None, 

86 ) -> None: 

87 self.numa_topology = numa_topology 

88 self.pools = ( 

89 [pci_pool.to_dict() for pci_pool in stats] if stats else [] 

90 ) 

91 self.pools.sort(key=lambda item: len(item)) 

92 self.dev_filter = dev_filter or whitelist.Whitelist( 

93 CONF.pci.device_spec) 

94 

95 def _equal_properties( 

96 self, dev: Pool, entry: Pool, matching_keys: ty.List[str], 

97 ) -> bool: 

98 return all(dev.get(prop) == entry.get(prop) 

99 for prop in matching_keys) 

100 

101 def _find_pool(self, dev_pool: Pool) -> ty.Optional[Pool]: 

102 """Return the first pool that matches dev.""" 

103 for pool in self.pools: 

104 pool_keys = pool.copy() 

105 del pool_keys['count'] 

106 del pool_keys['devices'] 

107 for tag in self.ignored_pool_tags: 

108 pool_keys.pop(tag, None) 

109 

110 if (len(pool_keys.keys()) == len(dev_pool.keys()) and 

111 self._equal_properties(dev_pool, pool_keys, list(dev_pool))): 

112 return pool 

113 

114 return None 

115 

116 @staticmethod 

117 def _ensure_remote_managed_tag( 

118 dev: 'objects.PciDevice', pool: Pool): 

119 """Add a remote_managed tag depending on a device type if needed. 

120 

121 Network devices may be managed remotely, e.g. by a SmartNIC DPU. If 

122 a tag has not been explicitly provided, populate it by assuming that 

123 a device is not remote managed by default. 

124 """ 

125 if dev.dev_type not in (fields.PciDeviceType.SRIOV_VF, 

126 fields.PciDeviceType.SRIOV_PF, 

127 fields.PciDeviceType.VDPA): 

128 return 

129 

130 # A tag is added here rather than at the client side to avoid an 

131 # issue with having objects without this tag specified during an 

132 # upgrade to the first version that supports handling this tag. 

133 if pool.get(PCI_REMOTE_MANAGED_TAG) is None: 

134 # NOTE: tags are compared as strings case-insensitively, see 

135 # pci_device_prop_match in nova/pci/utils.py. 

136 pool[PCI_REMOTE_MANAGED_TAG] = 'false' 

137 

138 def _create_pool_keys_from_dev( 

139 self, dev: 'objects.PciDevice', 

140 ) -> ty.Optional[Pool]: 

141 """Create a stats pool dict that this dev is supposed to be part of 

142 

143 Note that this pool dict contains the stats pool's keys and their 

144 values. 'count' and 'devices' are not included. 

145 """ 

146 # Don't add a device that doesn't have a matching device spec. 

147 # This can happen during initial sync up with the controller 

148 devspec = self.dev_filter.get_devspec(dev) 

149 if not devspec: 

150 return None 

151 tags = devspec.get_tags() 

152 pool = {k: getattr(dev, k) for k in self.pool_keys} 

153 

154 if tags: 

155 pool.update( 

156 { 

157 k: v 

158 for k, v in tags.items() 

159 if k not in self.ignored_pool_tags 

160 } 

161 ) 

162 # NOTE(gibi): since PCI in placement maps a PCI dev or a PF to a 

163 # single RP and the scheduler allocates from a specific RP we need 

164 # to split the pools by PCI or PF address. We can still keep 

165 # the VFs from the same parent PF in a single pool though as they 

166 # are equivalent from placement perspective. 

167 pool['address'] = dev.parent_addr or dev.address 

168 

169 # NOTE(gibi): parent_ifname acts like a tag during pci claim but 

170 # not provided as part of the whitelist spec as it is auto detected 

171 # by the virt driver. 

172 # This key is used to match InstancePciRequests backed by neutron ports 

173 # that have a resource_request and therefore already have a resource 

174 # allocation in placement. 

175 if dev.extra_info.get('parent_ifname'): 175 ↛ 176: line 175 didn't jump to line 176 because the condition on line 175 was never true 

176 pool['parent_ifname'] = dev.extra_info['parent_ifname'] 

177 

178 self._ensure_remote_managed_tag(dev, pool) 

179 

180 return pool 

181 

182 def _get_pool_with_device_type_mismatch( 

183 self, dev: 'objects.PciDevice', 

184 ) -> ty.Optional[ty.Tuple[Pool, 'objects.PciDevice']]: 

185 """Check for device type mismatch in the pools for a given device. 

186 

187 Return (pool, device) if the device is found and its type does not 

188 match the pool's dev_type, else None. 

189 """ 

190 for pool in self.pools: 

191 for device in pool['devices']: 

192 if device.address == dev.address: 

193 if dev.dev_type != pool["dev_type"]: 

194 return pool, device 

195 return None 

196 

197 return None 

198 

199 def update_device(self, dev: 'objects.PciDevice') -> None: 

200 """Update a device to its matching pool.""" 

201 pool_device_info = self._get_pool_with_device_type_mismatch(dev) 

202 if pool_device_info is None: 

203 return None 

204 

205 pool, device = pool_device_info 

206 pool['devices'].remove(device) 

207 self._decrease_pool_count(self.pools, pool) 

208 self.add_device(dev) 

209 

210 def add_device(self, dev: 'objects.PciDevice') -> None: 

211 """Add a device to its matching pool.""" 

212 dev_pool = self._create_pool_keys_from_dev(dev) 

213 if dev_pool: 

214 pool = self._find_pool(dev_pool) 

215 if not pool: 

216 dev_pool['count'] = 0 

217 dev_pool['devices'] = [] 

218 self.pools.append(dev_pool) 

219 self.pools.sort(key=lambda item: len(item)) 

220 pool = dev_pool 

221 pool['count'] += 1 

222 pool['devices'].append(dev) 

223 

224 @staticmethod 

225 def _decrease_pool_count( 

226 pool_list: ty.List[Pool], pool: Pool, count: int = 1, 

227 ) -> int: 

228 """Decrement pool's size by count. 

229 

230 If pool becomes empty, remove pool from pool_list. 

231 """ 

232 if pool['count'] > count: 

233 pool['count'] -= count 

234 count = 0 

235 else: 

236 count -= pool['count'] 

237 pool_list.remove(pool) 

238 return count 

239 
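# A worked example of _decrease_pool_count with hypothetical numbers: asked to
# remove 5 devices from a pool holding 3, it drops the emptied pool from the
# list and returns the remainder 2 so the caller can continue with the next
# matching pool; asked to remove 2 from a pool holding 3, it leaves the pool
# at 1 and returns 0. E.g.:
#     pools = [{'count': 3}, {'count': 4}]
#     left = PciDeviceStats._decrease_pool_count(pools, pools[0], count=5)
#     # pools == [{'count': 4}] and left == 2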

240 def remove_device(self, dev: 'objects.PciDevice') -> None: 

241 """Remove one device from the first pool that it matches.""" 

242 dev_pool = self._create_pool_keys_from_dev(dev) 

243 if dev_pool: 

244 pool = self._find_pool(dev_pool) 

245 if not pool: 

246 raise exception.PciDevicePoolEmpty( 

247 compute_node_id=dev.compute_node_id, address=dev.address) 

248 pool['devices'].remove(dev) 

249 self._decrease_pool_count(self.pools, pool) 

250 

251 def get_free_devs(self) -> ty.List['objects.PciDevice']: 

252 free_devs: ty.List[objects.PciDevice] = [] 

253 for pool in self.pools: 

254 free_devs.extend(pool['devices']) 

255 return free_devs 

256 

257 def _allocate_devs( 

258 self, pool: Pool, num: int, request_id: str 

259 ) -> ty.List["objects.PciDevice"]: 

260 alloc_devices = [] 

261 for _ in range(num): 

262 pci_dev = pool['devices'].pop() 

263 self._handle_device_dependents(pci_dev) 

264 pci_dev.request_id = request_id 

265 alloc_devices.append(pci_dev) 

266 return alloc_devices 

267 

268 def consume_requests( 

269 self, 

270 pci_requests: 'objects.InstancePCIRequests', 

271 numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None, 

272 ) -> ty.Optional[ty.List['objects.PciDevice']]: 

273 

274 alloc_devices: ty.List[objects.PciDevice] = [] 

275 

276 for request in pci_requests: 

277 count = request.count 

278 

279 rp_uuids = self._get_rp_uuids_for_request( 

280 request=request, provider_mapping=None) 

281 pools = self._filter_pools( 

282 self.pools, request, numa_cells, rp_uuids=rp_uuids) 

283 

284 # Failed to allocate the required number of devices. Return the 

285 # devices already allocated during previous iterations back to 

286 # their pools 

287 if not pools: 

288 LOG.error("Failed to allocate PCI devices for instance. " 

289 "Unassigning devices back to pools. " 

290 "This should not happen, since the scheduler " 

291 "should have accurate information, and allocation " 

292 "during claims is controlled via a hold " 

293 "on the compute node semaphore.") 

294 for d in range(len(alloc_devices)): 

295 self.add_device(alloc_devices.pop()) 

296 raise exception.PciDeviceRequestFailed(requests=pci_requests) 

297 

298 if not rp_uuids: 

299 # if there is no placement allocation then we are free to 

300 # consume from the pools in any order: 

301 for pool in pools: 301 ↛ 276: line 301 didn't jump to line 276 because the loop on line 301 didn't complete 

302 if pool['count'] >= count: 

303 num_alloc = count 

304 else: 

305 num_alloc = pool['count'] 

306 count -= num_alloc 

307 pool['count'] -= num_alloc 

308 alloc_devices += self._allocate_devs( 

309 pool, num_alloc, request.request_id) 

310 if count == 0: 

311 break 

312 else: 

313 # but if there is placement allocation then we have to follow 

314 # it 

315 

316 if not self._assert_one_pool_per_rp_uuid(pools): 

317 raise exception.PciDeviceRequestFailed( 

318 requests=pci_requests) 

319 

320 requested_devs_per_pool_rp = collections.Counter(rp_uuids) 

321 for pool in pools: 

322 count = requested_devs_per_pool_rp[pool['rp_uuid']] 

323 pool['count'] -= count 

324 alloc_devices += self._allocate_devs( 

325 pool, count, request.request_id) 

326 # we consumed all the requested devices for the rp_uuid 

327 # so we can drop that rp_uuid from the request. 

328 requested_devs_per_pool_rp.pop(pool['rp_uuid'], None) 

329 

330 return alloc_devices 

331 
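# The placement-guided branch above relies on collections.Counter: each RP
# UUID appears in rp_uuids once per device allocated from it, so the counter
# gives the number of devices to take from the pool carrying that rp_uuid.
# A standalone illustration with hypothetical UUIDs and pools:
#     per_rp = collections.Counter(['rp-a', 'rp-a', 'rp-b'])
#     # per_rp == Counter({'rp-a': 2, 'rp-b': 1})
#     pools = [{'rp_uuid': 'rp-a', 'count': 4}, {'rp_uuid': 'rp-b', 'count': 1}]
#     for pool in pools:
#         pool['count'] -= per_rp[pool['rp_uuid']]   # 4 -> 2 and 1 -> 0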

332 def _handle_device_dependents(self, pci_dev: 'objects.PciDevice') -> None: 

333 """Remove device dependents or a parent from pools. 

334 

335 If the device is a PF, all of its dependent VFs should 

336 be removed from the pool counts, if they are present. 

337 When the device is a VF or a VDPA device, its parent PF's 

338 pool count should be decreased, unless the parent is no longer in a pool. 

339 """ 

340 if pci_dev.dev_type == fields.PciDeviceType.SRIOV_PF: 

341 vfs_list = pci_dev.child_devices 

342 if vfs_list: 

343 free_devs = self.get_free_devs() 

344 for vf in vfs_list: 

345 # NOTE(gibi): do not try to remove a device that is 

346 # already removed 

347 if vf in free_devs: 

348 self.remove_device(vf) 

349 elif pci_dev.dev_type in ( 

350 fields.PciDeviceType.SRIOV_VF, 

351 fields.PciDeviceType.VDPA, 

352 ): 

353 try: 

354 parent = pci_dev.parent_device 

355 # Make sure not to decrease PF pool count if this parent has 

356 # been already removed from pools 

357 if parent in self.get_free_devs(): 

358 self.remove_device(parent) 

359 except exception.PciDeviceNotFound: 

360 return 

361 

362 def _filter_pools_for_spec( 

363 self, pools: ty.List[Pool], request: 'objects.InstancePCIRequest', 

364 ) -> ty.List[Pool]: 

365 """Filter out pools that don't match the request's device spec. 

366 

367 Exclude pools that do not match the specified ``vendor_id``, 

368 ``product_id`` and/or ``device_type`` field, or any of the other 

369 arbitrary tags such as ``physical_network``, specified in the request. 

370 

371 :param pools: A list of PCI device pool dicts 

372 :param request: An InstancePCIRequest object describing the type, 

373 quantity and required NUMA affinity of device(s) we want. 

374 :returns: A list of pools that can be used to support the request if 

375 this is possible. 

376 """ 

377 

378 def ignore_keys(spec): 

379 return { 

380 k: v 

381 for k, v in spec.items() 

382 if k not in self.ignored_spec_tags 

383 } 

384 

385 request_specs = [ignore_keys(spec) for spec in request.spec] 

386 return [ 

387 pool for pool in pools 

388 if utils.pci_device_prop_match(pool, request_specs) 

389 ] 

390 

391 def _filter_pools_for_numa_cells( 

392 self, 

393 pools: ty.List[Pool], 

394 request: 'objects.InstancePCIRequest', 

395 numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']], 

396 ) -> ty.List[Pool]: 

397 """Filter out pools with the wrong NUMA affinity, if required. 

398 

399 Exclude pools that do not have *suitable* PCI NUMA affinity. 

400 ``numa_policy`` determines what *suitable* means, being one of 

401 PREFERRED (nice-to-have), LEGACY (must-have-if-available) and REQUIRED 

402 (must-have). We iterate through the various policies in order of 

403 strictness. This means that even if we only *prefer* PCI-NUMA affinity, 

404 we will still attempt to provide it if possible. 

405 

406 :param pools: A list of PCI device pool dicts 

407 :param request: An InstancePCIRequest object describing the type, 

408 quantity and required NUMA affinity of device(s) we want. 

409 :param numa_cells: A list of InstanceNUMACell objects whose ``id`` 

410 corresponds to the ``id`` of host NUMACells. 

411 :returns: A list of pools that can, together, provide at least 

412 ``requested_count`` PCI devices with the level of NUMA affinity 

413 required by ``numa_policy``, else all pools that can satisfy this 

414 policy even if it's not enough. 

415 """ 

416 if not numa_cells: 

417 return pools 

418 

419 # we default to the 'legacy' policy for...of course...legacy reasons 

420 requested_policy = fields.PCINUMAAffinityPolicy.LEGACY 

421 if 'numa_policy' in request: 

422 requested_policy = request.numa_policy or requested_policy 

423 

424 requested_count = request.count 

425 numa_cell_ids = [cell.id for cell in numa_cells] 

426 

427 # filter out pools whose numa_node is not included in numa_cell_ids 

428 filtered_pools = [ 

429 pool for pool in pools if any(utils.pci_device_prop_match( 

430 pool, [{'numa_node': cell}]) for cell in numa_cell_ids)] 

431 

432 # we can't apply a less strict policy than the one requested, so we 

433 # need to return if we've demanded a NUMA affinity of REQUIRED. 

434 # However, NUMA affinity is a good thing. If we can get enough devices 

435 # with the stricter policy then we will use them. 

436 if requested_policy == fields.PCINUMAAffinityPolicy.REQUIRED or sum( 

437 pool['count'] for pool in filtered_pools) >= requested_count: 

438 return filtered_pools 

439 

440 # the SOCKET policy is a bit of a special case. It's less strict than 

441 # REQUIRED (so REQUIRED will automatically fulfil SOCKET, at least 

442 # with our assumption of never having multiple sockets per NUMA node), 

443 # but not always more strict than LEGACY: a PCI device with no NUMA 

444 # affinity will fulfil LEGACY but not SOCKET. If we have SOCKET, 

445 # process it here and don't continue. 

446 if requested_policy == fields.PCINUMAAffinityPolicy.SOCKET: 446 ↛ 447: line 446 didn't jump to line 447 because the condition on line 446 was never true 

447 return self._filter_pools_for_socket_affinity(pools, numa_cells) 

448 

449 # some systems don't report NUMA node info for PCI devices, in which 

450 # case None is reported in 'pci_device.numa_node'. The LEGACY policy 

451 # allows us to use these devices so we include None in the list of 

452 # suitable NUMA cells. 

453 numa_cell_ids.append(None) 

454 

455 # filter out pools whose numa_node is not included in numa_cell_ids 

456 filtered_pools = [ 

457 pool for pool in pools if any(utils.pci_device_prop_match( 

458 pool, [{'numa_node': cell}]) for cell in numa_cell_ids)] 

459 

460 # once again, we can't apply a less strict policy than the one 

461 # requested, so we need to return if we've demanded a NUMA affinity of 

462 # LEGACY. Similarly, we will also return if we have enough devices to 

463 # satisfy this somewhat strict policy. 

464 if requested_policy == fields.PCINUMAAffinityPolicy.LEGACY or sum( 

465 pool['count'] for pool in filtered_pools) >= requested_count: 

466 return filtered_pools 

467 

468 # if we've got here, we're using the PREFERRED policy and weren't able 

469 # to provide anything with stricter affinity. Use whatever devices you 

470 # can, folks. 

471 return sorted( 

472 pools, key=lambda pool: pool.get('numa_node') not in numa_cell_ids) 

473 
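# The PREFERRED fallback above relies on booleans sorting False before True:
# pools on the requested NUMA nodes (key is False) come first and off-node
# pools are only used as overflow. A small illustration with made-up values:
#     numa_cell_ids = [0, None]
#     pools = [{'numa_node': 1, 'count': 2}, {'numa_node': 0, 'count': 1}]
#     sorted(pools, key=lambda pool: pool.get('numa_node') not in numa_cell_ids)
#     # -> [{'numa_node': 0, 'count': 1}, {'numa_node': 1, 'count': 2}]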

474 def _filter_pools_for_socket_affinity( 

475 self, 

476 pools: ty.List[Pool], 

477 numa_cells: ty.List['objects.InstanceNUMACell'], 

478 ) -> ty.List[Pool]: 

479 host_cells = self.numa_topology.cells 

480 # bail early if we don't have socket information for all host_cells. 

481 # This could happen if we're running on a weird older system with 

482 # multiple sockets per NUMA node, which is a configuration that we 

483 # explicitly chose not to support. 

484 if any(cell.socket is None for cell in host_cells): 

485 LOG.debug('No socket information in host NUMA cell(s).') 

486 return [] 

487 

488 # get a set of host sockets that the guest cells are in. Since guest 

489 # cell IDs map to host cell IDs, we can just lookup the latter's 

490 # socket. 

491 socket_ids = set() 

492 for guest_cell in numa_cells: 

493 for host_cell in host_cells: 

494 if guest_cell.id == host_cell.id: 494 ↛ 493: line 494 didn't jump to line 493 because the condition on line 494 was always true 

495 socket_ids.add(host_cell.socket) 

496 

497 # now get a set of host NUMA nodes that are in the above sockets 

498 allowed_numa_nodes = set() 

499 for host_cell in host_cells: 

500 if host_cell.socket in socket_ids: 500 ↛ 499: line 500 didn't jump to line 499 because the condition on line 500 was always true 

501 allowed_numa_nodes.add(host_cell.id) 

502 

503 # filter out pools that are not in one of the correct host NUMA nodes. 

504 return [ 

505 pool for pool in pools if any( 

506 utils.pci_device_prop_match(pool, [{'numa_node': numa_node}]) 

507 for numa_node in allowed_numa_nodes 

508 ) 

509 ] 

510 

511 def _filter_pools_for_unrequested_pfs( 

512 self, pools: ty.List[Pool], request: 'objects.InstancePCIRequest', 

513 ) -> ty.List[Pool]: 

514 """Filter out pools with PFs, unless these are required. 

515 

516 This is necessary in cases where PFs and VFs have the same product_id 

517 and is generally useful elsewhere. 

518 

519 :param pools: A list of PCI device pool dicts 

520 :param request: An InstancePCIRequest object describing the type, 

521 quantity and required NUMA affinity of device(s) we want. 

522 :returns: A list of pools that can be used to support the request if 

523 this is possible. 

524 """ 

525 if all( 

526 spec.get('dev_type') != fields.PciDeviceType.SRIOV_PF 

527 for spec in request.spec 

528 ): 

529 pools = [ 

530 pool for pool in pools 

531 if not pool.get('dev_type') == fields.PciDeviceType.SRIOV_PF 

532 ] 

533 return pools 

534 

535 def _filter_pools_for_unrequested_vdpa_devices( 

536 self, 

537 pools: ty.List[Pool], 

538 request: 'objects.InstancePCIRequest', 

539 ) -> ty.List[Pool]: 

540 """Filter out pools with VDPA devices, unless these are required. 

541 

542 This is necessary as vdpa devices require special handling and 

543 should not be allocated to generic pci device requests. 

544 

545 :param pools: A list of PCI device pool dicts 

546 :param request: An InstancePCIRequest object describing the type, 

547 quantity and required NUMA affinity of device(s) we want. 

548 :returns: A list of pools that can be used to support the request if 

549 this is possible. 

550 """ 

551 if all( 

552 spec.get('dev_type') != fields.PciDeviceType.VDPA 

553 for spec in request.spec 

554 ): 

555 pools = [ 

556 pool for pool in pools 

557 if not pool.get('dev_type') == fields.PciDeviceType.VDPA 

558 ] 

559 return pools 

560 

561 def _filter_pools_for_unrequested_remote_managed_devices( 

562 self, pools: ty.List[Pool], request: 'objects.InstancePCIRequest', 

563 ) -> ty.List[Pool]: 

564 """Filter out pools with remote_managed devices, unless requested. 

565 

566 Remote-managed devices are not usable for legacy SR-IOV or hardware 

567 offload scenarios and must be excluded from allocation. 

568 

569 :param pools: A list of PCI device pool dicts 

570 :param request: An InstancePCIRequest object describing the type, 

571 quantity and required NUMA affinity of device(s) we want. 

572 :returns: A list of pools that can be used to support the request if 

573 this is possible. 

574 """ 

575 if all(not strutils.bool_from_string(spec.get(PCI_REMOTE_MANAGED_TAG)) 

576 for spec in request.spec): 

577 pools = [pool for pool in pools 

578 if not strutils.bool_from_string( 

579 pool.get(PCI_REMOTE_MANAGED_TAG))] 

580 return pools 

581 
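# Both the spec check and the pool check above use strutils.bool_from_string
# with its non-strict default, so an absent tag behaves like 'false'. For example:
#     strutils.bool_from_string('true')   # True
#     strutils.bool_from_string('false')  # False
#     strutils.bool_from_string(None)     # False - unset means not remote-managed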

582 def _filter_pools_based_on_placement_allocation( 

583 self, 

584 pools: ty.List[Pool], 

585 request: 'objects.InstancePCIRequest', 

586 rp_uuids: ty.List[str], 

587 ) -> ty.List[Pool]: 

588 if not rp_uuids: 

589 # If there is no placement allocation then we don't need to filter 

590 # by it. This could happen if the instance only has neutron port 

591 # based InstancePCIRequests, as those currently do not have a 

592 # placement allocation (except for QoS ports, which are handled in a 

593 # separate codepath), or if the [filter_scheduler]pci_in_placement 

594 # configuration option is not enabled in the scheduler. 

595 return pools 

596 

597 requested_dev_count_per_rp = collections.Counter(rp_uuids) 

598 matching_pools = [] 

599 for pool in pools: 

600 rp_uuid = pool.get('rp_uuid') 

601 if rp_uuid is None: 601 ↛ 609: line 601 didn't jump to line 609 because the condition on line 601 was never true 

602 # NOTE(gibi): As rp_uuids is not empty the scheduler allocated 

603 # PCI resources on this host, so we know that 

604 # [pci]report_in_placement is enabled on this host. But this 

605 # pool has no RP mapping which can only happen if the pool 

606 # contains PCI devices with a physical_network tag, as those 

607 # devices are not yet reported in placement. But if they are not 

608 # reported then we can ignore them here too. 

609 continue 

610 

611 if ( 

612 # the placement allocation contains this pool 

613 rp_uuid in requested_dev_count_per_rp and 

614 # the amount of dev allocated in placement can be consumed 

615 # from the pool 

616 pool["count"] >= requested_dev_count_per_rp[rp_uuid] 

617 ): 

618 matching_pools.append(pool) 

619 

620 return matching_pools 

621 

622 def _filter_pools_for_live_migratable_devices( 

623 self, pools: ty.List[Pool], request: 'objects.InstancePCIRequest', 

624 ) -> ty.List[Pool]: 

625 """Filter out pools with non live_migratable devices. 

626 

627 :param pools: A list of PCI device pool dicts 

628 :param request: An InstancePCIRequest object describing the type, 

629 quantity and required NUMA affinity of device(s) we want. 

630 :returns: A list of pools that can be used to support the request if 

631 this is possible. 

632 """ 

633 

634 # The following code handles the case where 'live_migratable' is 

635 # set (either "true" or "false") by filtering devices to select the 

636 # appropriate ones. 

637 # If it is not set, we skip the next code block and no filtering 

638 # is applied to the pools. 

639 if all(spec.get("live_migratable") == 'true' for spec in request.spec): 

640 #  if all specs require live migratable devices, then we need to 

641 #  reduce the pools by the ones that support them. 

642 pools = [pool for pool in pools if pool.get("live_migratable") and 

643 pool['live_migratable'] == 'true'] 

644 elif all( 

645 spec.get("live_migratable") == "false" for spec in request.spec 

646 ): 

647 #  If the request asks to NOT support live-migratable devices, then 

648 #  we don't provide the ones that support them. 

649 #  We want to exclude the devices that don't have this value yet. 

650 pools = [pool for pool in pools if pool.get("live_migratable") and 

651 pool['live_migratable'] == 'false'] 

652 return pools 

653 

654 def _filter_pools( 

655 self, 

656 pools: ty.List[Pool], 

657 request: 'objects.InstancePCIRequest', 

658 numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']], 

659 rp_uuids: ty.List[str], 

660 ) -> ty.Optional[ty.List[Pool]]: 

661 """Determine if an individual PCI request can be met. 

662 

663 Filter pools, which are collections of devices with similar traits, to 

664 identify those that can support the provided PCI request. 

665 

666 If ``numa_cells`` is provided then NUMA locality may be taken into 

667 account, depending on the value of ``request.numa_policy``. 

668 

669 :param pools: A list of PCI device pool dicts 

670 :param request: An InstancePCIRequest object describing the type, 

671 quantity and required NUMA affinity of device(s) we want. 

672 :param numa_cells: A list of InstanceNUMACell objects whose ``id`` 

673 corresponds to the ``id`` of host NUMACell objects. 

674 :param rp_uuids: A list of RP UUIDs this request is fulfilled from in 

675 placement. So here we have to consider only the pools matching 

676 these RP UUIDs. 

677 :returns: A list of pools that can be used to support the request if 

678 this is possible, else None. 

679 """ 

680 # NOTE(vladikr): This code may be open to race conditions. 

681 # Two concurrent requests may succeed when called support_requests 

682 # because this method does not remove related devices from the pools 

683 

684 # Firstly, let's exclude all devices that don't match our spec (e.g. 

685 # they've got different PCI IDs or something) 

686 before_count = sum([pool['count'] for pool in pools]) 

687 pools = self._filter_pools_for_spec(pools, request) 

688 after_count = sum([pool['count'] for pool in pools]) 

689 

690 if after_count < before_count: 

691 LOG.debug( 

692 'Dropped %d device(s) due to mismatched PCI attribute(s)', 

693 before_count - after_count 

694 ) 

695 

696 if after_count < request.count: 

697 LOG.debug('Not enough PCI devices left to satisfy request') 

698 return None 

699 

700 # Next, let's exclude all devices that aren't on the correct NUMA node 

701 # or socket, *assuming* we have devices and care about that, as 

702 # determined by policy 

703 before_count = after_count 

704 pools = self._filter_pools_for_numa_cells(pools, request, numa_cells) 

705 after_count = sum([pool['count'] for pool in pools]) 

706 

707 if after_count < before_count: 

708 LOG.debug( 

709 'Dropped %d device(s) as they are on the wrong NUMA node(s)', 

710 before_count - after_count 

711 ) 

712 

713 if after_count < request.count: 

714 LOG.debug('Not enough PCI devices left to satisfy request') 

715 return None 

716 

717 # If we're not requesting PFs then we should not use these. 

718 # Exclude them. 

719 before_count = after_count 

720 pools = self._filter_pools_for_unrequested_pfs(pools, request) 

721 after_count = sum([pool['count'] for pool in pools]) 

722 

723 if after_count < before_count: 

724 LOG.debug( 

725 'Dropped %d device(s) as they are PFs which we have not ' 

726 'requested', 

727 before_count - after_count 

728 ) 

729 

730 if after_count < request.count: 730 ↛ 731: line 730 didn't jump to line 731 because the condition on line 730 was never true 

731 LOG.debug('Not enough PCI devices left to satisfy request') 

732 return None 

733 

734 # If we're not requesting VDPA devices then we should not use these 

735 # either. Exclude them. 

736 before_count = after_count 

737 pools = self._filter_pools_for_unrequested_vdpa_devices(pools, request) 

738 after_count = sum([pool['count'] for pool in pools]) 

739 

740 if after_count < before_count: 

741 LOG.debug( 

742 'Dropped %d device(s) as they are VDPA devices which we have ' 

743 'not requested', 

744 before_count - after_count 

745 ) 

746 

747 # If we're not requesting remote_managed devices then we should not 

748 # use these either. Exclude them. 

749 before_count = after_count 

750 pools = self._filter_pools_for_unrequested_remote_managed_devices( 

751 pools, request) 

752 after_count = sum([pool['count'] for pool in pools]) 

753 

754 if after_count < before_count: 754 ↛ 755: line 754 didn't jump to line 755 because the condition on line 754 was never true 

755 LOG.debug( 

756 'Dropped %d device(s) as they are remote-managed devices which ' 

757 'we have not requested', 

758 before_count - after_count 

759 ) 

760 

761 # if there is placement allocation for the request then we have to 

762 # remove the pools that are not in the placement allocation 

763 before_count = after_count 

764 pools = self._filter_pools_based_on_placement_allocation( 

765 pools, request, rp_uuids) 

766 after_count = sum([pool['count'] for pool in pools]) 

767 if after_count < before_count: 

768 LOG.debug( 

769 'Dropped %d device(s) that are not part of the placement ' 

770 'allocation', 

771 before_count - after_count 

772 ) 

773 

774 before_count = after_count 

775 pools = self._filter_pools_for_live_migratable_devices( 

776 pools, request) 

777 after_count = sum([pool['count'] for pool in pools]) 

778 if after_count < before_count: 778 ↛ 779: line 778 didn't jump to line 779 because the condition on line 778 was never true 

779 LOG.debug( 

780 'Dropped %d device(s) that are not live migratable', 

781 before_count - after_count 

782 ) 

783 

784 if after_count < request.count: 

785 LOG.debug('Not enough PCI devices left to satisfy request') 

786 return None 

787 

788 return pools 

789 

790 def support_requests( 

791 self, 

792 requests: ty.List['objects.InstancePCIRequest'], 

793 provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]], 

794 numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None, 

795 ) -> bool: 

796 """Determine if the PCI requests can be met. 

797 

798 Determine, based on a compute node's PCI stats, if an instance can be 

799 scheduled on the node. **Support does not mean real allocation**. 

800 

801 If ``numa_cells`` is provided then NUMA locality may be taken into 

802 account, depending on the value of ``numa_policy``. 

803 

804 :param requests: A list of InstancePCIRequest objects describing the 

805 types, quantities and required NUMA affinities of devices we want. 

806 :type requests: nova.objects.InstancePCIRequests 

807 :param provider_mapping: A dict keyed by RequestGroup requester_id, 

808 to a list of resource provider UUIDs which provide resource 

809 for that RequestGroup. If it is None then it signals that the 

810 InstancePCIRequest objects already store a mapping per request. 

811 I.e.: we are called _after_ the scheduler made allocations for this 

812 request in placement. 

813 :param numa_cells: A list of InstanceNUMACell objects whose ``id`` 

814 corresponds to the ``id`` of host NUMACells, or None. 

815 :returns: Whether this compute node can satisfy the given request. 

816 """ 

817 

818 # Try to apply the requests on a copy of the stats. If they apply 

819 # cleanly then we know that the requests are supported. We call apply 

820 # only on a copy as we don't want to actually consume resources from 

821 # the pool as at this point this is just a test during host filtering. 

822 # Later the scheduler will call apply_request to consume on the 

823 # selected host. The compute will call consume_request during PCI claim 

824 # to consume not just from the pools but also the PciDevice 

825 # objects. 

826 stats = copy.deepcopy(self) 

827 try: 

828 stats.apply_requests(requests, provider_mapping, numa_cells) 

829 except exception.PciDeviceRequestFailed: 

830 return False 

831 

832 return True 

833 

834 def _apply_request( 

835 self, 

836 pools: ty.List[Pool], 

837 request: 'objects.InstancePCIRequest', 

838 rp_uuids: ty.List[str], 

839 numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None, 

840 ) -> bool: 

841 """Apply an individual PCI request. 

842 

843 Apply a PCI request against a given set of PCI device pools, which are 

844 collections of devices with similar traits. 

845 

846 If ``numa_cells`` is provided then NUMA locality may be taken into 

847 account, depending on the value of ``request.numa_policy``. 

848 

849 :param pools: A list of PCI device pool dicts 

850 :param request: An InstancePCIRequest object describing the type, 

851 quantity and required NUMA affinity of device(s) we want. 

852 :param rp_uuids: A list of RP UUIDs this request is fulfilled from in 

853 placement 

854 :param numa_cells: A list of InstanceNUMACell objects whose ``id`` 

855 corresponds to the ``id`` of host NUMACell objects. 

856 :returns: True if the request was applied against the provided pools 

857 successfully, else False. 

858 """ 

859 # NOTE(vladikr): This code may be open to race conditions. 

860 # Two concurrent requests may succeed when called support_requests 

861 # because this method does not remove related devices from the pools 

862 

863 filtered_pools = self._filter_pools( 

864 pools, request, numa_cells, rp_uuids) 

865 

866 if not filtered_pools: 

867 return False 

868 

869 if not rp_uuids: 

870 # If there is no placement allocation for this request then we are 

871 # free to consume from the filtered pools in any order 

872 count = request.count 

873 for pool in filtered_pools: 873 ↛ 894: line 873 didn't jump to line 894 because the loop on line 873 didn't complete 

874 count = self._decrease_pool_count(pools, pool, count) 

875 if not count: 

876 break 

877 else: 

878 # but if there is placement allocation then we have to follow that 

879 

880 if not self._assert_one_pool_per_rp_uuid(pools): 

881 return False 

882 

883 requested_devs_per_pool_rp = collections.Counter(rp_uuids) 

884 for pool in filtered_pools: 

885 count = requested_devs_per_pool_rp[pool['rp_uuid']] 

886 pool['count'] -= count 

887 # we consumed all the requested devices for the rp_uuid 

888 # so we can drop that rp_uuid from the request. 

889 requested_devs_per_pool_rp.pop(pool['rp_uuid'], None) 

890 

891 if pool['count'] == 0: 

892 pools.remove(pool) 

893 

894 return True 

895 

896 def _get_rp_uuids_for_request( 

897 self, 

898 provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]], 

899 request: 'objects.InstancePCIRequest' 

900 ) -> ty.List[str]: 

901 """Return the list of RP uuids that are fulfilling the request. 

902 

903 An RP appears in the list as many times as the number of devices that 

904 need to be allocated from that RP. 

905 """ 

906 

907 if request.source == objects.InstancePCIRequest.NEUTRON_PORT: 

908 # TODO(gibi): support neutron based requests in a later cycle 

909 # an empty list will signal that any PCI pool can be used for this 

910 # request 

911 return [] 

912 

913 if not provider_mapping: 

914 # NOTE(gibi): AFAIK specs is always a list of a single dict 

915 # but the object is hard to change retroactively 

916 rp_uuids = request.spec[0].get('rp_uuids') 

917 if not rp_uuids: 

918 # This can happen if [filter_scheduler]pci_in_placement is not 

919 # enabled yet 

920 # An empty list will signal that any PCI pool can be used for 

921 # this request 

922 return [] 

923 

924 # TODO(gibi): this is baaad but spec is a dict of string so 

925 # the list is serialized 

926 return rp_uuids.split(',') 

927 

928 # NOTE(gibi): the PCI prefilter generates RequestGroup suffixes from 

929 # InstancePCIRequests in the form of {request_id}-{count_index} 

930 # NOTE(gibi): a suffixed request group is always fulfilled from a single 

931 # RP 

932 return [ 

933 rp_uuids[0] 

934 for group_id, rp_uuids in provider_mapping.items() 

935 if group_id.startswith(request.request_id) 

936 ] 

937 
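# A standalone illustration of the provider_mapping branch above, with
# hypothetical IDs: a request with request_id 'abc' and count 2 yields the
# suffixed groups 'abc-0' and 'abc-1', each mapped to a single RP, so the
# result repeats an RP once per device allocated from it:
#     provider_mapping = {
#         'abc-0': ['rp-a'],
#         'abc-1': ['rp-a'],
#         'other-0': ['rp-z'],
#     }
#     [uuids[0] for group_id, uuids in provider_mapping.items()
#      if group_id.startswith('abc')]
#     # -> ['rp-a', 'rp-a']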

938 def apply_requests( 

939 self, 

940 requests: ty.List['objects.InstancePCIRequest'], 

941 provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]], 

942 numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None, 

943 ) -> None: 

944 """Apply PCI requests to the PCI stats. 

945 

946 This is used in multiple instance creation, when the scheduler has to 

947 maintain how the resources are consumed by the instances. 

948 

949 If ``numa_cells`` is provided then NUMA locality may be taken into 

950 account, depending on the value of ``numa_policy``. 

951 

952 :param requests: A list of InstancePCIRequest objects describing the 

953 types, quantities and required NUMA affinities of devices we want. 

954 :type requests: nova.objects.InstancePCIRequests 

955 :param provider_mapping: A dict keyed by RequestGroup requester_id, 

956 to a list of resource provider UUIDs which provide resource 

957 for that RequestGroup. If it is None then it signals that the 

958 InstancePCIRequest objects already store a mapping per request. 

959 I.e.: we are called _after_ the scheduler made allocations for this 

960 request in placement. 

961 :param numa_cells: A list of InstanceNUMACell objects whose ``id`` 

962 corresponds to the ``id`` of host NUMACells, or None. 

963 :raises: exception.PciDeviceRequestFailed if this compute node cannot 

964 satisfy the given request. 

965 """ 

966 

967 for r in requests: 

968 rp_uuids = self._get_rp_uuids_for_request(provider_mapping, r) 

969 

970 if not self._apply_request(self.pools, r, rp_uuids, numa_cells): 

971 raise exception.PciDeviceRequestFailed(requests=requests) 

972 

973 def __iter__(self) -> ty.Iterator[Pool]: 

974 pools: ty.List[Pool] = [] 

975 for pool in self.pools: 

976 pool = copy.deepcopy(pool) 

977 # 'devices' shouldn't be part of stats 

978 if 'devices' in pool: 

979 del pool['devices'] 

980 pools.append(pool) 

981 return iter(pools) 

982 

983 def clear(self) -> None: 

984 """Clear all the stats maintained.""" 

985 self.pools = [] 

986 

987 def __eq__(self, other: object) -> bool: 

988 if not isinstance(other, PciDeviceStats): 988 ↛ 989: line 988 didn't jump to line 989 because the condition on line 988 was never true 

989 return NotImplemented 

990 return self.pools == other.pools 

991 

992 def to_device_pools_obj(self) -> 'objects.PciDevicePoolList': 

993 """Return the contents of the pools as a PciDevicePoolList object.""" 

994 stats = [x for x in self] 

995 return pci_device_pool.from_pci_stats(stats) 

996 

997 def has_remote_managed_device_pools(self) -> bool: 

998 """Determine whether remote managed device pools are present on a host. 

999 

1000 The check is pool-based, not free device-based and is NUMA cell 

1001 agnostic. 

1002 """ 

1003 dummy_req = objects.InstancePCIRequest( 

1004 count=0, 

1005 spec=[{'remote_managed': True}] 

1006 ) 

1007 pools = self._filter_pools_for_spec(self.pools, dummy_req) 

1008 return bool(pools) 

1009 

1010 def populate_pools_metadata_from_assigned_devices(self): 

1011 """Populate the rp_uuid of each pool based on the rp_uuid of the 

1012 devices assigned to the pool. This can only be called from the compute 

1013 where devices are assigned to each pool. This should not be called from 

1014 the scheduler, as the device-to-pool assignment is not known there. 

1015 """ 

1016 # PciDevices are tracked in placement and flavor based PCI requests 

1017 # are scheduled and allocated in placement. To be able to correlate 

1018 # what is allocated in placement and what is consumed in nova we 

1019 # need to map device pools to RPs. We can do that as the PciDevice 

1020 # contains the RP UUID that represents it in placement. 

1021 # NOTE(gibi): We cannot do this when the device is originally added to 

1022 # the pool, as the device -> placement translation, which creates the 

1023 # RPs, runs after all the devices are created and assigned to pools. 

1024 for pool in self.pools: 

1025 pool_rps = { 

1026 dev.extra_info.get("rp_uuid") 

1027 for dev in pool["devices"] 

1028 if "rp_uuid" in dev.extra_info 

1029 } 

1030 if len(pool_rps) >= 2: 

1031 # FIXME(gibi): Do we have a 1:1 pool - RP mapping even 

1032 # if two PFs provide very similar VFs? 

1033 raise ValueError( 

1034 "We have a pool %s connected to more than one RPs %s in " 

1035 "placement via devs %s" % (pool, pool_rps, pool["devices"]) 

1036 ) 

1037 

1038 if not pool_rps: 

1039 # this can happen if the nova-compute is upgraded to have the 

1040 # PCI in placement inventory handling code but 

1041 # [pci]report_in_placement is not turned on yet. 

1042 continue 

1043 

1044 if pool_rps: # now we know that it is a single RP 1044 ↛ 1024: line 1044 didn't jump to line 1024 because the condition on line 1044 was always true 

1045 pool['rp_uuid'] = next(iter(pool_rps)) 

1046 

1047 @staticmethod 

1048 def _assert_one_pool_per_rp_uuid(pools: ty.List[Pool]) -> bool: 

1049 """Asserts that each pool has a unique rp_uuid if any 

1050 

1051 :param pools: A list of Pool objects. 

1052 :return: True if each pool has a unique rp_uuid or no rp_uuid assigned, 

1053 False otherwise. 

1054 """ 

1055 pools_per_rp_uuid = collections.defaultdict(list) 

1056 for pool in pools: 

1057 if "rp_uuid" in pool: 

1058 pools_per_rp_uuid[pool["rp_uuid"]].append(pool) 

1059 

1060 split_rp_uuids = { 

1061 rp_uuid: pools 

1062 for rp_uuid, pools in pools_per_rp_uuid.items() 

1063 if len(pools) > 1} 

1064 

1065 if split_rp_uuids: 

1066 LOG.warning( 

1067 "The PCI allocation logic assumes that devices " 

1068 "related to the same rp_uuid are in the same pool. " 

1069 "However the following rp_uuids are split across multiple " 

1070 "pools. This should not happen. Please file a bug report. %s", 

1071 split_rp_uuids 

1072 ) 

1073 return not split_rp_uuids