Coverage for nova/pci/stats.py: 95%
357 statements
coverage.py v7.6.12, created at 2025-04-24 11:16 +0000
1# Copyright (c) 2013 Intel, Inc.
2# Copyright (c) 2013 OpenStack Foundation
3# All Rights Reserved.
4#
5# Licensed under the Apache License, Version 2.0 (the "License"); you may
6# not use this file except in compliance with the License. You may obtain
7# a copy of the License at
8#
9# http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14# License for the specific language governing permissions and limitations
15# under the License.
16import collections
17import copy
18import typing as ty
20from oslo_config import cfg
21from oslo_log import log as logging
22from oslo_utils import strutils
24from nova import exception
25from nova import objects
26from nova.objects import fields
27from nova.objects import pci_device_pool
28from nova.pci.request import PCI_REMOTE_MANAGED_TAG
29from nova.pci import utils
30from nova.pci import whitelist
32CONF = cfg.CONF
33LOG = logging.getLogger(__name__)
36# TODO(stephenfin): We might want to use TypedDict here. Refer to
37# https://mypy.readthedocs.io/en/latest/kinds_of_types.html#typeddict for
38# more information.
39Pool = ty.Dict[str, ty.Any]
42class PciDeviceStats(object):
44 """PCI devices summary information.
46 According to the PCI SR-IOV spec, a PCI physical function can have up to
47 256 PCI virtual functions, thus the number of assignable PCI functions in
48 a cloud can be big. The scheduler needs to know all device availability
49 information in order to determine which compute hosts can support a PCI
50 request. Passing individual virtual device information to the scheduler
51 does not scale, so we provide summary information.
53 Usually the virtual functions provided by a host PCI device have the same
54 value for most properties, like vendor_id, product_id and class type.
55 The PCI stats class summarizes this information for the scheduler.
57 The pci stats information is maintained exclusively by the compute node
58 resource tracker and updated in the database. The scheduler fetches the
59 information and selects the compute node accordingly. If a compute
60 node is selected, the resource tracker allocates the devices to the
61 instance and updates the pci stats information.
63 This summary information is also helpful for cloud management.
64 """
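# Illustrative sketch only (added for this report, not part of the upstream
# module): a single pool is a plain dict built from ``pool_keys`` plus the
# device/parent address and any tags from the matching [pci]device_spec
# entry, with 'count'/'devices' bookkeeping added by add_device(). With
# hypothetical values it could look like:
#
#   {'product_id': '1515', 'vendor_id': '8086', 'numa_node': 0,
#    'dev_type': 'type-VF', 'address': '0000:81:00.0',
#    'physical_network': 'physnet1', 'remote_managed': 'false',
#    'count': 4, 'devices': [<PciDevice>, ...]}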
66 pool_keys = ['product_id', 'vendor_id', 'numa_node', 'dev_type']
67 # these can be specified in the [pci]device_spec and can be requested via
68 # the PCI alias, but they are matched by the placement
69 # allocation_candidates query, so we can ignore them during pool creation
70 # and during filtering here
71 ignored_spec_tags = ignored_pool_tags = ['resource_class', 'traits']
72 # this is a metadata key in the spec that is matched
73 # specially in _filter_pools_based_on_placement_allocation. So we can
74 # ignore it in the general matching logic.
75 ignored_spec_tags += ['rp_uuids']
76 # this is a metadata key in the pool that is matched
77 # specially in _filter_pools_based_on_placement_allocation. So we can
78 # ignore it in the general matching logic.
79 ignored_pool_tags += ['rp_uuid']
81 def __init__(
82 self,
83 numa_topology: 'objects.NUMATopology',
84 stats: ty.Optional['objects.PCIDevicePoolList'] = None,
85 dev_filter: ty.Optional[whitelist.Whitelist] = None,
86 ) -> None:
87 self.numa_topology = numa_topology
88 self.pools = (
89 [pci_pool.to_dict() for pci_pool in stats] if stats else []
90 )
91 self.pools.sort(key=lambda item: len(item))
92 self.dev_filter = dev_filter or whitelist.Whitelist(
93 CONF.pci.device_spec)
95 def _equal_properties(
96 self, dev: Pool, entry: Pool, matching_keys: ty.List[str],
97 ) -> bool:
98 return all(dev.get(prop) == entry.get(prop)
99 for prop in matching_keys)
101 def _find_pool(self, dev_pool: Pool) -> ty.Optional[Pool]:
102 """Return the first pool that matches dev."""
103 for pool in self.pools:
104 pool_keys = pool.copy()
105 del pool_keys['count']
106 del pool_keys['devices']
107 for tag in self.ignored_pool_tags:
108 pool_keys.pop(tag, None)
110 if (len(pool_keys.keys()) == len(dev_pool.keys()) and
111 self._equal_properties(dev_pool, pool_keys, list(dev_pool))):
112 return pool
114 return None
116 @staticmethod
117 def _ensure_remote_managed_tag(
118 dev: 'objects.PciDevice', pool: Pool):
119 """Add a remote_managed tag depending on a device type if needed.
121 Network devices may be managed remotely, e.g. by a SmartNIC DPU. If
122 a tag has not been explicitly provided, populate it by assuming that
123 a device is not remote managed by default.
124 """
125 if dev.dev_type not in (fields.PciDeviceType.SRIOV_VF,
126 fields.PciDeviceType.SRIOV_PF,
127 fields.PciDeviceType.VDPA):
128 return
130 # A tag is added here rather than at the client side to avoid an
131 # issue with having objects without this tag specified during an
132 # upgrade to the first version that supports handling this tag.
133 if pool.get(PCI_REMOTE_MANAGED_TAG) is None:
134 # NOTE: tags are compared as strings case-insensitively, see
135 # pci_device_prop_match in nova/pci/utils.py.
136 pool[PCI_REMOTE_MANAGED_TAG] = 'false'
138 def _create_pool_keys_from_dev(
139 self, dev: 'objects.PciDevice',
140 ) -> ty.Optional[Pool]:
141 """Create a stats pool dict that this dev is supposed to be part of
143 Note that this pool dict contains the stats pool's keys and their
144 values. 'count' and 'devices' are not included.
145 """
146 # Don't add a device that doesn't have a matching device spec.
147 # This can happen during initial sync up with the controller
148 devspec = self.dev_filter.get_devspec(dev)
149 if not devspec:
150 return None
151 tags = devspec.get_tags()
152 pool = {k: getattr(dev, k) for k in self.pool_keys}
154 if tags:
155 pool.update(
156 {
157 k: v
158 for k, v in tags.items()
159 if k not in self.ignored_pool_tags
160 }
161 )
162 # NOTE(gibi): since PCI in placement maps a PCI dev or a PF to a
163 # single RP and the scheduler allocates from a specific RP we need
164 # to split the pools by PCI or PF address. We can still keep
165 # the VFs from the same parent PF in a single pool though as they
166 # are equivalent from placement perspective.
167 pool['address'] = dev.parent_addr or dev.address
169 # NOTE(gibi): parent_ifname acts like a tag during the PCI claim but
170 # is not provided as part of the whitelist spec as it is auto detected
171 # by the virt driver.
172 # This key is used to match InstancePciRequests backed by neutron ports
173 # that have a resource_request and therefore already have a resource
174 # allocation in placement.
175 if dev.extra_info.get('parent_ifname'):  [175 ↛ 176: the condition on line 175 was never true]
176 pool['parent_ifname'] = dev.extra_info['parent_ifname']
178 self._ensure_remote_managed_tag(dev, pool)
180 return pool
182 def _get_pool_with_device_type_mismatch(
183 self, dev: 'objects.PciDevice',
184 ) -> ty.Optional[ty.Tuple[Pool, 'objects.PciDevice']]:
185 """Check for device type mismatch in the pools for a given device.
187 Return (pool, device) if device type does not match or a single None
188 if the device type matches.
189 """
190 for pool in self.pools:
191 for device in pool['devices']:
192 if device.address == dev.address:
193 if dev.dev_type != pool["dev_type"]:
194 return pool, device
195 return None
197 return None
199 def update_device(self, dev: 'objects.PciDevice') -> None:
200 """Update a device to its matching pool."""
201 pool_device_info = self._get_pool_with_device_type_mismatch(dev)
202 if pool_device_info is None:
203 return None
205 pool, device = pool_device_info
206 pool['devices'].remove(device)
207 self._decrease_pool_count(self.pools, pool)
208 self.add_device(dev)
210 def add_device(self, dev: 'objects.PciDevice') -> None:
211 """Add a device to its matching pool."""
212 dev_pool = self._create_pool_keys_from_dev(dev)
213 if dev_pool:
214 pool = self._find_pool(dev_pool)
215 if not pool:
216 dev_pool['count'] = 0
217 dev_pool['devices'] = []
218 self.pools.append(dev_pool)
219 self.pools.sort(key=lambda item: len(item))
220 pool = dev_pool
221 pool['count'] += 1
222 pool['devices'].append(dev)
224 @staticmethod
225 def _decrease_pool_count(
226 pool_list: ty.List[Pool], pool: Pool, count: int = 1,
227 ) -> int:
228 """Decrement pool's size by count; return any remainder not taken.
230 If pool becomes empty, remove pool from pool_list.
231 """
232 if pool['count'] > count:
233 pool['count'] -= count
234 count = 0
235 else:
236 count -= pool['count']
237 pool_list.remove(pool)
238 return count
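# Worked example (hypothetical numbers, added for illustration): with a
# pool p where p['count'] == 3, _decrease_pool_count(pool_list, p, 2)
# leaves p['count'] == 1 and returns 0, while
# _decrease_pool_count(pool_list, p, 5) removes p from pool_list and
# returns 2, i.e. the leftover that still has to come from other pools.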
240 def remove_device(self, dev: 'objects.PciDevice') -> None:
241 """Remove one device from the first pool that it matches."""
242 dev_pool = self._create_pool_keys_from_dev(dev)
243 if dev_pool:
244 pool = self._find_pool(dev_pool)
245 if not pool:
246 raise exception.PciDevicePoolEmpty(
247 compute_node_id=dev.compute_node_id, address=dev.address)
248 pool['devices'].remove(dev)
249 self._decrease_pool_count(self.pools, pool)
251 def get_free_devs(self) -> ty.List['objects.PciDevice']:
252 free_devs: ty.List[objects.PciDevice] = []
253 for pool in self.pools:
254 free_devs.extend(pool['devices'])
255 return free_devs
257 def _allocate_devs(
258 self, pool: Pool, num: int, request_id: str
259 ) -> ty.List["objects.PciDevice"]:
260 alloc_devices = []
261 for _ in range(num):
262 pci_dev = pool['devices'].pop()
263 self._handle_device_dependents(pci_dev)
264 pci_dev.request_id = request_id
265 alloc_devices.append(pci_dev)
266 return alloc_devices
268 def consume_requests(
269 self,
270 pci_requests: 'objects.InstancePCIRequests',
271 numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None,
272 ) -> ty.Optional[ty.List['objects.PciDevice']]:
274 alloc_devices: ty.List[objects.PciDevice] = []
276 for request in pci_requests:
277 count = request.count
279 rp_uuids = self._get_rp_uuids_for_request(
280 request=request, provider_mapping=None)
281 pools = self._filter_pools(
282 self.pools, request, numa_cells, rp_uuids=rp_uuids)
284 # Failed to allocate the required number of devices. Return the
285 # devices already allocated during previous iterations back to
286 # their pools
287 if not pools:
288 LOG.error("Failed to allocate PCI devices for instance. "
289 "Unassigning devices back to pools. "
290 "This should not happen, since the scheduler "
291 "should have accurate information, and allocation "
292 "during claims is controlled via a hold "
293 "on the compute node semaphore.")
294 for d in range(len(alloc_devices)):
295 self.add_device(alloc_devices.pop())
296 raise exception.PciDeviceRequestFailed(requests=pci_requests)
298 if not rp_uuids:
299 # if there is no placement allocation then we are free to
300 # consume from the pools in any order:
301 for pool in pools:  [301 ↛ 276: the loop on line 301 didn't complete]
302 if pool['count'] >= count:
303 num_alloc = count
304 else:
305 num_alloc = pool['count']
306 count -= num_alloc
307 pool['count'] -= num_alloc
308 alloc_devices += self._allocate_devs(
309 pool, num_alloc, request.request_id)
310 if count == 0:
311 break
312 else:
313 # but if there is placement allocation then we have to follow
314 # it
316 if not self._assert_one_pool_per_rp_uuid(pools):
317 raise exception.PciDeviceRequestFailed(
318 requests=pci_requests)
320 requested_devs_per_pool_rp = collections.Counter(rp_uuids)
321 for pool in pools:
322 count = requested_devs_per_pool_rp[pool['rp_uuid']]
323 pool['count'] -= count
324 alloc_devices += self._allocate_devs(
325 pool, count, request.request_id)
326 # we consumed all the requested devices for the rp_uuid
327 # so we can drop that rp_uuid from the request.
328 requested_devs_per_pool_rp.pop(pool['rp_uuid'], None)
330 return alloc_devices
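# Hedged usage sketch (added for illustration, not from the module): during
# the PCI claim the compute's PCI tracker consumes devices roughly like
# this, where `stats` is a PciDeviceStats instance and the other names are
# placeholders:
#
#   claimed = stats.consume_requests(instance_pci_requests,
#                                    numa_cells=instance_numa_cells)
#   # `claimed` holds the PciDevice objects taken from the pools; on
#   # failure PciDeviceRequestFailed is raised and the devices allocated
#   # so far are returned to their pools.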
332 def _handle_device_dependents(self, pci_dev: 'objects.PciDevice') -> None:
333 """Remove device dependents or a parent from pools.
335 In case the device is a PF, all of its dependent VFs should
336 be removed from the pools' counts, if these are present.
337 When the device is a VF, or a VDPA device, its parent PF
338 pool count should be decreased, unless it is no longer in a pool.
339 """
340 if pci_dev.dev_type == fields.PciDeviceType.SRIOV_PF:
341 vfs_list = pci_dev.child_devices
342 if vfs_list:
343 free_devs = self.get_free_devs()
344 for vf in vfs_list:
345 # NOTE(gibi): do not try to remove a device that is
346 # already removed
347 if vf in free_devs:
348 self.remove_device(vf)
349 elif pci_dev.dev_type in (
350 fields.PciDeviceType.SRIOV_VF,
351 fields.PciDeviceType.VDPA,
352 ):
353 try:
354 parent = pci_dev.parent_device
355 # Make sure not to decrease the PF pool count if this parent has
356 # already been removed from the pools
357 if parent in self.get_free_devs():
358 self.remove_device(parent)
359 except exception.PciDeviceNotFound:
360 return
362 def _filter_pools_for_spec(
363 self, pools: ty.List[Pool], request: 'objects.InstancePCIRequest',
364 ) -> ty.List[Pool]:
365 """Filter out pools that don't match the request's device spec.
367 Exclude pools that do not match the specified ``vendor_id``,
368 ``product_id`` and/or ``device_type`` field, or any of the other
369 arbitrary tags such as ``physical_network``, specified in the request.
371 :param pools: A list of PCI device pool dicts
372 :param request: An InstancePCIRequest object describing the type,
373 quantity and required NUMA affinity of device(s) we want.
374 :returns: A list of pools that can be used to support the request if
375 this is possible.
376 """
378 def ignore_keys(spec):
379 return {
380 k: v
381 for k, v in spec.items()
382 if k not in self.ignored_spec_tags
383 }
385 request_specs = [ignore_keys(spec) for spec in request.spec]
386 return [
387 pool for pool in pools
388 if utils.pci_device_prop_match(pool, request_specs)
389 ]
391 def _filter_pools_for_numa_cells(
392 self,
393 pools: ty.List[Pool],
394 request: 'objects.InstancePCIRequest',
395 numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']],
396 ) -> ty.List[Pool]:
397 """Filter out pools with the wrong NUMA affinity, if required.
399 Exclude pools that do not have *suitable* PCI NUMA affinity.
400 ``numa_policy`` determines what *suitable* means, being one of
401 PREFERRED (nice-to-have), LEGACY (must-have-if-available) and REQUIRED
402 (must-have). We iterate through the various policies in order of
403 strictness. This means that even if we only *prefer* PCI-NUMA affinity,
404 we will still attempt to provide it if possible.
406 :param pools: A list of PCI device pool dicts
407 :param request: An InstancePCIRequest object describing the type,
408 quantity and required NUMA affinity of device(s) we want.
409 :param numa_cells: A list of InstanceNUMACell objects whose ``id``
410 corresponds to the ``id`` of host NUMACells.
411 :returns: A list of pools that can, together, provide at least
412 ``requested_count`` PCI devices with the level of NUMA affinity
413 required by ``numa_policy``, else all pools that can satisfy this
414 policy even if it's not enough.
415 """
416 if not numa_cells:
417 return pools
419 # we default to the 'legacy' policy for...of course...legacy reasons
420 requested_policy = fields.PCINUMAAffinityPolicy.LEGACY
421 if 'numa_policy' in request:
422 requested_policy = request.numa_policy or requested_policy
424 requested_count = request.count
425 numa_cell_ids = [cell.id for cell in numa_cells]
427 # filter out pools whose numa_node is not included in numa_cell_ids
428 filtered_pools = [
429 pool for pool in pools if any(utils.pci_device_prop_match(
430 pool, [{'numa_node': cell}]) for cell in numa_cell_ids)]
432 # we can't apply a less strict policy than the one requested, so we
433 # need to return if we've demanded a NUMA affinity of REQUIRED.
434 # However, NUMA affinity is a good thing. If we can get enough devices
435 # with the stricter policy then we will use them.
436 if requested_policy == fields.PCINUMAAffinityPolicy.REQUIRED or sum(
437 pool['count'] for pool in filtered_pools) >= requested_count:
438 return filtered_pools
440 # the SOCKET policy is a bit of a special case. It's less strict than
441 # REQUIRED (so REQUIRED will automatically fulfil SOCKET, at least
442 # with our assumption of never having multiple sockets per NUMA node),
443 # but not always more strict than LEGACY: a PCI device with no NUMA
444 # affinity will fulfil LEGACY but not SOCKET. If we have SOCKET,
445 # process it here and don't continue.
446 if requested_policy == fields.PCINUMAAffinityPolicy.SOCKET:  [446 ↛ 447: the condition on line 446 was never true]
447 return self._filter_pools_for_socket_affinity(pools, numa_cells)
449 # some systems don't report NUMA node info for PCI devices, in which
450 # case None is reported in 'pci_device.numa_node'. The LEGACY policy
451 # allows us to use these devices so we include None in the list of
452 # suitable NUMA cells.
453 numa_cell_ids.append(None)
455 # filter out pools whose numa_node is not included in numa_cell_ids
456 filtered_pools = [
457 pool for pool in pools if any(utils.pci_device_prop_match(
458 pool, [{'numa_node': cell}]) for cell in numa_cell_ids)]
460 # once again, we can't apply a less strict policy than the one
461 # requested, so we need to return if we've demanded a NUMA affinity of
462 # LEGACY. Similarly, we will also return if we have enough devices to
463 # satisfy this somewhat strict policy.
464 if requested_policy == fields.PCINUMAAffinityPolicy.LEGACY or sum(
465 pool['count'] for pool in filtered_pools) >= requested_count:
466 return filtered_pools
468 # if we've got here, we're using the PREFERRED policy and weren't able
469 # to provide anything with stricter affinity. Use whatever devices you
470 # can, folks.
471 return sorted(
472 pools, key=lambda pool: pool.get('numa_node') not in numa_cell_ids)
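# Worked example (hypothetical pools, added for illustration): with
# numa_cells covering host cell 0 only and pools
#   [{'numa_node': 0, 'count': 1}, {'numa_node': None, 'count': 1},
#    {'numa_node': 1, 'count': 2}]
# a request for 3 devices returns just the numa_node 0 pool under REQUIRED,
# the numa_node 0 and None pools under LEGACY, and all three pools under
# PREFERRED, sorted so the NUMA-affine ones come first.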
474 def _filter_pools_for_socket_affinity(
475 self,
476 pools: ty.List[Pool],
477 numa_cells: ty.List['objects.InstanceNUMACell'],
478 ) -> ty.List[Pool]:
479 host_cells = self.numa_topology.cells
480 # bail early if we don't have socket information for all host_cells.
481 # This could happen if we're running on a weird older system with
482 # multiple sockets per NUMA node, which is a configuration that we
483 # explicitly chose not to support.
484 if any(cell.socket is None for cell in host_cells):
485 LOG.debug('No socket information in host NUMA cell(s).')
486 return []
488 # get a set of host sockets that the guest cells are in. Since guest
489 # cell IDs map to host cell IDs, we can just look up the latter's
490 # socket.
491 socket_ids = set()
492 for guest_cell in numa_cells:
493 for host_cell in host_cells:
494 if guest_cell.id == host_cell.id:  [494 ↛ 493: the condition on line 494 was always true]
495 socket_ids.add(host_cell.socket)
497 # now get a set of host NUMA nodes that are in the above sockets
498 allowed_numa_nodes = set()
499 for host_cell in host_cells:
500 if host_cell.socket in socket_ids:  [500 ↛ 499: the condition on line 500 was always true]
501 allowed_numa_nodes.add(host_cell.id)
503 # filter out pools that are not in one of the correct host NUMA nodes.
504 return [
505 pool for pool in pools if any(
506 utils.pci_device_prop_match(pool, [{'numa_node': numa_node}])
507 for numa_node in allowed_numa_nodes
508 )
509 ]
511 def _filter_pools_for_unrequested_pfs(
512 self, pools: ty.List[Pool], request: 'objects.InstancePCIRequest',
513 ) -> ty.List[Pool]:
514 """Filter out pools with PFs, unless these are required.
516 This is necessary in cases where PFs and VFs have the same product_id
517 and generally useful elsewhere.
519 :param pools: A list of PCI device pool dicts
520 :param request: An InstancePCIRequest object describing the type,
521 quantity and required NUMA affinity of device(s) we want.
522 :returns: A list of pools that can be used to support the request if
523 this is possible.
524 """
525 if all(
526 spec.get('dev_type') != fields.PciDeviceType.SRIOV_PF
527 for spec in request.spec
528 ):
529 pools = [
530 pool for pool in pools
531 if not pool.get('dev_type') == fields.PciDeviceType.SRIOV_PF
532 ]
533 return pools
535 def _filter_pools_for_unrequested_vdpa_devices(
536 self,
537 pools: ty.List[Pool],
538 request: 'objects.InstancePCIRequest',
539 ) -> ty.List[Pool]:
540 """Filter out pools with VDPA devices, unless these are required.
542 This is necessary as vdpa devices require special handling and
543 should not be allocated to generic pci device requests.
545 :param pools: A list of PCI device pool dicts
546 :param request: An InstancePCIRequest object describing the type,
547 quantity and required NUMA affinity of device(s) we want.
548 :returns: A list of pools that can be used to support the request if
549 this is possible.
550 """
551 if all(
552 spec.get('dev_type') != fields.PciDeviceType.VDPA
553 for spec in request.spec
554 ):
555 pools = [
556 pool for pool in pools
557 if not pool.get('dev_type') == fields.PciDeviceType.VDPA
558 ]
559 return pools
561 def _filter_pools_for_unrequested_remote_managed_devices(
562 self, pools: ty.List[Pool], request: 'objects.InstancePCIRequest',
563 ) -> ty.List[Pool]:
564 """Filter out pools with remote_managed devices, unless requested.
566 Remote-managed devices are not usable for legacy SR-IOV or hardware
567 offload scenarios and must be excluded from allocation.
569 :param pools: A list of PCI device pool dicts
570 :param request: An InstancePCIRequest object describing the type,
571 quantity and required NUMA affinity of device(s) we want.
572 :returns: A list of pools that can be used to support the request if
573 this is possible.
574 """
575 if all(not strutils.bool_from_string(spec.get(PCI_REMOTE_MANAGED_TAG))
576 for spec in request.spec):
577 pools = [pool for pool in pools
578 if not strutils.bool_from_string(
579 pool.get(PCI_REMOTE_MANAGED_TAG))]
580 return pools
582 def _filter_pools_based_on_placement_allocation(
583 self,
584 pools: ty.List[Pool],
585 request: 'objects.InstancePCIRequest',
586 rp_uuids: ty.List[str],
587 ) -> ty.List[Pool]:
588 if not rp_uuids:
589 # If there is no placement allocation then we don't need to filter
590 # by it. This could happen if the instance only has neutron port
591 # based InstancePCIRequests, as those currently do not have a
592 # placement allocation (except for QoS ports, but those are handled
593 # in a separate codepath), or if the [filter_scheduler]pci_in_placement
594 # configuration option is not enabled in the scheduler.
595 return pools
597 requested_dev_count_per_rp = collections.Counter(rp_uuids)
598 matching_pools = []
599 for pool in pools:
600 rp_uuid = pool.get('rp_uuid')
601 if rp_uuid is None:  [601 ↛ 609: the condition on line 601 was never true]
602 # NOTE(gibi): As rp_uuids is not empty the scheduler allocated
603 # PCI resources on this host, so we know that
604 # [pci]report_in_placement is enabled on this host. But this
605 # pool has no RP mapping, which can only happen if the pool
606 # contains PCI devices with the physical_network tag, as those
607 # devices are not yet reported in placement. But if they are not
608 # reported then we can ignore them here too.
609 continue
611 if (
612 # the placement allocation contains this pool
613 rp_uuid in requested_dev_count_per_rp and
614 # the amount of dev allocated in placement can be consumed
615 # from the pool
616 pool["count"] >= requested_dev_count_per_rp[rp_uuid]
617 ):
618 matching_pools.append(pool)
620 return matching_pools
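# Sketch with made-up UUIDs (added for illustration): rp_uuids of
# ['rp1', 'rp1', 'rp2'] means placement allocated two devices from the
# provider 'rp1' and one from 'rp2'; collections.Counter(rp_uuids) then
# counts {'rp1': 2, 'rp2': 1}, so a pool with {'rp_uuid': 'rp1', 'count': 1}
# is dropped (it cannot cover both devices) while one with
# {'rp_uuid': 'rp1', 'count': 3} is kept.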
622 def _filter_pools_for_live_migratable_devices(
623 self, pools: ty.List[Pool], request: 'objects.InstancePCIRequest',
624 ) -> ty.List[Pool]:
625 """Filter out pools with non-live-migratable devices.
627 :param pools: A list of PCI device pool dicts
628 :param request: An InstancePCIRequest object describing the type,
629 quantity and required NUMA affinity of device(s) we want.
630 :returns: A list of pools that can be used to support the request if
631 this is possible.
632 """
634 # The following code handles the case where 'live_migratable' is
635 # set (either "true" or "false") by filtering devices to select the
636 # appropriate ones.
637 # If it is not set, we skip the next code block and no filtering
638 # is applied to the pools.
639 if all(spec.get("live_migratable") == 'true' for spec in request.spec):
640 # if all specs require live migratable devices, then we need to
641 # reduce the pools by the ones that support them.
642 pools = [pool for pool in pools if pool.get("live_migratable") and
643 pool['live_migratable'] == 'true']
644 elif all(
645 spec.get("live_migratable") == "false" for spec in request.spec
646 ):
647 # If the request asks to NOT support live-migratable devices, then
648 # we don't provide the ones that support them.
649 # We want to exclude the devices that don't have this value yet.
650 pools = [pool for pool in pools if pool.get("live_migratable") and
651 pool['live_migratable'] == 'false']
652 return pools
654 def _filter_pools(
655 self,
656 pools: ty.List[Pool],
657 request: 'objects.InstancePCIRequest',
658 numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']],
659 rp_uuids: ty.List[str],
660 ) -> ty.Optional[ty.List[Pool]]:
661 """Determine if an individual PCI request can be met.
663 Filter pools, which are collections of devices with similar traits, to
664 identify those that can support the provided PCI request.
666 If ``numa_cells`` is provided then NUMA locality may be taken into
667 account, depending on the value of ``request.numa_policy``.
669 :param pools: A list of PCI device pool dicts
670 :param request: An InstancePCIRequest object describing the type,
671 quantity and required NUMA affinity of device(s) we want.
672 :param numa_cells: A list of InstanceNUMACell objects whose ``id``
673 corresponds to the ``id`` of host NUMACell objects.
674 :param rp_uuids: A list of RP uuids this request is fulfilled from in
675 placement, so here we only have to consider the pools matching
676 these RP uuids
677 :returns: A list of pools that can be used to support the request if
678 this is possible, else None.
679 """
680 # NOTE(vladikr): This code may be open to race conditions.
681 # Two concurrent requests may succeed when support_requests is called
682 # because this method does not remove related devices from the pools
684 # Firstly, let's exclude all devices that don't match our spec (e.g.
685 # they've got different PCI IDs or something)
686 before_count = sum([pool['count'] for pool in pools])
687 pools = self._filter_pools_for_spec(pools, request)
688 after_count = sum([pool['count'] for pool in pools])
690 if after_count < before_count:
691 LOG.debug(
692 'Dropped %d device(s) due to mismatched PCI attribute(s)',
693 before_count - after_count
694 )
696 if after_count < request.count:
697 LOG.debug('Not enough PCI devices left to satisfy request')
698 return None
700 # Next, let's exclude all devices that aren't on the correct NUMA node
701 # or socket, *assuming* we have devices and care about that, as
702 # determined by policy
703 before_count = after_count
704 pools = self._filter_pools_for_numa_cells(pools, request, numa_cells)
705 after_count = sum([pool['count'] for pool in pools])
707 if after_count < before_count:
708 LOG.debug(
709 'Dropped %d device(s) as they are on the wrong NUMA node(s)',
710 before_count - after_count
711 )
713 if after_count < request.count:
714 LOG.debug('Not enough PCI devices left to satisfy request')
715 return None
717 # If we're not requesting PFs then we should not use these.
718 # Exclude them.
719 before_count = after_count
720 pools = self._filter_pools_for_unrequested_pfs(pools, request)
721 after_count = sum([pool['count'] for pool in pools])
723 if after_count < before_count:
724 LOG.debug(
725 'Dropped %d device(s) as they are PFs which we have not '
726 'requested',
727 before_count - after_count
728 )
730 if after_count < request.count:  [730 ↛ 731: the condition on line 730 was never true]
731 LOG.debug('Not enough PCI devices left to satisfy request')
732 return None
734 # If we're not requesting VDPA devices then we should not use these
735 # either. Exclude them.
736 before_count = after_count
737 pools = self._filter_pools_for_unrequested_vdpa_devices(pools, request)
738 after_count = sum([pool['count'] for pool in pools])
740 if after_count < before_count:
741 LOG.debug(
742 'Dropped %d device(s) as they are VDPA devices which we have '
743 'not requested',
744 before_count - after_count
745 )
747 # If we're not requesting remote_managed devices then we should not
748 # use these either. Exclude them.
749 before_count = after_count
750 pools = self._filter_pools_for_unrequested_remote_managed_devices(
751 pools, request)
752 after_count = sum([pool['count'] for pool in pools])
754 if after_count < before_count:  [754 ↛ 755: the condition on line 754 was never true]
755 LOG.debug(
756 'Dropped %d device(s) as they are remote-managed devices which '
757 'we have not requested',
758 before_count - after_count
759 )
761 # if there is placement allocation for the request then we have to
762 # remove the pools that are not in the placement allocation
763 before_count = after_count
764 pools = self._filter_pools_based_on_placement_allocation(
765 pools, request, rp_uuids)
766 after_count = sum([pool['count'] for pool in pools])
767 if after_count < before_count:
768 LOG.debug(
769 'Dropped %d device(s) that are not part of the placement '
770 'allocation',
771 before_count - after_count
772 )
774 before_count = after_count
775 pools = self._filter_pools_for_live_migratable_devices(
776 pools, request)
777 after_count = sum([pool['count'] for pool in pools])
778 if after_count < before_count:  [778 ↛ 779: the condition on line 778 was never true]
779 LOG.debug(
780 'Dropped %d device(s) that are not live migratable',
781 before_count - after_count
782 )
784 if after_count < request.count:
785 LOG.debug('Not enough PCI devices left to satisfy request')
786 return None
788 return pools
790 def support_requests(
791 self,
792 requests: ty.List['objects.InstancePCIRequest'],
793 provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]],
794 numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None,
795 ) -> bool:
796 """Determine if the PCI requests can be met.
798 Determine, based on a compute node's PCI stats, if an instance can be
799 scheduled on the node. **Support does not mean real allocation**.
801 If ``numa_cells`` is provided then NUMA locality may be taken into
802 account, depending on the value of ``numa_policy``.
804 :param requests: A list of InstancePCIRequest objects describing the
805 types, quantities and required NUMA affinities of devices we want.
806 :type requests: nova.objects.InstancePCIRequests
807 :param provider_mapping: A dict keyed by RequestGroup requester_id,
808 to a list of resource provider UUIDs which provide resource
809 for that RequestGroup. If it is None then it signals that the
810 InstancePCIRequest objects already store a mapping per request.
811 I.e.: we are called _after_ the scheduler made allocations for this
812 request in placement.
813 :param numa_cells: A list of InstanceNUMACell objects whose ``id``
814 corresponds to the ``id`` of host NUMACells, or None.
815 :returns: Whether this compute node can satisfy the given request.
816 """
818 # Try to apply the requests on a copy of the stats; if they apply
819 # cleanly then we know that the requests are supported. We call apply
820 # only on a copy as we don't want to actually consume resources from
821 # the pool, as at this point this is just a test during host filtering.
822 # Later the scheduler will call apply_requests to consume on the
823 # selected host. The compute will call consume_requests during the PCI
824 # claim to consume not just from the pools but also the PciDevice
825 # objects.
826 stats = copy.deepcopy(self)
827 try:
828 stats.apply_requests(requests, provider_mapping, numa_cells)
829 except exception.PciDeviceRequestFailed:
830 return False
832 return True
834 def _apply_request(
835 self,
836 pools: ty.List[Pool],
837 request: 'objects.InstancePCIRequest',
838 rp_uuids: ty.List[str],
839 numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None,
840 ) -> bool:
841 """Apply an individual PCI request.
843 Apply a PCI request against a given set of PCI device pools, which are
844 collections of devices with similar traits.
846 If ``numa_cells`` is provided then NUMA locality may be taken into
847 account, depending on the value of ``request.numa_policy``.
849 :param pools: A list of PCI device pool dicts
850 :param request: An InstancePCIRequest object describing the type,
851 quantity and required NUMA affinity of device(s) we want.
852 :param rp_uuids: A list of RP uuids this request is fulfilled from in
853 placement
854 :param numa_cells: A list of InstanceNUMACell objects whose ``id``
855 corresponds to the ``id`` of host NUMACell objects.
856 :returns: True if the request was applied against the provided pools
857 successfully, else False.
858 """
859 # NOTE(vladikr): This code may be open to race conditions.
860 # Two concurrent requests may succeed when support_requests is called
861 # because this method does not remove related devices from the pools
863 filtered_pools = self._filter_pools(
864 pools, request, numa_cells, rp_uuids)
866 if not filtered_pools:
867 return False
869 if not rp_uuids:
870 # If there is no placement allocation for this request then we are
871 # free to consume from the filtered pools in any order
872 count = request.count
873 for pool in filtered_pools:  [873 ↛ 894: the loop on line 873 didn't complete]
874 count = self._decrease_pool_count(pools, pool, count)
875 if not count:
876 break
877 else:
878 # but if there is placement allocation then we have to follow that
880 if not self._assert_one_pool_per_rp_uuid(pools):
881 return False
883 requested_devs_per_pool_rp = collections.Counter(rp_uuids)
884 for pool in filtered_pools:
885 count = requested_devs_per_pool_rp[pool['rp_uuid']]
886 pool['count'] -= count
887 # we consumed all the requested devices for the rp_uuid
888 # so we can drop that rp_uuid from the request.
889 requested_devs_per_pool_rp.pop(pool['rp_uuid'], None)
891 if pool['count'] == 0:
892 pools.remove(pool)
894 return True
896 def _get_rp_uuids_for_request(
897 self,
898 provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]],
899 request: 'objects.InstancePCIRequest'
900 ) -> ty.List[str]:
901 """Return the list of RP uuids that are fulfilling the request.
903 An RP will be in the list as many times as the number of devices
904 that need to be allocated from that RP.
905 """
907 if request.source == objects.InstancePCIRequest.NEUTRON_PORT:
908 # TODO(gibi): support neutron based requests in a later cycle
909 # an empty list will signal that any PCI pool can be used for this
910 # request
911 return []
913 if not provider_mapping:
914 # NOTE(gibi): AFAIK spec is always a list containing a single dict
915 # but the object is hard to change retroactively
916 rp_uuids = request.spec[0].get('rp_uuids')
917 if not rp_uuids:
918 # This can happen if [filter_scheduler]pci_in_placement is not
919 # enabled yet
920 # An empty list will signal that any PCI pool can be used for
921 # this request
922 return []
924 # TODO(gibi): this is baaad but spec is a dict of strings so
925 # the list is serialized
926 return rp_uuids.split(',')
928 # NOTE(gibi): the PCI prefilter generates RequestGroup suffixes from
929 # InstancePCIRequests in the form of {request_id}-{count_index}
930 # NOTE(gibi): a suffixed request group is always fulfilled from a
931 # single RP
932 return [
933 rp_uuids[0]
934 for group_id, rp_uuids in provider_mapping.items()
935 if group_id.startswith(request.request_id)
936 ]
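# Sketch with made-up identifiers (added for illustration): for a flavor
# based request with request_id 'abc' and count 2 the PCI prefilter creates
# the suffixed groups 'abc-0' and 'abc-1', so a provider_mapping of
#   {'abc-0': ['rp1'], 'abc-1': ['rp2'], 'port-group': ['rp9']}
# yields ['rp1', 'rp2'] for this request, while the unrelated 'port-group'
# entry is ignored because it does not start with the request_id.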
938 def apply_requests(
939 self,
940 requests: ty.List['objects.InstancePCIRequest'],
941 provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]],
942 numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None,
943 ) -> None:
944 """Apply PCI requests to the PCI stats.
946 This is used in multiple instance creation, when the scheduler has to
947 keep track of how the resources are consumed by the instances.
949 If ``numa_cells`` is provided then NUMA locality may be taken into
950 account, depending on the value of ``numa_policy``.
952 :param requests: A list of InstancePCIRequest objects describing the
953 types, quantities and required NUMA affinities of devices we want.
954 :type requests: nova.objects.InstancePCIRequests
955 :param provider_mapping: A dict keyed by RequestGroup requester_id,
956 to a list of resource provider UUIDs which provide resource
957 for that RequestGroup. If it is None then it signals that the
958 InstancePCIRequest objects already store a mapping per request.
959 I.e.: we are called _after_ the scheduler made allocations for this
960 request in placement.
961 :param numa_cells: A list of InstanceNUMACell objects whose ``id``
962 corresponds to the ``id`` of host NUMACells, or None.
963 :raises: exception.PciDeviceRequestFailed if this compute node cannot
964 satisfy the given request.
965 """
967 for r in requests:
968 rp_uuids = self._get_rp_uuids_for_request(provider_mapping, r)
970 if not self._apply_request(self.pools, r, rp_uuids, numa_cells):
971 raise exception.PciDeviceRequestFailed(requests=requests)
973 def __iter__(self) -> ty.Iterator[Pool]:
974 pools: ty.List[Pool] = []
975 for pool in self.pools:
976 pool = copy.deepcopy(pool)
977 # 'devices' shouldn't be part of stats
978 if 'devices' in pool:
979 del pool['devices']
980 pools.append(pool)
981 return iter(pools)
983 def clear(self) -> None:
984 """Clear all the stats maintained."""
985 self.pools = []
987 def __eq__(self, other: object) -> bool:
988 if not isinstance(other, PciDeviceStats):  [988 ↛ 989: the condition on line 988 was never true]
989 return NotImplemented
990 return self.pools == other.pools
992 def to_device_pools_obj(self) -> 'objects.PciDevicePoolList':
993 """Return the contents of the pools as a PciDevicePoolList object."""
994 stats = [x for x in self]
995 return pci_device_pool.from_pci_stats(stats)
997 def has_remote_managed_device_pools(self) -> bool:
998 """Determine whether remote managed device pools are present on a host.
1000 The check is pool-based, not free device-based and is NUMA cell
1001 agnostic.
1002 """
1003 dummy_req = objects.InstancePCIRequest(
1004 count=0,
1005 spec=[{'remote_managed': True}]
1006 )
1007 pools = self._filter_pools_for_spec(self.pools, dummy_req)
1008 return bool(pools)
1010 def populate_pools_metadata_from_assigned_devices(self):
1011 """Populate the rp_uuid of each pool based on the rp_uuid of the
1012 devices assigned to the pool. This can only be called from the compute
1013 where devices are assigned to each pool. This should not be called from
1014 the scheduler, as the device to pool assignment is not known there.
1015 """
1016 # PciDevices are tracked in placement and flavor based PCI requests
1017 # are scheduled and allocated in placement. To be able to correlate
1018 # what is allocated in placement and what is consumed in nova we
1019 # need to map device pools to RPs. We can do that as the PciDevice
1020 # contains the RP UUID that represents it in placement.
1021 # NOTE(gibi): We cannot do this when the device is originally added to
1022 # the pool as the device -> placement translation, which creates the
1023 # RPs, runs after all the devices are created and assigned to pools.
1024 for pool in self.pools:
1025 pool_rps = {
1026 dev.extra_info.get("rp_uuid")
1027 for dev in pool["devices"]
1028 if "rp_uuid" in dev.extra_info
1029 }
1030 if len(pool_rps) >= 2:
1031 # FIXME(gibi): Do we have a 1:1 pool - RP mapping even
1032 # if two PFs provide very similar VFs?
1033 raise ValueError(
1034 "We have a pool %s connected to more than one RPs %s in "
1035 "placement via devs %s" % (pool, pool_rps, pool["devices"])
1036 )
1038 if not pool_rps:
1039 # this can happen if the nova-compute is upgraded to have the
1040 # PCI in placement inventory handling code but
1041 # [pci]report_in_placement is not turned on yet.
1042 continue
1044 if pool_rps:  # now we know that it is a single RP  [1044 ↛ 1024: the condition on line 1044 was always true]
1045 pool['rp_uuid'] = next(iter(pool_rps))
1047 @staticmethod
1048 def _assert_one_pool_per_rp_uuid(pools: ty.List[Pool]) -> bool:
1049 """Assert that each pool has a unique rp_uuid, if any.
1051 :param pools: A list of Pool objects.
1052 :return: True if each pool has a unique rp_uuid or no rp_uuid assigned,
1053 False otherwise.
1054 """
1055 pools_per_rp_uuid = collections.defaultdict(list)
1056 for pool in pools:
1057 if "rp_uuid" in pool:
1058 pools_per_rp_uuid[pool["rp_uuid"]].append(pool)
1060 split_rp_uuids = {
1061 rp_uuid: pools
1062 for rp_uuid, pools in pools_per_rp_uuid.items()
1063 if len(pools) > 1}
1065 if split_rp_uuids:
1066 LOG.warning(
1067 "The PCI allocation logic assumes that devices "
1068 "related to the same rp_uuid are in the same pool. "
1069 "However the following rp_uuids are split across multiple "
1070 "pools. This should not happen. Please file a bug report. %s",
1071 split_rp_uuids
1072 )
1073 return not split_rp_uuids