Coverage for nova/pci/stats.py: 95%
357 statements
coverage.py v7.6.12, created at 2025-04-24 11:16 +0000
1# Copyright (c) 2013 Intel, Inc.
2# Copyright (c) 2013 OpenStack Foundation
3# All Rights Reserved.
4#
5# Licensed under the Apache License, Version 2.0 (the "License"); you may
6# not use this file except in compliance with the License. You may obtain
7# a copy of the License at
8#
9# http://www.apache.org/licenses/LICENSE-2.0
10#
11# Unless required by applicable law or agreed to in writing, software
12# distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
13# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the
14# License for the specific language governing permissions and limitations
15# under the License.
16import collections
17import copy
18import typing as ty
20from oslo_config import cfg
21from oslo_log import log as logging
22from oslo_utils import strutils
24from nova import exception
25from nova import objects
26from nova.objects import fields
27from nova.objects import pci_device_pool
28from nova.pci.request import PCI_REMOTE_MANAGED_TAG
29from nova.pci import utils
30from nova.pci import whitelist
32CONF = cfg.CONF
33LOG = logging.getLogger(__name__)
36# TODO(stephenfin): We might want to use TypedDict here. Refer to
37# https://mypy.readthedocs.io/en/latest/kinds_of_types.html#typeddict for
38# more information.
39Pool = ty.Dict[str, ty.Any]
42class PciDeviceStats(object):
44 """PCI devices summary information.
46 According to the PCI SR-IOV spec, a PCI physical function can have up to
47 256 PCI virtual functions, thus the number of assignable PCI functions in
48 a cloud can be big. The scheduler needs to know all device availability
49 information in order to determine which compute hosts can support a PCI
50 request. Passing individual virtual device information to the scheduler
51 does not scale, so we provide summary information.
53 Usually the virtual functions provided by a host PCI device have the same
54 value for most properties, like vendor_id, product_id and class type.
55 The PCI stats class summarizes this information for the scheduler.
57 The pci stats information is maintained exclusively by the compute node
58 resource tracker and updated in the database. The scheduler fetches the
59 information and selects the compute node accordingly. If a compute
60 node is selected, the resource tracker allocates the devices to the
61 instance and updates the pci stats information.
63 This summary information is also helpful for cloud management.
64 """
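# Illustrative sketch only (added for this report, not part of the upstream
# module): a single pool is a plain dict built from ``pool_keys`` plus the
# device/parent address and any tags from the matching [pci]device_spec
# entry, with 'count'/'devices' bookkeeping added by add_device(). With
# hypothetical values it could look like:
#
#   {'product_id': '1515', 'vendor_id': '8086', 'numa_node': 0,
#    'dev_type': 'type-VF', 'address': '0000:81:00.0',
#    'physical_network': 'physnet1', 'remote_managed': 'false',
#    'count': 4, 'devices': [<PciDevice>, ...]}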
66 pool_keys = ['product_id', 'vendor_id', 'numa_node', 'dev_type']
67 # these can be specified in the [pci]device_spec and can be requested via
68 # the PCI alias, but they are matched by the placement
69 # allocation_candidates query, so we can ignore them during pool creation
70 # and during filtering here
71 ignored_spec_tags = ignored_pool_tags = ['resource_class', 'traits']
72 # this is a metadata key in the spec that is matched
73 # specially in _filter_pools_based_on_placement_allocation. So we can
74 # ignore it in the general matching logic.
75 ignored_spec_tags += ['rp_uuids']
76 # this is a metadata key in the pool that is matched
77 # specially in _filter_pools_based_on_placement_allocation. So we can
78 # ignore it in the general matching logic.
79 ignored_pool_tags += ['rp_uuid']
81 def __init__(
82 self,
83 numa_topology: 'objects.NUMATopology',
84 stats: ty.Optional['objects.PCIDevicePoolList'] = None,
85 dev_filter: ty.Optional[whitelist.Whitelist] = None,
86 ) -> None:
87 self.numa_topology = numa_topology
88 self.pools = (
89 [pci_pool.to_dict() for pci_pool in stats] if stats else []
90 )
91 self.pools.sort(key=lambda item: len(item))
92 self.dev_filter = dev_filter or whitelist.Whitelist(
93 CONF.pci.device_spec)
95 def _equal_properties(
96 self, dev: Pool, entry: Pool, matching_keys: ty.List[str],
97 ) -> bool:
98 return all(dev.get(prop) == entry.get(prop)
99 for prop in matching_keys)
101 def _find_pool(self, dev_pool: Pool) -> ty.Optional[Pool]:
102 """Return the first pool that matches dev."""
103 for pool in self.pools:
104 pool_keys = pool.copy()
105 del pool_keys['count']
106 del pool_keys['devices']
107 for tag in self.ignored_pool_tags:
108 pool_keys.pop(tag, None)
110 if (len(pool_keys.keys()) == len(dev_pool.keys()) and
111 self._equal_properties(dev_pool, pool_keys, list(dev_pool))):
112 return pool
114 return None
116 @staticmethod
117 def _ensure_remote_managed_tag(
118 dev: 'objects.PciDevice', pool: Pool):
119 """Add a remote_managed tag depending on a device type if needed.
121 Network devices may be managed remotely, e.g. by a SmartNIC DPU. If
122 a tag has not been explicitly provided, populate it by assuming that
123 a device is not remote managed by default.
124 """
125 if dev.dev_type not in (fields.PciDeviceType.SRIOV_VF,
126 fields.PciDeviceType.SRIOV_PF,
127 fields.PciDeviceType.VDPA):
128 return
130 # A tag is added here rather than at the client side to avoid an
131 # issue with having objects without this tag specified during an
132 # upgrade to the first version that supports handling this tag.
133 if pool.get(PCI_REMOTE_MANAGED_TAG) is None:
134 # NOTE: tags are compared as strings case-insensitively, see
135 # pci_device_prop_match in nova/pci/utils.py.
136 pool[PCI_REMOTE_MANAGED_TAG] = 'false'
138 def _create_pool_keys_from_dev(
139 self, dev: 'objects.PciDevice',
140 ) -> ty.Optional[Pool]:
141 """Create a stats pool dict that this dev is supposed to be part of
143 Note that this pool dict contains the stats pool's keys and their
144 values. 'count' and 'devices' are not included.
145 """
146 # Don't add a device that doesn't have a matching device spec.
147 # This can happen during initial sync up with the controller
148 devspec = self.dev_filter.get_devspec(dev)
149 if not devspec:
150 return None
151 tags = devspec.get_tags()
152 pool = {k: getattr(dev, k) for k in self.pool_keys}
154 if tags:
155 pool.update(
156 {
157 k: v
158 for k, v in tags.items()
159 if k not in self.ignored_pool_tags
160 }
161 )
162 # NOTE(gibi): since PCI in placement maps a PCI dev or a PF to a
163 # single RP and the scheduler allocates from a specific RP we need
164 # to split the pools by PCI or PF address. We can still keep
165 # the VFs from the same parent PF in a single pool though as they
166 # are equivalent from placement perspective.
167 pool['address'] = dev.parent_addr or dev.address
169 # NOTE(gibi): parent_ifname acts like a tag during the PCI claim but
170 # is not provided as part of the whitelist spec as it is auto detected
171 # by the virt driver.
172 # This key is used to match InstancePciRequests backed by neutron ports
173 # that have a resource_request and therefore already have a resource
174 # allocation in placement.
175 if dev.extra_info.get('parent_ifname'):  [175 ↛ 176: the condition on line 175 was never true]
176 pool['parent_ifname'] = dev.extra_info['parent_ifname']
178 self._ensure_remote_managed_tag(dev, pool)
180 return pool
182 def _get_pool_with_device_type_mismatch(
183 self, dev: 'objects.PciDevice',
184 ) -> ty.Optional[ty.Tuple[Pool, 'objects.PciDevice']]:
185 """Check for device type mismatch in the pools for a given device.
187 Return (pool, device) if device type does not match or a single None
188 if the device type matches.
189 """
190 for pool in self.pools:
191 for device in pool['devices']:
192 if device.address == dev.address:
193 if dev.dev_type != pool["dev_type"]:
194 return pool, device
195 return None
197 return None
199 def update_device(self, dev: 'objects.PciDevice') -> None:
200 """Update a device to its matching pool."""
201 pool_device_info = self._get_pool_with_device_type_mismatch(dev)
202 if pool_device_info is None:
203 return None
205 pool, device = pool_device_info
206 pool['devices'].remove(device)
207 self._decrease_pool_count(self.pools, pool)
208 self.add_device(dev)
210 def add_device(self, dev: 'objects.PciDevice') -> None:
211 """Add a device to its matching pool."""
212 dev_pool = self._create_pool_keys_from_dev(dev)
213 if dev_pool:
214 pool = self._find_pool(dev_pool)
215 if not pool:
216 dev_pool['count'] = 0
217 dev_pool['devices'] = []
218 self.pools.append(dev_pool)
219 self.pools.sort(key=lambda item: len(item))
220 pool = dev_pool
221 pool['count'] += 1
222 pool['devices'].append(dev)
224 @staticmethod
225 def _decrease_pool_count(
226 pool_list: ty.List[Pool], pool: Pool, count: int = 1,
227 ) -> int:
228 """Decrement pool's size by count; return any remainder not taken.
230 If pool becomes empty, remove pool from pool_list.
231 """
232 if pool['count'] > count:
233 pool['count'] -= count
234 count = 0
235 else:
236 count -= pool['count']
237 pool_list.remove(pool)
238 return count
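# Worked example (hypothetical numbers, added for illustration): with a
# pool p where p['count'] == 3, _decrease_pool_count(pool_list, p, 2)
# leaves p['count'] == 1 and returns 0, while
# _decrease_pool_count(pool_list, p, 5) removes p from pool_list and
# returns 2, i.e. the leftover that still has to come from other pools.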
240 def remove_device(self, dev: 'objects.PciDevice') -> None:
241 """Remove one device from the first pool that it matches."""
242 dev_pool = self._create_pool_keys_from_dev(dev)
243 if dev_pool:
244 pool = self._find_pool(dev_pool)
245 if not pool:
246 raise exception.PciDevicePoolEmpty(
247 compute_node_id=dev.compute_node_id, address=dev.address)
248 pool['devices'].remove(dev)
249 self._decrease_pool_count(self.pools, pool)
251 def get_free_devs(self) -> ty.List['objects.PciDevice']:
252 free_devs: ty.List[objects.PciDevice] = []
253 for pool in self.pools:
254 free_devs.extend(pool['devices'])
255 return free_devs
257 def _allocate_devs(
258 self, pool: Pool, num: int, request_id: str
259 ) -> ty.List["objects.PciDevice"]:
260 alloc_devices = []
261 for _ in range(num):
262 pci_dev = pool['devices'].pop()
263 self._handle_device_dependents(pci_dev)
264 pci_dev.request_id = request_id
265 alloc_devices.append(pci_dev)
266 return alloc_devices
268 def consume_requests(
269 self,
270 pci_requests: 'objects.InstancePCIRequests',
271 numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None,
272 ) -> ty.Optional[ty.List['objects.PciDevice']]:
274 alloc_devices: ty.List[objects.PciDevice] = []
276 for request in pci_requests:
277 count = request.count
279 rp_uuids = self._get_rp_uuids_for_request(
280 request=request, provider_mapping=None)
281 pools = self._filter_pools(
282 self.pools, request, numa_cells, rp_uuids=rp_uuids)
284 # Failed to allocate the required number of devices. Return the
285 # devices already allocated during previous iterations back to
286 # their pools
287 if not pools:
288 LOG.error("Failed to allocate PCI devices for instance. "
289 "Unassigning devices back to pools. "
290 "This should not happen, since the scheduler "
291 "should have accurate information, and allocation "
292 "during claims is controlled via a hold "
293 "on the compute node semaphore.")
294 for d in range(len(alloc_devices)):
295 self.add_device(alloc_devices.pop())
296 raise exception.PciDeviceRequestFailed(requests=pci_requests)
298 if not rp_uuids:
299 # if there is no placement allocation then we are free to
300 # consume from the pools in any order:
301 for pool in pools:  [301 ↛ 276: the loop on line 301 didn't complete]
302 if pool['count'] >= count:
303 num_alloc = count
304 else:
305 num_alloc = pool['count']
306 count -= num_alloc
307 pool['count'] -= num_alloc
308 alloc_devices += self._allocate_devs(
309 pool, num_alloc, request.request_id)
310 if count == 0:
311 break
312 else:
313 # but if there is placement allocation then we have to follow
314 # it
316 if not self._assert_one_pool_per_rp_uuid(pools):
317 raise exception.PciDeviceRequestFailed(
318 requests=pci_requests)
320 requested_devs_per_pool_rp = collections.Counter(rp_uuids)
321 for pool in pools:
322 count = requested_devs_per_pool_rp[pool['rp_uuid']]
323 pool['count'] -= count
324 alloc_devices += self._allocate_devs(
325 pool, count, request.request_id)
326 # we consumed all the requested devices for the rp_uuid
327 # so we can drop that rp_uuid from the request.
328 requested_devs_per_pool_rp.pop(pool['rp_uuid'], None)
330 return alloc_devices
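# Hedged usage sketch (added for illustration, not from the module): during
# the PCI claim the compute's PCI tracker consumes devices roughly like
# this, where `stats` is a PciDeviceStats instance and the other names are
# placeholders:
#
#   claimed = stats.consume_requests(instance_pci_requests,
#                                    numa_cells=instance_numa_cells)
#   # `claimed` holds the PciDevice objects taken from the pools; on
#   # failure PciDeviceRequestFailed is raised and the devices allocated
#   # so far are returned to their pools.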
332 def _handle_device_dependents(self, pci_dev: 'objects.PciDevice') -> None:
333 """Remove device dependents or a parent from pools.
335 In case the device is a PF, all of its dependent VFs should
336 be removed from the pools' counts, if these are present.
337 When the device is a VF, or a VDPA device, its parent PF
338 pool count should be decreased, unless it is no longer in a pool.
339 """
340 if pci_dev.dev_type == fields.PciDeviceType.SRIOV_PF:
341 vfs_list = pci_dev.child_devices
342 if vfs_list:
343 free_devs = self.get_free_devs()
344 for vf in vfs_list:
345 # NOTE(gibi): do not try to remove a device that is
346 # already removed
347 if vf in free_devs:
348 self.remove_device(vf)
349 elif pci_dev.dev_type in (
350 fields.PciDeviceType.SRIOV_VF,
351 fields.PciDeviceType.VDPA,
352 ):
353 try:
354 parent = pci_dev.parent_device
355 # Make sure not to decrease the PF pool count if this parent has
356 # already been removed from the pools
357 if parent in self.get_free_devs():
358 self.remove_device(parent)
359 except exception.PciDeviceNotFound:
360 return
362 def _filter_pools_for_spec(
363 self, pools: ty.List[Pool], request: 'objects.InstancePCIRequest',
364 ) -> ty.List[Pool]:
365 """Filter out pools that don't match the request's device spec.
367 Exclude pools that do not match the specified ``vendor_id``,
368 ``product_id`` and/or ``device_type`` field, or any of the other
369 arbitrary tags such as ``physical_network``, specified in the request.
371 :param pools: A list of PCI device pool dicts
372 :param request: An InstancePCIRequest object describing the type,
373 quantity and required NUMA affinity of device(s) we want.
374 :returns: A list of pools that can be used to support the request if
375 this is possible.
376 """
378 def ignore_keys(spec):
379 return {
380 k: v
381 for k, v in spec.items()
382 if k not in self.ignored_spec_tags
383 }
385 request_specs = [ignore_keys(spec) for spec in request.spec]
386 return [
387 pool for pool in pools
388 if utils.pci_device_prop_match(pool, request_specs)
389 ]
391 def _filter_pools_for_numa_cells(
392 self,
393 pools: ty.List[Pool],
394 request: 'objects.InstancePCIRequest',
395 numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']],
396 ) -> ty.List[Pool]:
397 """Filter out pools with the wrong NUMA affinity, if required.
399 Exclude pools that do not have *suitable* PCI NUMA affinity.
400 ``numa_policy`` determines what *suitable* means, being one of
401 PREFERRED (nice-to-have), LEGACY (must-have-if-available) and REQUIRED
402 (must-have). We iterate through the various policies in order of
403 strictness. This means that even if we only *prefer* PCI-NUMA affinity,
404 we will still attempt to provide it if possible.
406 :param pools: A list of PCI device pool dicts
407 :param request: An InstancePCIRequest object describing the type,
408 quantity and required NUMA affinity of device(s) we want.
409 :param numa_cells: A list of InstanceNUMACell objects whose ``id``
410 corresponds to the ``id`` of host NUMACells.
411 :returns: A list of pools that can, together, provide at least
412 ``requested_count`` PCI devices with the level of NUMA affinity
413 required by ``numa_policy``, else all pools that can satisfy this
414 policy even if it's not enough.
415 """
416 if not numa_cells:
417 return pools
419 # we default to the 'legacy' policy for...of course...legacy reasons
420 requested_policy = fields.PCINUMAAffinityPolicy.LEGACY
421 if 'numa_policy' in request:
422 requested_policy = request.numa_policy or requested_policy
424 requested_count = request.count
425 numa_cell_ids = [cell.id for cell in numa_cells]
427 # filter out pools whose numa_node is not included in numa_cell_ids
428 filtered_pools = [
429 pool for pool in pools if any(utils.pci_device_prop_match(
430 pool, [{'numa_node': cell}]) for cell in numa_cell_ids)]
432 # we can't apply a less strict policy than the one requested, so we
433 # need to return if we've demanded a NUMA affinity of REQUIRED.
434 # However, NUMA affinity is a good thing. If we can get enough devices
435 # with the stricter policy then we will use them.
436 if requested_policy == fields.PCINUMAAffinityPolicy.REQUIRED or sum(
437 pool['count'] for pool in filtered_pools) >= requested_count:
438 return filtered_pools
440 # the SOCKET policy is a bit of a special case. It's less strict than
441 # REQUIRED (so REQUIRED will automatically fulfil SOCKET, at least
442 # with our assumption of never having multiple sockets per NUMA node),
443 # but not always more strict than LEGACY: a PCI device with no NUMA
444 # affinity will fulfil LEGACY but not SOCKET. If we have SOCKET,
445 # process it here and don't continue.
446 if requested_policy == fields.PCINUMAAffinityPolicy.SOCKET:  [446 ↛ 447: the condition on line 446 was never true]
447 return self._filter_pools_for_socket_affinity(pools, numa_cells)
449 # some systems don't report NUMA node info for PCI devices, in which
450 # case None is reported in 'pci_device.numa_node'. The LEGACY policy
451 # allows us to use these devices so we include None in the list of
452 # suitable NUMA cells.
453 numa_cell_ids.append(None)
455 # filter out pools whose numa_node is not included in numa_cell_ids
456 filtered_pools = [
457 pool for pool in pools if any(utils.pci_device_prop_match(
458 pool, [{'numa_node': cell}]) for cell in numa_cell_ids)]
460 # once again, we can't apply a less strict policy than the one
461 # requested, so we need to return if we've demanded a NUMA affinity of
462 # LEGACY. Similarly, we will also return if we have enough devices to
463 # satisfy this somewhat strict policy.
464 if requested_policy == fields.PCINUMAAffinityPolicy.LEGACY or sum(
465 pool['count'] for pool in filtered_pools) >= requested_count:
466 return filtered_pools
468 # if we've got here, we're using the PREFERRED policy and weren't able
469 # to provide anything with stricter affinity. Use whatever devices you
470 # can, folks.
471 return sorted(
472 pools, key=lambda pool: pool.get('numa_node') not in numa_cell_ids)
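# Worked example (hypothetical pools, added for illustration): with
# numa_cells covering host cell 0 only and pools
#   [{'numa_node': 0, 'count': 1}, {'numa_node': None, 'count': 1},
#    {'numa_node': 1, 'count': 2}]
# a request for 3 devices returns just the numa_node 0 pool under REQUIRED,
# the numa_node 0 and None pools under LEGACY, and all three pools under
# PREFERRED, sorted so the NUMA-affine ones come first.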
474 def _filter_pools_for_socket_affinity(
475 self,
476 pools: ty.List[Pool],
477 numa_cells: ty.List['objects.InstanceNUMACell'],
478 ) -> ty.List[Pool]:
479 host_cells = self.numa_topology.cells
480 # bail early if we don't have socket information for all host_cells.
481 # This could happen if we're running on a weird older system with
482 # multiple sockets per NUMA node, which is a configuration that we
483 # explicitly chose not to support.
484 if any(cell.socket is None for cell in host_cells):
485 LOG.debug('No socket information in host NUMA cell(s).')
486 return []
488 # get a set of host sockets that the guest cells are in. Since guest
489 # cell IDs map to host cell IDs, we can just look up the latter's
490 # socket.
491 socket_ids = set()
492 for guest_cell in numa_cells:
493 for host_cell in host_cells:
494 if guest_cell.id == host_cell.id:  [494 ↛ 493: the condition on line 494 was always true]
495 socket_ids.add(host_cell.socket)
497 # now get a set of host NUMA nodes that are in the above sockets
498 allowed_numa_nodes = set()
499 for host_cell in host_cells:
500 if host_cell.socket in socket_ids:  [500 ↛ 499: the condition on line 500 was always true]
501 allowed_numa_nodes.add(host_cell.id)
503 # filter out pools that are not in one of the correct host NUMA nodes.
504 return [
505 pool for pool in pools if any(
506 utils.pci_device_prop_match(pool, [{'numa_node': numa_node}])
507 for numa_node in allowed_numa_nodes
508 )
509 ]
511 def _filter_pools_for_unrequested_pfs(
512 self, pools: ty.List[Pool], request: 'objects.InstancePCIRequest',
513 ) -> ty.List[Pool]:
514 """Filter out pools with PFs, unless these are required.
516 This is necessary in cases where PFs and VFs have the same product_id
517 and generally useful elsewhere.
519 :param pools: A list of PCI device pool dicts
520 :param request: An InstancePCIRequest object describing the type,
521 quantity and required NUMA affinity of device(s) we want.
522 :returns: A list of pools that can be used to support the request if
523 this is possible.
524 """
525 if all(
526 spec.get('dev_type') != fields.PciDeviceType.SRIOV_PF
527 for spec in request.spec
528 ):
529 pools = [
530 pool for pool in pools
531 if not pool.get('dev_type') == fields.PciDeviceType.SRIOV_PF
532 ]
533 return pools
535 def _filter_pools_for_unrequested_vdpa_devices(
536 self,
537 pools: ty.List[Pool],
538 request: 'objects.InstancePCIRequest',
539 ) -> ty.List[Pool]:
540 """Filter out pools with VDPA devices, unless these are required.
542 This is necessary as vdpa devices require special handling and
543 should not be allocated to generic pci device requests.
545 :param pools: A list of PCI device pool dicts
546 :param request: An InstancePCIRequest object describing the type,
547 quantity and required NUMA affinity of device(s) we want.
548 :returns: A list of pools that can be used to support the request if
549 this is possible.
550 """
551 if all(
552 spec.get('dev_type') != fields.PciDeviceType.VDPA
553 for spec in request.spec
554 ):
555 pools = [
556 pool for pool in pools
557 if not pool.get('dev_type') == fields.PciDeviceType.VDPA
558 ]
559 return pools
561 def _filter_pools_for_unrequested_remote_managed_devices(
562 self, pools: ty.List[Pool], request: 'objects.InstancePCIRequest',
563 ) -> ty.List[Pool]:
564 """Filter out pools with remote_managed devices, unless requested.
566 Remote-managed devices are not usable for legacy SR-IOV or hardware
567 offload scenarios and must be excluded from allocation.
569 :param pools: A list of PCI device pool dicts
570 :param request: An InstancePCIRequest object describing the type,
571 quantity and required NUMA affinity of device(s) we want.
572 :returns: A list of pools that can be used to support the request if
573 this is possible.
574 """
575 if all(not strutils.bool_from_string(spec.get(PCI_REMOTE_MANAGED_TAG))
576 for spec in request.spec):
577 pools = [pool for pool in pools
578 if not strutils.bool_from_string(
579 pool.get(PCI_REMOTE_MANAGED_TAG))]
580 return pools
582 def _filter_pools_based_on_placement_allocation(
583 self,
584 pools: ty.List[Pool],
585 request: 'objects.InstancePCIRequest',
586 rp_uuids: ty.List[str],
587 ) -> ty.List[Pool]:
588 if not rp_uuids:
589 # If there is no placement allocation then we don't need to filter
590 # by it. This could happen if the instance only has neutron port
591 # based InstancePCIRequests, as those currently do not have a
592 # placement allocation (except for QoS ports, but those are handled
593 # in a separate codepath), or if the [filter_scheduler]pci_in_placement
594 # configuration option is not enabled in the scheduler.
595 return pools
597 requested_dev_count_per_rp = collections.Counter(rp_uuids)
598 matching_pools = []
599 for pool in pools:
600 rp_uuid = pool.get('rp_uuid')
601 if rp_uuid is None:  [601 ↛ 609: the condition on line 601 was never true]
602 # NOTE(gibi): As rp_uuids is not empty the scheduler allocated
603 # PCI resources on this host, so we know that
604 # [pci]report_in_placement is enabled on this host. But this
605 # pool has no RP mapping, which can only happen if the pool
606 # contains PCI devices with the physical_network tag, as those
607 # devices are not yet reported in placement. But if they are not
608 # reported then we can ignore them here too.
609 continue
611 if (
612 # the placement allocation contains this pool
613 rp_uuid in requested_dev_count_per_rp and
614 # the amount of dev allocated in placement can be consumed
615 # from the pool
616 pool["count"] >= requested_dev_count_per_rp[rp_uuid]
617 ):
618 matching_pools.append(pool)
620 return matching_pools
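# Sketch with made-up UUIDs (added for illustration): rp_uuids of
# ['rp1', 'rp1', 'rp2'] means placement allocated two devices from the
# provider 'rp1' and one from 'rp2'; collections.Counter(rp_uuids) then
# counts {'rp1': 2, 'rp2': 1}, so a pool with {'rp_uuid': 'rp1', 'count': 1}
# is dropped (it cannot cover both devices) while one with
# {'rp_uuid': 'rp1', 'count': 3} is kept.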
622 def _filter_pools_for_live_migratable_devices(
623 self, pools: ty.List[Pool], request: 'objects.InstancePCIRequest',
624 ) -> ty.List[Pool]:
625 """Filter out pools with non-live-migratable devices.
627 :param pools: A list of PCI device pool dicts
628 :param request: An InstancePCIRequest object describing the type,
629 quantity and required NUMA affinity of device(s) we want.
630 :returns: A list of pools that can be used to support the request if
631 this is possible.
632 """
634 # The following code handles the case where 'live_migratable' is
635 # set (either "true" or "false") by filtering devices to select the
636 # appropriate ones.
637 # If it is not set, we skip the next code block and no filtering
638 # is applied to the pools.
639 if all(spec.get("live_migratable") == 'true' for spec in request.spec):
640 # if all specs require live migratable devices, then we need to
641 # reduce the pools by the ones that support them.
642 pools = [pool for pool in pools if pool.get("live_migratable") and
643 pool['live_migratable'] == 'true']
644 elif all(
645 spec.get("live_migratable") == "false" for spec in request.spec
646 ):
647 # If the request asks to NOT support live-migratable devices, then
648 # we don't provide the ones that support them.
649 # We want to exclude the devices that don't have this value yet.
650 pools = [pool for pool in pools if pool.get("live_migratable") and
651 pool['live_migratable'] == 'false']
652 return pools
654 def _filter_pools(
655 self,
656 pools: ty.List[Pool],
657 request: 'objects.InstancePCIRequest',
658 numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']],
659 rp_uuids: ty.List[str],
660 ) -> ty.Optional[ty.List[Pool]]:
661 """Determine if an individual PCI request can be met.
663 Filter pools, which are collections of devices with similar traits, to
664 identify those that can support the provided PCI request.
666 If ``numa_cells`` is provided then NUMA locality may be taken into
667 account, depending on the value of ``request.numa_policy``.
669 :param pools: A list of PCI device pool dicts
670 :param request: An InstancePCIRequest object describing the type,
671 quantity and required NUMA affinity of device(s) we want.
672 :param numa_cells: A list of InstanceNUMACell objects whose ``id``
673 corresponds to the ``id`` of host NUMACell objects.
674 :param rp_uuids: A list of RP uuids this request is fulfilled from in
675 placement, so here we only have to consider the pools matching
676 these RP uuids
677 :returns: A list of pools that can be used to support the request if
678 this is possible, else None.
679 """
680 # NOTE(vladikr): This code may be open to race conditions.
681 # Two concurrent requests may succeed when support_requests is called
682 # because this method does not remove related devices from the pools
684 # Firstly, let's exclude all devices that don't match our spec (e.g.
685 # they've got different PCI IDs or something)
686 before_count = sum([pool['count'] for pool in pools])
687 pools = self._filter_pools_for_spec(pools, request)
688 after_count = sum([pool['count'] for pool in pools])
690 if after_count < before_count:
691 LOG.debug(
692 'Dropped %d device(s) due to mismatched PCI attribute(s)',
693 before_count - after_count
694 )
696 if after_count < request.count:
697 LOG.debug('Not enough PCI devices left to satisfy request')
698 return None
700 # Next, let's exclude all devices that aren't on the correct NUMA node
701 # or socket, *assuming* we have devices and care about that, as
702 # determined by policy
703 before_count = after_count
704 pools = self._filter_pools_for_numa_cells(pools, request, numa_cells)
705 after_count = sum([pool['count'] for pool in pools])
707 if after_count < before_count:
708 LOG.debug(
709 'Dropped %d device(s) as they are on the wrong NUMA node(s)',
710 before_count - after_count
711 )
713 if after_count < request.count:
714 LOG.debug('Not enough PCI devices left to satisfy request')
715 return None
717 # If we're not requesting PFs then we should not use these.
718 # Exclude them.
719 before_count = after_count
720 pools = self._filter_pools_for_unrequested_pfs(pools, request)
721 after_count = sum([pool['count'] for pool in pools])
723 if after_count < before_count:
724 LOG.debug(
725 'Dropped %d device(s) as they are PFs which we have not '
726 'requested',
727 before_count - after_count
728 )
730 if after_count < request.count:  [730 ↛ 731: the condition on line 730 was never true]
731 LOG.debug('Not enough PCI devices left to satisfy request')
732 return None
734 # If we're not requesting VDPA devices then we should not use these
735 # either. Exclude them.
736 before_count = after_count
737 pools = self._filter_pools_for_unrequested_vdpa_devices(pools, request)
738 after_count = sum([pool['count'] for pool in pools])
740 if after_count < before_count:
741 LOG.debug(
742 'Dropped %d device(s) as they are VDPA devices which we have '
743 'not requested',
744 before_count - after_count
745 )
747 # If we're not requesting remote_managed devices then we should not
748 # use these either. Exclude them.
749 before_count = after_count
750 pools = self._filter_pools_for_unrequested_remote_managed_devices(
751 pools, request)
752 after_count = sum([pool['count'] for pool in pools])
754 if after_count < before_count:  [754 ↛ 755: the condition on line 754 was never true]
755 LOG.debug(
756 'Dropped %d device(s) as they are remote-managed devices which '
757 'we have not requested',
758 before_count - after_count
759 )
761 # if there is placement allocation for the request then we have to
762 # remove the pools that are not in the placement allocation
763 before_count = after_count
764 pools = self._filter_pools_based_on_placement_allocation(
765 pools, request, rp_uuids)
766 after_count = sum([pool['count'] for pool in pools])
767 if after_count < before_count:
768 LOG.debug(
769 'Dropped %d device(s) that are not part of the placement '
770 'allocation',
771 before_count - after_count
772 )
774 before_count = after_count
775 pools = self._filter_pools_for_live_migratable_devices(
776 pools, request)
777 after_count = sum([pool['count'] for pool in pools])
778 if after_count < before_count:  [778 ↛ 779: the condition on line 778 was never true]
779 LOG.debug(
780 'Dropped %d device(s) that are not live migratable',
781 before_count - after_count
782 )
784 if after_count < request.count:
785 LOG.debug('Not enough PCI devices left to satisfy request')
786 return None
788 return pools
790 def support_requests(
791 self,
792 requests: ty.List['objects.InstancePCIRequest'],
793 provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]],
794 numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None,
795 ) -> bool:
796 """Determine if the PCI requests can be met.
798 Determine, based on a compute node's PCI stats, if an instance can be
799 scheduled on the node. **Support does not mean real allocation**.
801 If ``numa_cells`` is provided then NUMA locality may be taken into
802 account, depending on the value of ``numa_policy``.
804 :param requests: A list of InstancePCIRequest objects describing the
805 types, quantities and required NUMA affinities of devices we want.
806 :type requests: nova.objects.InstancePCIRequests
807 :param provider_mapping: A dict keyed by RequestGroup requester_id,
808 to a list of resource provider UUIDs which provide resource
809 for that RequestGroup. If it is None then it signals that the
810 InstancePCIRequest objects already store a mapping per request.
811 I.e.: we are called _after_ the scheduler made allocations for this
812 request in placement.
813 :param numa_cells: A list of InstanceNUMACell objects whose ``id``
814 corresponds to the ``id`` of host NUMACells, or None.
815 :returns: Whether this compute node can satisfy the given request.
816 """
818 # Try to apply the requests on a copy of the stats; if they apply
819 # cleanly then we know that the requests are supported. We call apply
820 # only on a copy as we don't want to actually consume resources from
821 # the pool, as at this point this is just a test during host filtering.
822 # Later the scheduler will call apply_requests to consume on the
823 # selected host. The compute will call consume_requests during the PCI
824 # claim to consume not just from the pools but also the PciDevice
825 # objects.
826 stats = copy.deepcopy(self)
827 try:
828 stats.apply_requests(requests, provider_mapping, numa_cells)
829 except exception.PciDeviceRequestFailed:
830 return False
832 return True
834 def _apply_request(
835 self,
836 pools: ty.List[Pool],
837 request: 'objects.InstancePCIRequest',
838 rp_uuids: ty.List[str],
839 numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None,
840 ) -> bool:
841 """Apply an individual PCI request.
843 Apply a PCI request against a given set of PCI device pools, which are
844 collections of devices with similar traits.
846 If ``numa_cells`` is provided then NUMA locality may be taken into
847 account, depending on the value of ``request.numa_policy``.
849 :param pools: A list of PCI device pool dicts
850 :param request: An InstancePCIRequest object describing the type,
851 quantity and required NUMA affinity of device(s) we want.
852 :param rp_uuids: A list of RP uuids this request is fulfilled from in
853 placement
854 :param numa_cells: A list of InstanceNUMACell objects whose ``id``
855 corresponds to the ``id`` of host NUMACell objects.
856 :returns: True if the request was applied against the provided pools
857 successfully, else False.
858 """
859 # NOTE(vladikr): This code may be open to race conditions.
860 # Two concurrent requests may succeed when support_requests is called
861 # because this method does not remove related devices from the pools
863 filtered_pools = self._filter_pools(
864 pools, request, numa_cells, rp_uuids)
866 if not filtered_pools:
867 return False
869 if not rp_uuids:
870 # If there is no placement allocation for this request then we are
871 # free to consume from the filtered pools in any order
872 count = request.count
873 for pool in filtered_pools:  [873 ↛ 894: the loop on line 873 didn't complete]
874 count = self._decrease_pool_count(pools, pool, count)
875 if not count:
876 break
877 else:
878 # but if there is placement allocation then we have to follow that
880 if not self._assert_one_pool_per_rp_uuid(pools):
881 return False
883 requested_devs_per_pool_rp = collections.Counter(rp_uuids)
884 for pool in filtered_pools:
885 count = requested_devs_per_pool_rp[pool['rp_uuid']]
886 pool['count'] -= count
887 # we consumed all the requested devices for the rp_uuid
888 # so we can drop that rp_uuid from the request.
889 requested_devs_per_pool_rp.pop(pool['rp_uuid'], None)
891 if pool['count'] == 0:
892 pools.remove(pool)
894 return True
896 def _get_rp_uuids_for_request(
897 self,
898 provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]],
899 request: 'objects.InstancePCIRequest'
900 ) -> ty.List[str]:
901 """Return the list of RP uuids that are fulfilling the request.
903 An RP will be in the list as many times as the number of devices
904 that need to be allocated from that RP.
905 """
907 if request.source == objects.InstancePCIRequest.NEUTRON_PORT:
908 # TODO(gibi): support neutron based requests in a later cycle
909 # an empty list will signal that any PCI pool can be used for this
910 # request
911 return []
913 if not provider_mapping:
914 # NOTE(gibi): AFAIK spec is always a list containing a single dict
915 # but the object is hard to change retroactively
916 rp_uuids = request.spec[0].get('rp_uuids')
917 if not rp_uuids:
918 # This can happen if [filter_scheduler]pci_in_placement is not
919 # enabled yet
920 # An empty list will signal that any PCI pool can be used for
921 # this request
922 return []
924 # TODO(gibi): this is baaad but spec is a dict of strings so
925 # the list is serialized
926 return rp_uuids.split(',')
928 # NOTE(gibi): the PCI prefilter generates RequestGroup suffixes from
929 # InstancePCIRequests in the form of {request_id}-{count_index}
930 # NOTE(gibi): a suffixed request group is always fulfilled from a
931 # single RP
932 return [
933 rp_uuids[0]
934 for group_id, rp_uuids in provider_mapping.items()
935 if group_id.startswith(request.request_id)
936 ]
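# Sketch with made-up identifiers (added for illustration): for a flavor
# based request with request_id 'abc' and count 2 the PCI prefilter creates
# the suffixed groups 'abc-0' and 'abc-1', so a provider_mapping of
#   {'abc-0': ['rp1'], 'abc-1': ['rp2'], 'port-group': ['rp9']}
# yields ['rp1', 'rp2'] for this request, while the unrelated 'port-group'
# entry is ignored because it does not start with the request_id.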
938 def apply_requests(
939 self,
940 requests: ty.List['objects.InstancePCIRequest'],
941 provider_mapping: ty.Optional[ty.Dict[str, ty.List[str]]],
942 numa_cells: ty.Optional[ty.List['objects.InstanceNUMACell']] = None,
943 ) -> None:
944 """Apply PCI requests to the PCI stats.
946 This is used in multiple instance creation, when the scheduler has to
947 keep track of how the resources are consumed by the instances.
949 If ``numa_cells`` is provided then NUMA locality may be taken into
950 account, depending on the value of ``numa_policy``.
952 :param requests: A list of InstancePCIRequest objects describing the
953 types, quantities and required NUMA affinities of devices we want.
954 :type requests: nova.objects.InstancePCIRequests
955 :param provider_mapping: A dict keyed by RequestGroup requester_id,
956 to a list of resource provider UUIDs which provide resource
957 for that RequestGroup. If it is None then it signals that the
958 InstancePCIRequest objects already store a mapping per request.
959 I.e.: we are called _after_ the scheduler made allocations for this
960 request in placement.
961 :param numa_cells: A list of InstanceNUMACell objects whose ``id``
962 corresponds to the ``id`` of host NUMACells, or None.
963 :raises: exception.PciDeviceRequestFailed if this compute node cannot
964 satisfy the given request.
965 """
967 for r in requests:
968 rp_uuids = self._get_rp_uuids_for_request(provider_mapping, r)
970 if not self._apply_request(self.pools, r, rp_uuids, numa_cells):
971 raise exception.PciDeviceRequestFailed(requests=requests)
973 def __iter__(self) -> ty.Iterator[Pool]:
974 pools: ty.List[Pool] = []
975 for pool in self.pools:
976 pool = copy.deepcopy(pool)
977 # 'devices' shouldn't be part of stats
978 if 'devices' in pool:
979 del pool['devices']
980 pools.append(pool)
981 return iter(pools)
983 def clear(self) -> None:
984 """Clear all the stats maintained."""
985 self.pools = []
987 def __eq__(self, other: object) -> bool:
988 if not isinstance(other, PciDeviceStats):  [988 ↛ 989: the condition on line 988 was never true]
989 return NotImplemented
990 return self.pools == other.pools
992 def to_device_pools_obj(self) -> 'objects.PciDevicePoolList':
993 """Return the contents of the pools as a PciDevicePoolList object."""
994 stats = [x for x in self]
995 return pci_device_pool.from_pci_stats(stats)
997 def has_remote_managed_device_pools(self) -> bool:
998 """Determine whether remote managed device pools are present on a host.
1000 The check is pool-based, not free device-based and is NUMA cell
1001 agnostic.
1002 """
1003 dummy_req = objects.InstancePCIRequest(
1004 count=0,
1005 spec=[{'remote_managed': True}]
1006 )
1007 pools = self._filter_pools_for_spec(self.pools, dummy_req)
1008 return bool(pools)
1010 def populate_pools_metadata_from_assigned_devices(self):
1011 """Populate the rp_uuid of each pool based on the rp_uuid of the
1012 devices assigned to the pool. This can only be called from the compute
1013 where devices are assigned to each pool. This should not be called from
1014 the scheduler, as the device to pool assignment is not known there.
1015 """
1016 # PciDevices are tracked in placement and flavor based PCI requests
1017 # are scheduled and allocated in placement. To be able to correlate
1018 # what is allocated in placement and what is consumed in nova we
1019 # need to map device pools to RPs. We can do that as the PciDevice
1020 # contains the RP UUID that represents it in placement.
1021 # NOTE(gibi): We cannot do this when the device is originally added to
1022 # the pool as the device -> placement translation, which creates the
1023 # RPs, runs after all the devices are created and assigned to pools.
1024 for pool in self.pools:
1025 pool_rps = {
1026 dev.extra_info.get("rp_uuid")
1027 for dev in pool["devices"]
1028 if "rp_uuid" in dev.extra_info
1029 }
1030 if len(pool_rps) >= 2:
1031 # FIXME(gibi): Do we have a 1:1 pool - RP mapping even
1032 # if two PFs provide very similar VFs?
1033 raise ValueError(
1034 "We have a pool %s connected to more than one RPs %s in "
1035 "placement via devs %s" % (pool, pool_rps, pool["devices"])
1036 )
1038 if not pool_rps:
1039 # this can happen if the nova-compute is upgraded to have the
1040 # PCI in placement inventory handling code but
1041 # [pci]report_in_placement is not turned on yet.
1042 continue
1044 if pool_rps:  # now we know that it is a single RP  [1044 ↛ 1024: the condition on line 1044 was always true]
1045 pool['rp_uuid'] = next(iter(pool_rps))
1047 @staticmethod
1048 def _assert_one_pool_per_rp_uuid(pools: ty.List[Pool]) -> bool:
1049 """Assert that each pool has a unique rp_uuid, if any.
1051 :param pools: A list of Pool objects.
1052 :return: True if each pool has a unique rp_uuid or no rp_uuid assigned,
1053 False otherwise.
1054 """
1055 pools_per_rp_uuid = collections.defaultdict(list)
1056 for pool in pools:
1057 if "rp_uuid" in pool:
1058 pools_per_rp_uuid[pool["rp_uuid"]].append(pool)
1060 split_rp_uuids = {
1061 rp_uuid: pools
1062 for rp_uuid, pools in pools_per_rp_uuid.items()
1063 if len(pools) > 1}
1065 if split_rp_uuids:
1066 LOG.warning(
1067 "The PCI allocation logic assumes that devices "
1068 "related to the same rp_uuid are in the same pool. "
1069 "However the following rp_uuids are split across multiple "
1070 "pools. This should not happen. Please file a bug report. %s",
1071 split_rp_uuids
1072 )
1073 return not split_rp_uuids