|
| 1 | +From baccaee09ccdc43f02a1ed551d6d4376fb33b9e4 Mon Sep 17 00:00:00 2001 |
| 2 | +From: Vertex X7-53 < [email protected]> |
| 3 | +Date: Sun, 17 Aug 2025 01:34:00 +0100 |
| 4 | +Subject: [PATCH] xen/pciback: Pass through PME events and allow guest control |
| 5 | + |
| 6 | +An important part of S0ix power management is the control of PCI device D-states. |
| 7 | +Unless both the device and any applicable PCI bridges are in D3cold, the PMC will |
| 8 | +keep power applied to the bus, and in most cases this will prevent the CPU from reaching states lower than Package C2. |
| 9 | + |
| 10 | +The vast majority of devices depend on PME (Power Management Events) to |
| 11 | +wake from D3cold, so Linux will not attempt to put them into deeper |
| 12 | +sleep states if it detects the device does not support PME. |
| 13 | +PMEs can be delivered in a variety of ways, which include interrupts |
| 14 | +on the pcieport, ACPI events, and the setting of the PME status register in |
| 15 | +the PCI configuration space. Up until now, Xen has not supported the |
| 16 | +passthrough of PMEs to domains, and has masked the relevant PME bits in the configuration space. |
| 17 | + |
| 18 | +This first patch is a modification to the dom0 kernel, specifically pciback. |
| 19 | +We enable support for runtime PM in pciback, to allow the dom0 kernel |
| 20 | +to suspend upstream bridges. Then we allow domains to read PME capability registers. |
| 21 | +When dom0 receives a PME, it forwards this to pciback, and pciback then sets |
| 22 | +a special emulated flag on the device. This flag is cleared by the guest when it |
| 23 | +writes 1 to the PME status bit (write-1-to-clear) after handling the event. We also respond to requests |
| 24 | +from the guest to change the power state, and put the pciback device into the matching runtime PM state |
| 25 | +in dom0, so that dom0 can opportunistically suspend any upstream PCIe ports. |
| 26 | +--- |
| 27 | + .../xen/xen-pciback/conf_space_capability.c | 100 +++++++++++------- |
| 28 | + drivers/xen/xen-pciback/pci_stub.c | 30 ++++++ |
| 29 | + drivers/xen/xen-pciback/pciback.h | 1 + |
| 30 | + 3 files changed, 91 insertions(+), 40 deletions(-) |
| 31 | + |
| 32 | +diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c |
| 33 | +index cf568e899ee2..ca0eb6b81907 100644 |
| 34 | +--- a/drivers/xen/xen-pciback/conf_space_capability.c |
| 35 | ++++ b/drivers/xen/xen-pciback/conf_space_capability.c |
| 36 | +@@ -8,8 +8,11 @@ |
| 37 | + |
| 38 | + #include <linux/kernel.h> |
| 39 | + #include <linux/pci.h> |
| 40 | ++#include <linux/pm.h> |
| 41 | ++#include <linux/pm_runtime.h> |
| 42 | + #include "pciback.h" |
| 43 | + #include "conf_space.h" |
| 44 | ++#include "../../pci/pci.h" |
| 45 | + |
| 46 | + static LIST_HEAD(capabilities); |
| 47 | + struct xen_pcibk_config_capability { |
| 48 | +@@ -91,39 +94,84 @@ static const struct config_field caplist_vpd[] = { |
| 49 | + {} |
| 50 | + }; |
| 51 | + |
| 52 | +-static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value, |
| 53 | ++static int pm_ctrl_read(struct pci_dev *dev, int offset, u16 *value, |
| 54 | + void *data) |
| 55 | + { |
| 56 | + int err; |
| 57 | + u16 real_value; |
| 58 | + |
| 59 | +- err = pci_read_config_word(dev, offset, &real_value); |
| 60 | +- if (err) |
| 61 | +- goto out; |
| 62 | ++ /* Read handler for PCI_PM_CTRL. Driver domains cannot wake devices from D3cold themselves (no ACPI access), |
| 63 | ++ * so D3hot is reported instead; a later wakeup write from the guest is carried out with runtime PM functions. |
| 64 | ++ * NO_SOFT_RESET is reported because the host side performs the reset on the transition out of D3cold. |
| 65 | ++ */ |
| 66 | ++ if (dev->current_state == PCI_D3cold) { |
| 67 | ++ err = 0; /* value is synthesized, not read from hardware: without this, err is returned uninitialized */ |
| 68 | ++ real_value = PCI_D3hot | PCI_PM_CTRL_NO_SOFT_RESET; |
| 69 | ++ } else { |
| 70 | ++ err = pci_read_config_word(dev, offset, &real_value); |
| 71 | ++ if (err) |
| 72 | ++ goto out; |
| 73 | ++ } |
| 74 | + |
| 75 | +- *value = real_value & ~PCI_PM_CAP_PME_MASK; |
| 76 | ++ const struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); |
| 77 | ++ if (dev_data->pme_status) /* emulated PME flag: surface it to the guest as status + enable */ |
| 78 | ++ real_value |= (PCI_PM_CTRL_PME_STATUS | PCI_PM_CTRL_PME_ENABLE); |
| 79 | ++ |
| 80 | ++ *value = real_value; |
| 81 | + |
| 82 | + out: |
| 83 | + return err; |
| 84 | + } |
| 85 | + |
| 86 | +-/* PM_OK_BITS specifies the bits that the driver domain is allowed to change. |
| 87 | +- * Can't allow driver domain to enable PMEs - they're shared */ |
| 88 | +-#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK) |
| 89 | ++/* PM_OK_BITS specifies the bits that the driver domain is allowed to change. */ |
| 90 | ++#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_PME_ENABLE|PCI_PM_CTRL_DATA_SEL_MASK) |
| 91 | + |
| 92 | + static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value, |
| 93 | + void *data) |
| 94 | + { |
| 95 | + int err; |
| 96 | ++ int pm_err; |
| 97 | + u16 old_value; |
| 98 | + pci_power_t new_state; |
| 99 | + |
| 100 | ++ /* PME status is RW1CS */ |
| 101 | ++ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev); |
| 102 | ++ if (new_value & PCI_PM_CTRL_PME_STATUS) { |
| 103 | ++ dev_data->pme_status = 0; |
| 104 | ++ } |
| 105 | ++ |
| 106 | ++ new_state = (__force pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK); |
| 107 | ++ |
| 108 | ++ /* First, use pm ops to transition state */ |
| 109 | ++ dev_dbg(&dev->dev, "transitioning power state from %x to %x\n", dev->current_state, new_state); |
| 110 | ++ |
| 111 | ++ pm_runtime_barrier(&dev->dev); |
| 112 | ++ bool runtime_pm = pm_runtime_enabled(&dev->dev); |
| 113 | ++ if (runtime_pm) { |
| 114 | ++ if (dev->dev.power.runtime_status == RPM_SUSPENDED && new_state < PCI_D3hot) { |
| 115 | ++ pm_err = pm_runtime_resume(&dev->dev); |
| 116 | ++ if (pm_err) dev_err(&dev->dev, "failed to resume device: %d\n", pm_err); |
| 117 | ++ } else if (dev->dev.power.runtime_status == RPM_ACTIVE && new_state >= PCI_D3hot) { |
| 118 | ++ pm_err = pm_runtime_suspend(&dev->dev); |
| 119 | ++ if (pm_err) dev_err(&dev->dev, "failed to suspend device: %d\n", pm_err); |
| 120 | ++ } |
| 121 | ++ } |
| 122 | ++ |
| 123 | ++ /* Otherwise, set it manually */ |
| 124 | ++ if (!runtime_pm || dev->current_state != new_state) { |
| 125 | ++ err = pci_set_power_state(dev, new_state); |
| 126 | ++ if (err) { |
| 127 | ++ dev_err(&dev->dev, "failed to manually set pci power state to %x: %d\n", new_state, err); |
| 128 | ++ err = PCIBIOS_SET_FAILED; |
| 129 | ++ goto out; |
| 130 | ++ } |
| 131 | ++ } |
| 132 | ++ |
| 133 | ++ /* This must happen here, after pm_runtime_resume is called */ |
| 134 | + err = pci_read_config_word(dev, offset, &old_value); |
| 135 | + if (err) |
| 136 | + goto out; |
| 137 | + |
| 138 | +- new_state = (__force pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK); |
| 139 | +- |
| 140 | + new_value &= PM_OK_BITS; |
| 141 | + if ((old_value & PM_OK_BITS) != new_value) { |
| 142 | + new_value = (old_value & ~PM_OK_BITS) | new_value; |
| 143 | +@@ -132,48 +180,20 @@ static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value, |
| 144 | + goto out; |
| 145 | + } |
| 146 | + |
| 147 | +- /* Let pci core handle the power management change */ |
| 148 | +- dev_dbg(&dev->dev, "set power state to %x\n", new_state); |
| 149 | +- err = pci_set_power_state(dev, new_state); |
| 150 | +- if (err) { |
| 151 | +- err = PCIBIOS_SET_FAILED; |
| 152 | +- goto out; |
| 153 | +- } |
| 154 | +- |
| 155 | + out: |
| 156 | + return err; |
| 157 | + } |
| 158 | + |
| 159 | +-/* Ensure PMEs are disabled */ |
| 160 | +-static void *pm_ctrl_init(struct pci_dev *dev, int offset) |
| 161 | +-{ |
| 162 | +- int err; |
| 163 | +- u16 value; |
| 164 | +- |
| 165 | +- err = pci_read_config_word(dev, offset, &value); |
| 166 | +- if (err) |
| 167 | +- goto out; |
| 168 | +- |
| 169 | +- if (value & PCI_PM_CTRL_PME_ENABLE) { |
| 170 | +- value &= ~PCI_PM_CTRL_PME_ENABLE; |
| 171 | +- err = pci_write_config_word(dev, offset, value); |
| 172 | +- } |
| 173 | +- |
| 174 | +-out: |
| 175 | +- return err ? ERR_PTR(err) : NULL; |
| 176 | +-} |
| 177 | +- |
| 178 | + static const struct config_field caplist_pm[] = { |
| 179 | + { |
| 180 | + .offset = PCI_PM_PMC, |
| 181 | + .size = 2, |
| 182 | +- .u.w.read = pm_caps_read, |
| 183 | ++ .u.w.read = xen_pcibk_read_config_word, |
| 184 | + }, |
| 185 | + { |
| 186 | + .offset = PCI_PM_CTRL, |
| 187 | + .size = 2, |
| 188 | +- .init = pm_ctrl_init, |
| 189 | +- .u.w.read = xen_pcibk_read_config_word, |
| 190 | ++ .u.w.read = pm_ctrl_read, |
| 191 | + .u.w.write = pm_ctrl_write, |
| 192 | + }, |
| 193 | + { |
| 194 | +diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c |
| 195 | +index 073b259747e9..461704454546 100644 |
| 196 | +--- a/drivers/xen/xen-pciback/pci_stub.c |
| 197 | ++++ b/drivers/xen/xen-pciback/pci_stub.c |
| 198 | +@@ -18,6 +18,8 @@ |
| 199 | + #include <linux/wait.h> |
| 200 | + #include <linux/sched.h> |
| 201 | + #include <linux/atomic.h> |
| 202 | ++#include <linux/pm.h> |
| 203 | ++#include <linux/pm_runtime.h> |
| 204 | + #include <xen/events.h> |
| 205 | + #include <xen/pci.h> |
| 206 | + #include <xen/xen.h> |
| 207 | +@@ -153,6 +155,7 @@ static void pcistub_device_release(struct kref *kref) |
| 208 | + |
| 209 | + kfree(dev_data); |
| 210 | + pci_set_drvdata(dev, NULL); |
| 211 | ++ pm_runtime_get_noresume(&dev->dev); |
| 212 | + |
| 213 | + /* Clean-up the device */ |
| 214 | + xen_pcibk_config_free_dyn_fields(dev); |
| 215 | +@@ -494,6 +497,8 @@ static int pcistub_init_device(struct pcistub_device *psdev) |
| 216 | + xen_pcibk_reset_device(dev); |
| 217 | + |
| 218 | + pci_set_dev_assigned(dev); |
| 219 | ++ pm_runtime_put_noidle(&dev->dev); |
| 220 | ++ |
| 221 | + return 0; |
| 222 | + |
| 223 | + config_release: |
| 224 | +@@ -1073,6 +1078,29 @@ static int xen_pcibk_suspend_noirq(struct device *dev) { |
| 225 | + return 0; |
| 226 | + } |
| 227 | + |
| 228 | ++/* Runtime-idle callback. The runtime PM state is a virtual mirror of the guest's view, so return |
| 229 | ++ * -EBUSY while the emulated PME flag (pme_status) is still set or until current_state is D3hot or |
| 230 | ++ * deeper; only then return 0. NOTE(review): dev_data is not NULL-checked - confirm drvdata is set. */ |
| 231 | ++static int xen_pcibk_pm_idle(struct device *dev) |
| 232 | ++{ |
| 233 | ++ struct pci_dev *pci_dev = to_pci_dev(dev); |
| 234 | ++ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(pci_dev); |
| 235 | ++ if (dev_data->pme_status) return -EBUSY; |
| 236 | ++ |
| 237 | ++ return pci_dev->current_state >= PCI_D3hot ? 0 : -EBUSY; |
| 238 | ++} |
| 239 | ++ |
| 240 | ++static int xen_pcibk_pm_resume(struct device *dev) |
| 241 | ++{ |
| 242 | ++ /* Runtime-resume callback: set the emulated PME flag on every resume, whether or not the device supports PME. |
| 243 | ++ * Per the author this is a non-issue: guest kernel logic will just wake up the device if it isn't already awake. */ |
| 244 | ++ struct pci_dev *pci_dev = to_pci_dev(dev); |
| 245 | ++ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(pci_dev); |
| 246 | ++ dev_data->pme_status = 1; |
| 247 | ++ |
| 248 | ++ return 0; |
| 249 | ++} |
| 250 | ++ |
| 251 | + /*add xen_pcibk AER handling*/ |
| 252 | + static const struct pci_error_handlers xen_pcibk_error_handler = { |
| 253 | + .error_detected = xen_pcibk_error_detected, |
| 254 | +@@ -1083,6 +1111,8 @@ static const struct pci_error_handlers xen_pcibk_error_handler = { |
| 255 | + |
| 256 | + static const struct dev_pm_ops xen_pcibk_pm_ops = { |
| 257 | + .suspend_noirq = xen_pcibk_suspend_noirq, |
| 258 | ++ .runtime_idle = xen_pcibk_pm_idle, |
| 259 | ++ .runtime_resume = xen_pcibk_pm_resume, |
| 260 | + }; |
| 261 | + |
| 262 | + /* |
| 263 | +diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h |
| 264 | +index cf6df6964664..52774972cc3c 100644 |
| 265 | +--- a/drivers/xen/xen-pciback/pciback.h |
| 266 | ++++ b/drivers/xen/xen-pciback/pciback.h |
| 267 | +@@ -56,6 +56,7 @@ struct xen_pcibk_dev_data { |
| 268 | + unsigned int isr_on:1; /* Whether the IRQ handler is installed. */ |
| 269 | + unsigned int ack_intr:1; /* .. and ACK-ing */ |
| 270 | + unsigned long handled; |
| 271 | ++ unsigned int pme_status:1; |
| 272 | + unsigned int irq; /* Saved in case device transitions to MSI/MSI-X */ |
| 273 | + char irq_name[]; /* xen-pcibk[000:04:00.0] */ |
| 274 | + }; |
| 275 | +-- |
| 276 | +2.49.0 |
| 277 | + |
0 commit comments