Skip to content

Commit 4edcfe6

Browse files
Add patch for runtime PM and PMEs
1 parent ece7129 commit 4edcfe6

File tree

2 files changed

+283
-0
lines changed

2 files changed

+283
-0
lines changed

kernel.spec.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ Patch32: 0001-Revert-e1000e-change-k1-configuration-on-MTP-and-lat.patch
151151
Patch61: xen-events-Add-wakeup-support-to-xen-pirq.patch
152152
Patch62: xen-pm-use-suspend.patch
153153
Patch63: xen-pciback-pm-suspend.patch
154+
Patch64: xen-pciback-pm-runtime.patch
154155

155156
%description
156157
Qubes Dom0 kernel.

xen-pciback-pm-runtime.patch

Lines changed: 282 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,282 @@
1+
From baccaee09ccdc43f02a1ed551d6d4376fb33b9e4 Mon Sep 17 00:00:00 2001
2+
From: Vertex X7-53 <[email protected]>
3+
Date: Sun, 17 Aug 2025 01:34:00 +0100
4+
Subject: [PATCH] xen/pciback: Pass through PME events and allow guest control
5+
6+
An important part of S0ix power management is the control of PCI device D-states.
7+
Without both the device and any applicable PCI bridges in D3cold, the PMC will
8+
keep power applied to the bus, and in most cases this will prevent the CPU from reaching states lower than Package C2.
9+
10+
The vast majority of devices depend on PME (Power Management Events) to
11+
wake from D3cold, so Linux will not attempt to put them into deeper
12+
sleep states if it detects the device does not support PME.
13+
PMEs can be delivered a variety of different ways, which include interrupts
14+
on the pcieport, ACPI events, and the setting of the PME status register in
15+
the PCI configuration space. Up until now, Xen has not supported the
16+
passthrough of PMEs to domains, and masks the relevant PME bits in the configuration space.
17+
18+
This first patch is a modification to the dom0 kernel, specifically pciback.
19+
We enable support for runtime PM in pciback, to allow the dom0 kernel
20+
to suspend upstream bridges. Then we allow domains to read PME capability registers.
21+
When dom0 receives a PME, it forwards this to pciback, and pciback then sets
22+
a special emulated flag on the device. This flag is cleared by the guest when it
23+
writes 1 to the PME status bit (write-1-to-clear), after handling the event. We also respond to requests
24+
from the guest to change the power state and place pciback in a PM state
25+
in dom0 depending on this, in order for dom0 to opportunistically suspend any upstream PCIe ports.
26+
27+
As dom0-controlled suspend (via qubes_exp_pm_suspend) is incompatible
28+
with this behaviour, any devices with this flag set will have runtime
29+
power management disabled and revert to the old behaviour where the
30+
guest does not interact with the Dom0 runtime power management stack.
31+
---
32+
.../xen/xen-pciback/conf_space_capability.c | 100 +++++++++++-------
33+
drivers/xen/xen-pciback/pci_stub.c | 30 ++++++
34+
drivers/xen/xen-pciback/pciback.h | 1 +
35+
3 files changed, 91 insertions(+), 40 deletions(-)
36+
37+
diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c
38+
index cf568e899ee2..ca0eb6b81907 100644
39+
--- a/drivers/xen/xen-pciback/conf_space_capability.c
40+
+++ b/drivers/xen/xen-pciback/conf_space_capability.c
41+
@@ -8,8 +8,11 @@
42+
43+
#include <linux/kernel.h>
44+
#include <linux/pci.h>
45+
+#include <linux/pm.h>
46+
+#include <linux/pm_runtime.h>
47+
#include "pciback.h"
48+
#include "conf_space.h"
49+
+#include "../../pci/pci.h"
50+
51+
static LIST_HEAD(capabilities);
52+
struct xen_pcibk_config_capability {
53+
@@ -91,39 +94,84 @@ static const struct config_field caplist_vpd[] = {
54+
{}
55+
};
56+
57+
-static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value,
58+
+static int pm_ctrl_read(struct pci_dev *dev, int offset, u16 *value,
59+
void *data)
60+
{
61+
int err;
62+
u16 real_value;
63+
64+
- err = pci_read_config_word(dev, offset, &real_value);
65+
- if (err)
66+
- goto out;
67+
+ /* Driver domains have no ability to wake devices from D3cold on their own, as they have no access to ACPI.
68+
+ * As a substitute, we fake D3hot to the guest so the register read succeeds. When the guest sends us a wakeup command,
69+
+ * we'll carry out the necessary steps to wake the device from D3cold using runtime PM functions.
70+
+ */
71+
+ if (dev->current_state == PCI_D3cold) {
72+
+ /* No soft reset needed by the guest, because the host side will perform one on transition out of D3cold. */
73+
+ real_value = PCI_D3hot | PCI_PM_CTRL_NO_SOFT_RESET;
74+
+ } else {
75+
+ err = pci_read_config_word(dev, offset, &real_value);
76+
+ if (err)
77+
+ goto out;
78+
+ }
79+
80+
- *value = real_value & ~PCI_PM_CAP_PME_MASK;
81+
+ const struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
82+
+ if (dev_data->pme_status)
83+
+ real_value |= (PCI_PM_CTRL_PME_STATUS | PCI_PM_CTRL_PME_ENABLE);
84+
+
85+
+ *value = real_value;
86+
87+
out:
88+
return err;
89+
}
90+
91+
-/* PM_OK_BITS specifies the bits that the driver domain is allowed to change.
92+
- * Can't allow driver domain to enable PMEs - they're shared */
93+
-#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK)
94+
+/* PM_OK_BITS specifies the bits that the driver domain is allowed to change. */
95+
+#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_PME_ENABLE|PCI_PM_CTRL_DATA_SEL_MASK)
96+
97+
static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
98+
void *data)
99+
{
100+
int err;
101+
+ int pm_err;
102+
u16 old_value;
103+
pci_power_t new_state;
104+
105+
+ /* PME status is RW1CS */
106+
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
107+
+ if (new_value & PCI_PM_CTRL_PME_STATUS) {
108+
+ dev_data->pme_status = 0;
109+
+ }
110+
+
111+
+ new_state = (__force pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
112+
+
113+
+ /* First, use pm ops to transition state */
114+
+ dev_dbg(&dev->dev, "transitioning power state from %x to %x\n", dev->current_state, new_state);
115+
+
116+
+ pm_runtime_barrier(&dev->dev);
117+
+ bool runtime_pm = pm_runtime_enabled(&dev->dev);
118+
+ if (runtime_pm) {
119+
+ if (dev->dev.power.runtime_status == RPM_SUSPENDED && new_state < PCI_D3hot) {
120+
+ pm_err = pm_runtime_resume(&dev->dev);
121+
+ if (pm_err) dev_err(&dev->dev, "failed to resume device: %d\n", pm_err);
122+
+ } else if (dev->dev.power.runtime_status == RPM_ACTIVE && new_state >= PCI_D3hot) {
123+
+ pm_err = pm_runtime_suspend(&dev->dev);
124+
+ if (pm_err) dev_err(&dev->dev, "failed to suspend device: %d\n", pm_err);
125+
+ }
126+
+ }
127+
+
128+
+ /* Otherwise, set it manually */
129+
+ if (!runtime_pm || dev->current_state != new_state) {
130+
+ err = pci_set_power_state(dev, new_state);
131+
+ if (err) {
132+
+ dev_err(&dev->dev, "failed to manually set pci power state to %x: %d\n", new_state, err);
133+
+ err = PCIBIOS_SET_FAILED;
134+
+ goto out;
135+
+ }
136+
+ }
137+
+
138+
+ /* This must happen here, after pm_runtime_resume is called */
139+
err = pci_read_config_word(dev, offset, &old_value);
140+
if (err)
141+
goto out;
142+
143+
- new_state = (__force pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
144+
-
145+
new_value &= PM_OK_BITS;
146+
if ((old_value & PM_OK_BITS) != new_value) {
147+
new_value = (old_value & ~PM_OK_BITS) | new_value;
148+
@@ -132,48 +180,20 @@ static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
149+
goto out;
150+
}
151+
152+
- /* Let pci core handle the power management change */
153+
- dev_dbg(&dev->dev, "set power state to %x\n", new_state);
154+
- err = pci_set_power_state(dev, new_state);
155+
- if (err) {
156+
- err = PCIBIOS_SET_FAILED;
157+
- goto out;
158+
- }
159+
-
160+
out:
161+
return err;
162+
}
163+
164+
-/* Ensure PMEs are disabled */
165+
-static void *pm_ctrl_init(struct pci_dev *dev, int offset)
166+
-{
167+
- int err;
168+
- u16 value;
169+
-
170+
- err = pci_read_config_word(dev, offset, &value);
171+
- if (err)
172+
- goto out;
173+
-
174+
- if (value & PCI_PM_CTRL_PME_ENABLE) {
175+
- value &= ~PCI_PM_CTRL_PME_ENABLE;
176+
- err = pci_write_config_word(dev, offset, value);
177+
- }
178+
-
179+
-out:
180+
- return err ? ERR_PTR(err) : NULL;
181+
-}
182+
-
183+
static const struct config_field caplist_pm[] = {
184+
{
185+
.offset = PCI_PM_PMC,
186+
.size = 2,
187+
- .u.w.read = pm_caps_read,
188+
+ .u.w.read = xen_pcibk_read_config_word,
189+
},
190+
{
191+
.offset = PCI_PM_CTRL,
192+
.size = 2,
193+
- .init = pm_ctrl_init,
194+
- .u.w.read = xen_pcibk_read_config_word,
195+
+ .u.w.read = pm_ctrl_read,
196+
.u.w.write = pm_ctrl_write,
197+
},
198+
{
199+
diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c
200+
index 073b259747e9..461704454546 100644
201+
--- a/drivers/xen/xen-pciback/pci_stub.c
202+
+++ b/drivers/xen/xen-pciback/pci_stub.c
203+
@@ -18,6 +18,8 @@
204+
#include <linux/wait.h>
205+
#include <linux/sched.h>
206+
#include <linux/atomic.h>
207+
+#include <linux/pm.h>
208+
+#include <linux/pm_runtime.h>
209+
#include <xen/events.h>
210+
#include <xen/pci.h>
211+
#include <xen/xen.h>
212+
@@ -153,6 +155,7 @@ static void pcistub_device_release(struct kref *kref)
213+
214+
kfree(dev_data);
215+
pci_set_drvdata(dev, NULL);
216+
+ pm_runtime_get_noresume(&dev->dev);
217+
218+
/* Clean-up the device */
219+
xen_pcibk_config_free_dyn_fields(dev);
220+
@@ -494,6 +497,8 @@ static int pcistub_init_device(struct pcistub_device *psdev)
221+
xen_pcibk_reset_device(dev);
222+
223+
pci_set_dev_assigned(dev);
224+
+ pm_runtime_put_noidle(&dev->dev);
225+
+
226+
return 0;
227+
228+
config_release:
229+
@@ -1073,6 +1078,29 @@ static int xen_pcibk_suspend_noirq(struct device *dev) {
230+
return 0;
231+
}
232+
233+
+/* Since this is a virtual representation of the PM state, we only allow the device
234+
+ * to enter the "suspended" state after the guest commands the device into D3hot
235+
+*/
236+
+static int xen_pcibk_pm_idle(struct device *dev)
237+
+{
238+
+ struct pci_dev *pci_dev = to_pci_dev(dev);
239+
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(pci_dev);
240+
+ if (dev_data->pme_status) return -EBUSY;
241+
+
242+
+ return pci_dev->current_state >= PCI_D3hot ? 0 : -EBUSY;
243+
+}
244+
+
245+
+static int xen_pcibk_pm_resume(struct device *dev)
246+
+{
247+
+ /* PME bit is always asserted on wakeup, regardless of whether the device supports it or not
248+
+ * This is a non-issue, since guest kernel logic will just wake up the device if it isn't already awake */
249+
+ struct pci_dev *pci_dev = to_pci_dev(dev);
250+
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(pci_dev);
251+
+ dev_data->pme_status = 1;
252+
+
253+
+ return 0;
254+
+}
255+
+
256+
/*add xen_pcibk AER handling*/
257+
static const struct pci_error_handlers xen_pcibk_error_handler = {
258+
.error_detected = xen_pcibk_error_detected,
259+
@@ -1083,6 +1111,8 @@ static const struct pci_error_handlers xen_pcibk_error_handler = {
260+
261+
static const struct dev_pm_ops xen_pcibk_pm_ops = {
262+
.suspend_noirq = xen_pcibk_suspend_noirq,
263+
+ .runtime_idle = xen_pcibk_pm_idle,
264+
+ .runtime_resume = xen_pcibk_pm_resume,
265+
};
266+
267+
/*
268+
diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h
269+
index cf6df6964664..52774972cc3c 100644
270+
--- a/drivers/xen/xen-pciback/pciback.h
271+
+++ b/drivers/xen/xen-pciback/pciback.h
272+
@@ -56,6 +56,7 @@ struct xen_pcibk_dev_data {
273+
unsigned int isr_on:1; /* Whether the IRQ handler is installed. */
274+
unsigned int ack_intr:1; /* .. and ACK-ing */
275+
unsigned long handled;
276+
+ unsigned int pme_status:1;
277+
unsigned int irq; /* Saved in case device transitions to MSI/MSI-X */
278+
char irq_name[]; /* xen-pcibk[000:04:00.0] */
279+
};
280+
--
281+
2.49.0
282+

0 commit comments

Comments
 (0)