Skip to content

Commit a3d309b

Browse files
Add patch for runtime PM and PMEs
1 parent ece7129 commit a3d309b

File tree

2 files changed

+278
-0
lines changed

2 files changed

+278
-0
lines changed

kernel.spec.in

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ Patch32: 0001-Revert-e1000e-change-k1-configuration-on-MTP-and-lat.patch
151151
Patch61: xen-events-Add-wakeup-support-to-xen-pirq.patch
152152
Patch62: xen-pm-use-suspend.patch
153153
Patch63: xen-pciback-pm-suspend.patch
154+
Patch64: xen-pciback-pm-runtime.patch
154155

155156
%description
156157
Qubes Dom0 kernel.

xen-pciback-pm-runtime.patch

Lines changed: 277 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,277 @@
1+
From baccaee09ccdc43f02a1ed551d6d4376fb33b9e4 Mon Sep 17 00:00:00 2001
2+
From: Vertex X7-53 <[email protected]>
3+
Date: Sun, 17 Aug 2025 01:34:00 +0100
4+
Subject: [PATCH] xen/pciback: Pass through PME events and allow guest control
5+
6+
An important part of S0ix power management is the control of PCI device D-states.
7+
Without both the device and any applicable PCI bridges in D3cold, the PMC will
8+
keep power applied to the bus, and in most cases this will prevent the CPU from reaching states lower than Package C2.
9+
10+
The vast majority of devices depend on PME (Power Management Events) to
11+
wake from D3cold, so Linux will not attempt to put them into deeper
12+
sleep states if it detects the device does not support PME.
13+
PMEs can be delivered in a variety of different ways, which include interrupts
14+
on the pcieport, ACPI events, and the setting of the PME status register in
15+
the PCI configuration space. Up until now, Xen has not supported the
16+
passthrough of PMEs to domains, and masks the relevant PME bits in the configuration space.
17+
18+
This first patch is a modification to the dom0 kernel, specifically pciback.
19+
We enable support for runtime PM in pciback, to allow the dom0 kernel
20+
to suspend upstream bridges. Then we allow domains to read PME capability registers.
21+
When dom0 receives a PME, it forwards this to pciback, and pciback then sets
22+
a special emulated flag on the device. This flag is cleared by the guest when it
23+
resets the register to 0, after handling the event. We also respond to requests
24+
from the guest to change the power state and place pciback in a PM state
25+
in dom0 depending on this, in order for dom0 to opportunistically suspend any upstream pciports.
26+
---
27+
.../xen/xen-pciback/conf_space_capability.c | 100 +++++++++++-------
28+
drivers/xen/xen-pciback/pci_stub.c | 30 ++++++
29+
drivers/xen/xen-pciback/pciback.h | 1 +
30+
3 files changed, 91 insertions(+), 40 deletions(-)
31+
32+
diff --git a/drivers/xen/xen-pciback/conf_space_capability.c b/drivers/xen/xen-pciback/conf_space_capability.c
33+
index cf568e899ee2..ca0eb6b81907 100644
34+
--- a/drivers/xen/xen-pciback/conf_space_capability.c
35+
+++ b/drivers/xen/xen-pciback/conf_space_capability.c
36+
@@ -8,8 +8,11 @@
37+
38+
#include <linux/kernel.h>
39+
#include <linux/pci.h>
40+
+#include <linux/pm.h>
41+
+#include <linux/pm_runtime.h>
42+
#include "pciback.h"
43+
#include "conf_space.h"
44+
+#include "../../pci/pci.h"
45+
46+
static LIST_HEAD(capabilities);
47+
struct xen_pcibk_config_capability {
48+
@@ -91,39 +94,84 @@ static const struct config_field caplist_vpd[] = {
49+
{}
50+
};
51+
52+
-static int pm_caps_read(struct pci_dev *dev, int offset, u16 *value,
53+
+static int pm_ctrl_read(struct pci_dev *dev, int offset, u16 *value,
54+
void *data)
55+
{
56+
int err;
57+
u16 real_value;
58+
59+
- err = pci_read_config_word(dev, offset, &real_value);
60+
- if (err)
61+
- goto out;
62+
+ /* Driver domains have no ability to wake devices from D3cold on their own, as they have no access to ACPI.
63+
+ * As a substitute, we fake D3hot to the guest so the register read succeeds. When the guest sends us a wakeup command,
64+
+ * we'll carry out the necessary steps to wake the device from D3cold using runtime PM functions.
65+
+ */
66+
+ if (dev->current_state == PCI_D3cold) {
67+
+ /* No soft reset needed by the guest, because the host side will perform one on transition out of D3cold. */
68+
+ real_value = PCI_D3hot | PCI_PM_CTRL_NO_SOFT_RESET;
69+
+ } else {
70+
+ err = pci_read_config_word(dev, offset, &real_value);
71+
+ if (err)
72+
+ goto out;
73+
+ }
74+
75+
- *value = real_value & ~PCI_PM_CAP_PME_MASK;
76+
+ const struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
77+
+ if (dev_data->pme_status)
78+
+ real_value |= (PCI_PM_CTRL_PME_STATUS | PCI_PM_CTRL_PME_ENABLE);
79+
+
80+
+ *value = real_value;
81+
82+
out:
83+
return err;
84+
}
85+
86+
-/* PM_OK_BITS specifies the bits that the driver domain is allowed to change.
87+
- * Can't allow driver domain to enable PMEs - they're shared */
88+
-#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_DATA_SEL_MASK)
89+
+/* PM_OK_BITS specifies the bits that the driver domain is allowed to change. */
90+
+#define PM_OK_BITS (PCI_PM_CTRL_PME_STATUS|PCI_PM_CTRL_PME_ENABLE|PCI_PM_CTRL_DATA_SEL_MASK)
91+
92+
static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
93+
void *data)
94+
{
95+
int err;
96+
+ int pm_err;
97+
u16 old_value;
98+
pci_power_t new_state;
99+
100+
+ /* PME status is RW1CS */
101+
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(dev);
102+
+ if (new_value & PCI_PM_CTRL_PME_STATUS) {
103+
+ dev_data->pme_status = 0;
104+
+ }
105+
+
106+
+ new_state = (__force pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
107+
+
108+
+ /* First, use pm ops to transition state */
109+
+ dev_dbg(&dev->dev, "transitioning power state from %x to %x\n", dev->current_state, new_state);
110+
+
111+
+ pm_runtime_barrier(&dev->dev);
112+
+ bool runtime_pm = pm_runtime_enabled(&dev->dev);
113+
+ if (runtime_pm) {
114+
+ if (dev->dev.power.runtime_status == RPM_SUSPENDED && new_state < PCI_D3hot) {
115+
+ pm_err = pm_runtime_resume(&dev->dev);
116+
+ if (pm_err) dev_err(&dev->dev, "failed to resume device: %d\n", pm_err);
117+
+ } else if (dev->dev.power.runtime_status == RPM_ACTIVE && new_state >= PCI_D3hot) {
118+
+ pm_err = pm_runtime_suspend(&dev->dev);
119+
+ if (pm_err) dev_err(&dev->dev, "failed to suspend device: %d\n", pm_err);
120+
+ }
121+
+ }
122+
+
123+
+ /* Otherwise, set it manually */
124+
+ if (!runtime_pm || dev->current_state != new_state) {
125+
+ err = pci_set_power_state(dev, new_state);
126+
+ if (err) {
127+
+ dev_err(&dev->dev, "failed to manually set pci power state to %x: %d\n", new_state, err);
128+
+ err = PCIBIOS_SET_FAILED;
129+
+ goto out;
130+
+ }
131+
+ }
132+
+
133+
+ /* This must happen here, after pm_runtime_resume is called */
134+
err = pci_read_config_word(dev, offset, &old_value);
135+
if (err)
136+
goto out;
137+
138+
- new_state = (__force pci_power_t)(new_value & PCI_PM_CTRL_STATE_MASK);
139+
-
140+
new_value &= PM_OK_BITS;
141+
if ((old_value & PM_OK_BITS) != new_value) {
142+
new_value = (old_value & ~PM_OK_BITS) | new_value;
143+
@@ -132,48 +180,20 @@ static int pm_ctrl_write(struct pci_dev *dev, int offset, u16 new_value,
144+
goto out;
145+
}
146+
147+
- /* Let pci core handle the power management change */
148+
- dev_dbg(&dev->dev, "set power state to %x\n", new_state);
149+
- err = pci_set_power_state(dev, new_state);
150+
- if (err) {
151+
- err = PCIBIOS_SET_FAILED;
152+
- goto out;
153+
- }
154+
-
155+
out:
156+
return err;
157+
}
158+
159+
-/* Ensure PMEs are disabled */
160+
-static void *pm_ctrl_init(struct pci_dev *dev, int offset)
161+
-{
162+
- int err;
163+
- u16 value;
164+
-
165+
- err = pci_read_config_word(dev, offset, &value);
166+
- if (err)
167+
- goto out;
168+
-
169+
- if (value & PCI_PM_CTRL_PME_ENABLE) {
170+
- value &= ~PCI_PM_CTRL_PME_ENABLE;
171+
- err = pci_write_config_word(dev, offset, value);
172+
- }
173+
-
174+
-out:
175+
- return err ? ERR_PTR(err) : NULL;
176+
-}
177+
-
178+
static const struct config_field caplist_pm[] = {
179+
{
180+
.offset = PCI_PM_PMC,
181+
.size = 2,
182+
- .u.w.read = pm_caps_read,
183+
+ .u.w.read = xen_pcibk_read_config_word,
184+
},
185+
{
186+
.offset = PCI_PM_CTRL,
187+
.size = 2,
188+
- .init = pm_ctrl_init,
189+
- .u.w.read = xen_pcibk_read_config_word,
190+
+ .u.w.read = pm_ctrl_read,
191+
.u.w.write = pm_ctrl_write,
192+
},
193+
{
194+
diff --git a/drivers/xen/xen-pciback/pci_stub.c b/drivers/xen/xen-pciback/pci_stub.c
195+
index 073b259747e9..461704454546 100644
196+
--- a/drivers/xen/xen-pciback/pci_stub.c
197+
+++ b/drivers/xen/xen-pciback/pci_stub.c
198+
@@ -18,6 +18,8 @@
199+
#include <linux/wait.h>
200+
#include <linux/sched.h>
201+
#include <linux/atomic.h>
202+
+#include <linux/pm.h>
203+
+#include <linux/pm_runtime.h>
204+
#include <xen/events.h>
205+
#include <xen/pci.h>
206+
#include <xen/xen.h>
207+
@@ -153,6 +155,7 @@ static void pcistub_device_release(struct kref *kref)
208+
209+
kfree(dev_data);
210+
pci_set_drvdata(dev, NULL);
211+
+ pm_runtime_get_noresume(&dev->dev);
212+
213+
/* Clean-up the device */
214+
xen_pcibk_config_free_dyn_fields(dev);
215+
@@ -494,6 +497,8 @@ static int pcistub_init_device(struct pcistub_device *psdev)
216+
xen_pcibk_reset_device(dev);
217+
218+
pci_set_dev_assigned(dev);
219+
+ pm_runtime_put_noidle(&dev->dev);
220+
+
221+
return 0;
222+
223+
config_release:
224+
@@ -1073,6 +1078,29 @@ static int xen_pcibk_suspend_noirq(struct device *dev) {
225+
return 0;
226+
}
227+
228+
+/* Since this is a virtual representation of the PM state, we only allow the device
229+
+ * to enter the "suspended" state after the guest commands the device into D3hot
230+
+*/
231+
+static int xen_pcibk_pm_idle(struct device *dev)
232+
+{
233+
+ struct pci_dev *pci_dev = to_pci_dev(dev);
234+
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(pci_dev);
235+
+ if (dev_data->pme_status) return -EBUSY;
236+
+
237+
+ return pci_dev->current_state >= PCI_D3hot ? 0 : -EBUSY;
238+
+}
239+
+
240+
+static int xen_pcibk_pm_resume(struct device *dev)
241+
+{
242+
+ /* PME bit is always asserted on wakeup, regardless of whether the device supports it or not
243+
+ * This is a non-issue, since guest kernel logic will just wake up the device if it isn't already awake */
244+
+ struct pci_dev *pci_dev = to_pci_dev(dev);
245+
+ struct xen_pcibk_dev_data *dev_data = pci_get_drvdata(pci_dev);
246+
+ dev_data->pme_status = 1;
247+
+
248+
+ return 0;
249+
+}
250+
+
251+
/*add xen_pcibk AER handling*/
252+
static const struct pci_error_handlers xen_pcibk_error_handler = {
253+
.error_detected = xen_pcibk_error_detected,
254+
@@ -1083,6 +1111,8 @@ static const struct pci_error_handlers xen_pcibk_error_handler = {
255+
256+
static const struct dev_pm_ops xen_pcibk_pm_ops = {
257+
.suspend_noirq = xen_pcibk_suspend_noirq,
258+
+ .runtime_idle = xen_pcibk_pm_idle,
259+
+ .runtime_resume = xen_pcibk_pm_resume,
260+
};
261+
262+
/*
263+
diff --git a/drivers/xen/xen-pciback/pciback.h b/drivers/xen/xen-pciback/pciback.h
264+
index cf6df6964664..52774972cc3c 100644
265+
--- a/drivers/xen/xen-pciback/pciback.h
266+
+++ b/drivers/xen/xen-pciback/pciback.h
267+
@@ -56,6 +56,7 @@ struct xen_pcibk_dev_data {
268+
unsigned int isr_on:1; /* Whether the IRQ handler is installed. */
269+
unsigned int ack_intr:1; /* .. and ACK-ing */
270+
unsigned long handled;
271+
+ unsigned int pme_status:1;
272+
unsigned int irq; /* Saved in case device transitions to MSI/MSI-X */
273+
char irq_name[]; /* xen-pcibk[000:04:00.0] */
274+
};
275+
--
276+
2.49.0
277+

0 commit comments

Comments
 (0)