Skip to content

Commit 800e543

Browse files
committed
power-policy-service: Clean up recovery logic and add more documentation
* Recovery flow now explicity sets recovery power instead of relying on `attempt_provider_recovery` * Remove `Result` return types from functions only using it when checking for invalid data in the devices list * Update documentation in docs and in the power-policy-service module
1 parent c2a40d3 commit 800e543

File tree

5 files changed

+117
-49
lines changed

5 files changed

+117
-49
lines changed

docs/power-policy.md

+4
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ stateDiagram-v2
2727
Provider --> Detached : NotifyDetach
2828
Idle --> Detached : NotifyDetach
2929
```
30+
31+
#### Device recovery
32+
Requests to a device can fail for any number of reasons (bus communication issues, deadlock in driver code, etc). In most cases this won't result in any issues as most failed requests result in the system being in a safe state. E.g. a failed transition from `Idle` to `ConnectedConsummer` will result in the system just not drawing power. However, a failed request in the `ConnectedProvider` state can leave the system providing more power than intended. The device state isn't enough to capture this situation as the device must be assumed to still be the in the `ConnectedProvider` state. The `Device` struct contains a `recovery` member to track this situation. The specifics of the recovery process are implementation defined.
33+
3034
### Policy Messages
3135
These messages are sent from a device to the power policy.
3236

embedded-service/src/power/policy/action/policy.rs

+1-5
Original file line numberDiff line numberDiff line change
@@ -108,11 +108,7 @@ impl<'a> Policy<'a, ConnectedProvider> {
108108
/// Disconnect this device
109109
pub async fn disconnect(self) -> Result<Policy<'a, Idle>, Error> {
110110
if let Err(e) = self.disconnect_internal().await {
111-
error!(
112-
"Error disconnecting device {}: {:?}, entering recovery mode",
113-
self.device.id().0,
114-
e
115-
);
111+
error!("Error disconnecting device {}: {:?}", self.device.id().0, e);
116112
self.device.enter_recovery().await;
117113
return Err(e);
118114
}

examples/std/src/bin/power_policy.rs

+30-6
Original file line numberDiff line numberDiff line change
@@ -171,16 +171,40 @@ async fn run(spawner: Spawner) {
171171
let device0 = device0.attach().await.unwrap();
172172
device0.request_provider_power_capability(LOW_POWER).await.unwrap();
173173
Timer::after_millis(250).await;
174-
// First rejected request is connect request
175-
// Second rejected request is disconnect request
176-
// Third rejected request is recovery disconnect
177-
info!("Rejecting next 3 requests");
178-
device0_mock.reject_next_requests(3);
174+
// Requests we're rejecting:
175+
// Connect request
176+
// Disconnect request after failing connect request
177+
// Request to connect at recovery limit
178+
// Disconnect request after failing recovery limit
179+
// First recovery disconnect from `attempt_provider_recovery`
180+
// Next recovery disconnect completes
181+
info!("Rejecting next 5 requests");
182+
device0_mock.reject_next_requests(5);
179183

180184
// Attach device 1, device 0 will fail provider connect request and trigger recovery flow
181185
let device1 = device1.attach().await.unwrap();
182186
device1.request_provider_power_capability(LOW_POWER).await.unwrap();
183-
Timer::after_millis(5000).await;
187+
188+
// Wait for the recovery flow to start
189+
while !device0_mock.device.is_in_recovery().await {
190+
info!("Waiting for recovery flow to start");
191+
Timer::after_millis(100).await;
192+
}
193+
194+
// Wait for the recovery flow to complete
195+
while device0_mock.device.is_in_recovery().await {
196+
info!("Waiting for recovery flow to complete");
197+
Timer::after_millis(100).await;
198+
}
199+
200+
// Reconnect device 0
201+
info!("Reconnecting device 0 as provider");
202+
device0.request_provider_power_capability(LOW_POWER).await.unwrap();
203+
// Wait for device 0 to reconnect
204+
while !device0_mock.device.is_provider().await {
205+
info!("Waiting for device 0 to reconnect");
206+
Timer::after_millis(1000).await;
207+
}
184208

185209
// Disconnect device 1
186210
info!("Disconnecting device 1");

power-policy-service/src/lib.rs

+4-3
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,7 @@ impl PowerPolicy {
6767
async fn process_notify_detach(&self) -> Result<(), Error> {
6868
self.context.send_response(Ok(policy::ResponseData::Complete)).await;
6969
self.update_current_consumer().await?;
70-
self.update_providers(None).await?;
70+
self.update_providers(None).await;
7171
Ok(())
7272
}
7373

@@ -79,14 +79,15 @@ impl PowerPolicy {
7979

8080
async fn process_request_provider_power_capabilities(&self, device: DeviceId) -> Result<(), Error> {
8181
self.context.send_response(Ok(policy::ResponseData::Complete)).await;
82-
self.update_providers(Some(device)).await?;
82+
self.update_providers(Some(device)).await;
8383
Ok(())
8484
}
8585

8686
async fn process_notify_disconnect(&self) -> Result<(), Error> {
8787
self.context.send_response(Ok(policy::ResponseData::Complete)).await;
8888
self.update_current_consumer().await?;
89-
self.update_providers(None).await
89+
self.update_providers(None).await;
90+
Ok(())
9091
}
9192

9293
/// Send a notification with the comms service

power-policy-service/src/provider.rs

+78-35
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,13 @@
33
//! the system is in unlimited power state. In this mode [provider_unlimited](super::Config::provider_unlimited)
44
//! is provided to each device. Above this threshold, the system is in limited power state.
55
//! In this mode [provider_limited](super::Config::provider_limited) is provided to each device
6-
use embedded_services::{debug, trace};
6+
//! Lastly, the system can be in recovery mode. This mode is only entered when a connected provider fails to
7+
//! connect at a new power level. In this mode, all connected providers are set to
8+
//! [provider_recovery](super::Config::provider_recovery). While in this mode
9+
//! [attempt_provider_recovery](PowerPolicy::attempt_provider_recovery) is called periodically
10+
//! which attempts to disconnect all providers in recovery mode. If this succeeds, the system will
11+
//! return to normal operating mode.
12+
use embedded_services::{debug, trace, warn};
713

814
use super::*;
915

@@ -29,16 +35,22 @@ pub(super) struct State {
2935

3036
impl PowerPolicy {
3137
/// Computes the total requested power considering all current providers
32-
async fn compute_total_provider_power(&self, new_request: bool) -> Result<PowerState, Error> {
38+
async fn compute_total_provider_power(&self, new_request: bool) -> PowerState {
3339
let mut num_providers = if new_request { 1 } else { 0 };
3440

3541
for device in self.context.devices().await {
36-
let device = device.data::<device::Device>().ok_or(Error::InvalidDevice)?;
42+
let device = device.data::<device::Device>();
43+
if device.is_none() {
44+
// A non-power device somehow got into the list of devices, note it and move on
45+
warn!("Found non-power device in devices list");
46+
continue;
47+
}
3748

49+
let device = device.unwrap();
3850
if device.is_in_recovery().await {
3951
// If any device is in recovery mode, we need to recover
4052
info!("Device {}: In recovery mode", device.id().0);
41-
return Ok(PowerState::Recovery);
53+
return PowerState::Recovery;
4254
}
4355

4456
if device.is_provider().await {
@@ -48,19 +60,25 @@ impl PowerPolicy {
4860

4961
let total_provided_power = num_providers * self.config.provider_unlimited.max_power_mw();
5062
if total_provided_power > self.config.limited_power_threshold_mw {
51-
Ok(PowerState::Limited)
63+
PowerState::Limited
5264
} else {
53-
Ok(PowerState::Unlimited)
65+
PowerState::Unlimited
5466
}
5567
}
5668

5769
/// Update the power capability of all connected providers
5870
/// Returns true if we need to enter recovery mode
59-
async fn update_provider_capability(&self, target_power: PowerCapability) -> Result<bool, Error> {
71+
async fn update_provider_capability(&self, target_power: PowerCapability, exit_on_recovery: bool) -> bool {
6072
let mut recovery = false;
6173
for device in self.context.devices().await {
62-
let device = device.data::<device::Device>().ok_or(Error::InvalidDevice)?;
74+
let device = device.data::<device::Device>();
75+
if device.is_none() {
76+
// A non-power device somehow got into the list of devices, note it and move on
77+
warn!("Found non-power device in devices list");
78+
continue;
79+
}
6380

81+
let device = device.unwrap();
6482
if let Ok(action) = self
6583
.context
6684
.try_policy_action::<action::ConnectedProvider>(device.id())
@@ -76,27 +94,34 @@ impl PowerPolicy {
7694
);
7795

7896
if let Err(_) = action.disconnect().await {
79-
error!(
80-
"Device{}: Failed to disconnect provider, entering recovery mode",
81-
device.id().0
82-
);
97+
error!("Device{}: Failed to disconnect provider", device.id().0);
98+
99+
// Early exit if that's what we want
100+
// This is used to avoid excessively switching power capabilities in the recovery flow
101+
if exit_on_recovery {
102+
return true;
103+
}
104+
83105
recovery = true;
84106
}
85107
}
86108
}
87109
}
88110
}
89-
Ok(recovery)
111+
112+
return recovery;
90113
}
91114

92115
/// Update the provider state of currently connected providers
93-
pub(super) async fn update_providers(&self, new_provider: Option<DeviceId>) -> Result<(), Error> {
116+
pub(super) async fn update_providers(&self, new_provider: Option<DeviceId>) {
94117
trace!("Updating providers");
95118
let mut state = self.state.lock().await;
119+
let mut already_in_recovery = true;
96120

97121
if state.current_provider_state.state != PowerState::Recovery {
98122
// Only update the power state if we're not in recovery mode
99-
state.current_provider_state.state = self.compute_total_provider_power(new_provider.is_some()).await?;
123+
already_in_recovery = false;
124+
state.current_provider_state.state = self.compute_total_provider_power(new_provider.is_some()).await;
100125
}
101126
debug!("New power state: {:?}", state.current_provider_state.state);
102127

@@ -106,24 +131,39 @@ impl PowerPolicy {
106131
PowerState::Limited => self.config.provider_limited,
107132
};
108133

109-
let recovery = self.update_provider_capability(target_power).await?;
134+
let recovery = self.update_provider_capability(target_power, true).await;
110135
if let Some(new_provider) = new_provider {
111136
info!("Connecting new provider");
112-
if let Ok(action) = self.context.try_policy_action::<action::Idle>(new_provider).await {
113-
action.connect_provider(target_power).await?;
137+
let connected = if let Ok(action) = self.context.try_policy_action::<action::Idle>(new_provider).await {
138+
let target_power = if recovery {
139+
// We entered recovery mode so attempt to connect at the recovery power
140+
self.config.provider_recovery
141+
} else {
142+
target_power
143+
};
144+
action.connect_provider(target_power).await.is_ok()
114145
} else {
115-
// Don't enter recovery mode if we can't connect to the new provider
116-
// Since it's a new provider that hasn't been connected then it's
117-
// not drawing power as far as we're concerned
146+
false
147+
};
148+
149+
// Don't enter recovery mode if we can't connect the new provider.
150+
// Since it's a new provider that hasn't been connected then it's
151+
// not drawing power as far as we're concerned
152+
if !connected {
118153
error!("Device {}: Failed to connect provider", new_provider.0);
119154
}
120155
}
121156

122-
if recovery {
157+
if recovery && !already_in_recovery {
158+
// Entering recovery, set power capability on all responding providers to recovery limit
159+
// Don't check return value of update_provider_capability here, if we've spontaneously recovered
160+
// then it'll get caught by the next call of attempt_provider_recovery
161+
info!("Entering recovery mode");
162+
let _ = self
163+
.update_provider_capability(self.config.provider_recovery, false)
164+
.await;
123165
state.current_provider_state.state = PowerState::Recovery;
124166
}
125-
126-
Ok(())
127167
}
128168

129169
pub(super) async fn attempt_provider_recovery(&self) {
@@ -134,10 +174,13 @@ impl PowerPolicy {
134174
}
135175

136176
info!("Attempting provider recovery");
177+
let mut recovered = true;
137178
// Attempt to by disconnecting all providers in recovery
138179
for device in self.context.devices().await {
139180
let device = device.data::<device::Device>().ok_or(Error::InvalidDevice);
140181
if device.is_err() {
182+
// A non-power device somehow got into the list of devices, note it and move on
183+
warn!("Found non-power device in devices list");
141184
continue;
142185
}
143186

@@ -150,25 +193,25 @@ impl PowerPolicy {
150193
{
151194
if let Err(_) = action.disconnect().await {
152195
error!("Device {}: Failed to recover", device.id().0);
196+
recovered = false;
153197
}
154198
}
155199
}
156200
}
157201

158-
// Attempt to restart in the limited power state
159-
self.state.lock().await.current_provider_state.state = PowerState::Limited;
160-
if self.update_providers(None).await.is_err() {
161-
// Failed to update providers, stay in recovery mode
162-
info!("Failed to update providers, staying in recovery mode");
202+
if !recovered {
203+
info!("Failed to recover all providers, staying in recovery mode");
163204
return;
164205
}
165206

166-
if self.state.lock().await.current_provider_state.state != PowerState::Recovery {
167-
// Successfully recovered
168-
info!("Successfully recovered from provider recovery mode");
169-
} else {
170-
// Still in recovery mode
171-
info!("Still in provider recovery mode");
207+
// Attempt to restart in the unlimited power state
208+
self.state.lock().await.current_provider_state.state = PowerState::Unlimited;
209+
self.update_providers(None).await;
210+
if self.state.lock().await.current_provider_state.state == PowerState::Recovery {
211+
info!("Failed to update providers, staying in recovery mode");
212+
return;
172213
}
214+
215+
info!("Successfully recovered from provider recovery mode");
173216
}
174217
}

0 commit comments

Comments
 (0)