diff --git a/src/Daqifi.Core.Tests/Firmware/FirmwareUpdateServiceTests.cs b/src/Daqifi.Core.Tests/Firmware/FirmwareUpdateServiceTests.cs index d86346b..1e1953a 100644 --- a/src/Daqifi.Core.Tests/Firmware/FirmwareUpdateServiceTests.cs +++ b/src/Daqifi.Core.Tests/Firmware/FirmwareUpdateServiceTests.cs @@ -491,6 +491,308 @@ public async Task UpdateWifiModuleAsync_WhenDeviceDoesNotSupportLanQuery_Proceed Assert.Contains("SYSTem:COMMUnicate:LAN:FWUpdate", device.SentCommands); } + [Fact] + public async Task UpdateFirmwareAsync_PostReconnectReadinessProbe_AwaitedBeforeComplete() + { + // Closes #145: serial reconnect succeeds before the application + // firmware is ready to answer protobuf status queries. The + // optional readiness probe gives Core a way to wait for true + // application readiness — when set, no caller needs to + // reimplement that retry loop. + // + // The probe here returns false on the first 2 attempts and true + // on the 3rd, simulating a slow PIC32 application boot. Without + // the wait, the update would Complete immediately after serial + // reopens; with the wait, Complete is held back until the probe + // succeeds. + var device = new FakeStreamingDevice("COM3"); + var hidTransport = new FakeHidTransport(); + hidTransport.EnqueueRead([0x01, 0x10]); + hidTransport.EnqueueRead([0x01, 0x02]); + hidTransport.EnqueueRead([0x01, 0x03]); + hidTransport.EnqueueRead([0x01, 0x03]); + hidTransport.EnqueueRead([0x01, 0x10]); + + var enumerator = new FakeHidDeviceEnumerator([ + Array.Empty(), + [new HidDeviceInfo(0x04D8, 0x003C, "path-1", "SN-1", "DAQiFi Bootloader")] + ]); + + var probeCallCount = 0; + var options = CreateFastOptions(); + options.PostReconnectReadinessProbe = (_, _) => + { + probeCallCount++; + return Task.FromResult(probeCallCount >= 3); + }; + // Readiness budget must be strictly less than JumpingToApplicationTimeout + // (CreateFastOptions uses 2s) — the probe succeeds within 50ms anyway. + options.PostReconnectReadinessTimeout = TimeSpan.FromSeconds(1); + options.PostReconnectReadinessRetryDelay = TimeSpan.FromMilliseconds(10); + + var service = new FirmwareUpdateService( + hidTransport, + new FakeFirmwareDownloadService(), + new FakeExternalProcessRunner(), + NullLogger.Instance, + new FakeBootloaderProtocol([[0xA1, 0x01], [0xA1, 0x02]]), + enumerator, + options); + + var hexPath = CreateTempFile(); + try + { + await service.UpdateFirmwareAsync(device, hexPath); + } + finally + { + File.Delete(hexPath); + } + + Assert.Equal(FirmwareUpdateState.Complete, service.CurrentState); + Assert.Equal(3, probeCallCount); + } + + [Fact] + public async Task UpdateFirmwareAsync_PostReconnectReadinessProbe_TimeoutFailsTheUpdate() + { + // When the application never becomes ready within the configured + // budget, the update must transition to Failed with a clear + // timeout message — NOT silently complete and hand back a + // half-ready device. That's the entire reason for #145. + var device = new FakeStreamingDevice("COM3"); + var hidTransport = new FakeHidTransport(); + hidTransport.EnqueueRead([0x01, 0x10]); + hidTransport.EnqueueRead([0x01, 0x02]); + hidTransport.EnqueueRead([0x01, 0x03]); + hidTransport.EnqueueRead([0x01, 0x03]); + hidTransport.EnqueueRead([0x01, 0x10]); + + var enumerator = new FakeHidDeviceEnumerator([ + Array.Empty(), + [new HidDeviceInfo(0x04D8, 0x003C, "path-1", "SN-1", "DAQiFi Bootloader")] + ]); + + var options = CreateFastOptions(); + // Probe always returns false → never ready → must time out + options.PostReconnectReadinessProbe = (_, _) => Task.FromResult(false); + options.PostReconnectReadinessTimeout = TimeSpan.FromMilliseconds(150); + options.PostReconnectReadinessRetryDelay = TimeSpan.FromMilliseconds(10); + + var service = new FirmwareUpdateService( + hidTransport, + new FakeFirmwareDownloadService(), + new FakeExternalProcessRunner(), + NullLogger.Instance, + new FakeBootloaderProtocol([[0xA1, 0x01], [0xA1, 0x02]]), + enumerator, + options); + + var hexPath = CreateTempFile(); + FirmwareUpdateException ex; + try + { + ex = await Assert.ThrowsAsync( + () => service.UpdateFirmwareAsync(device, hexPath)); + } + finally + { + File.Delete(hexPath); + } + + Assert.Equal(FirmwareUpdateState.Failed, service.CurrentState); + Assert.Equal(FirmwareUpdateState.JumpingToApp, ex.FailedState); + var inner = Assert.IsType(ex.InnerException); + Assert.Contains("application-ready", inner.Message); + } + + [Fact] + public async Task UpdateFirmwareAsync_PostReconnectProbeThrowsOwnOCE_RetriesAndCompletes() + { + // The probe may legitimately throw OperationCanceledException on its + // own (e.g. its internal CTS expired) without our timeoutCts firing. + // That must NOT crash the update loop — it should be treated as a + // probe failure and retried, just like any other thrown exception. + var device = new FakeStreamingDevice("COM3"); + var hidTransport = new FakeHidTransport(); + hidTransport.EnqueueRead([0x01, 0x10]); + hidTransport.EnqueueRead([0x01, 0x02]); + hidTransport.EnqueueRead([0x01, 0x03]); + hidTransport.EnqueueRead([0x01, 0x03]); + hidTransport.EnqueueRead([0x01, 0x10]); + + var enumerator = new FakeHidDeviceEnumerator([ + Array.Empty(), + [new HidDeviceInfo(0x04D8, 0x003C, "path-1", "SN-1", "DAQiFi Bootloader")] + ]); + + var probeCallCount = 0; + var options = CreateFastOptions(); + options.PostReconnectReadinessProbe = (_, _) => + { + probeCallCount++; + // First two attempts: probe throws its own OCE (e.g. internal CTS). + // Third attempt: probe returns ready. + return probeCallCount < 3 + ? Task.FromException(new OperationCanceledException("probe-internal-cancel")) + : Task.FromResult(true); + }; + options.PostReconnectReadinessTimeout = TimeSpan.FromSeconds(1); + options.PostReconnectReadinessRetryDelay = TimeSpan.FromMilliseconds(10); + + var service = new FirmwareUpdateService( + hidTransport, + new FakeFirmwareDownloadService(), + new FakeExternalProcessRunner(), + NullLogger.Instance, + new FakeBootloaderProtocol([[0xA1, 0x01], [0xA1, 0x02]]), + enumerator, + options); + + var hexPath = CreateTempFile(); + try + { + await service.UpdateFirmwareAsync(device, hexPath); + } + finally + { + File.Delete(hexPath); + } + + Assert.Equal(FirmwareUpdateState.Complete, service.CurrentState); + Assert.Equal(3, probeCallCount); + } + + [Fact] + public async Task UpdateFirmwareAsync_PostReconnectProbeThrowsThenFalseThenTimeout_DoesNotAttachStaleInner() + { + // The probe throws once, then returns false until the readiness budget + // expires. The TimeoutException must NOT carry the stale exception + // from attempt 1 as InnerException — a successful probe call (true OR + // false) should clear the captured exception. + var device = new FakeStreamingDevice("COM3"); + var hidTransport = new FakeHidTransport(); + hidTransport.EnqueueRead([0x01, 0x10]); + hidTransport.EnqueueRead([0x01, 0x02]); + hidTransport.EnqueueRead([0x01, 0x03]); + hidTransport.EnqueueRead([0x01, 0x03]); + hidTransport.EnqueueRead([0x01, 0x10]); + + var enumerator = new FakeHidDeviceEnumerator([ + Array.Empty(), + [new HidDeviceInfo(0x04D8, 0x003C, "path-1", "SN-1", "DAQiFi Bootloader")] + ]); + + var probeCallCount = 0; + var options = CreateFastOptions(); + options.PostReconnectReadinessProbe = (_, _) => + { + probeCallCount++; + // Attempt 1: throw. Subsequent attempts: return false until timeout. + return probeCallCount == 1 + ? Task.FromException(new InvalidOperationException("transient-probe-error")) + : Task.FromResult(false); + }; + options.PostReconnectReadinessTimeout = TimeSpan.FromMilliseconds(150); + options.PostReconnectReadinessRetryDelay = TimeSpan.FromMilliseconds(10); + + var service = new FirmwareUpdateService( + hidTransport, + new FakeFirmwareDownloadService(), + new FakeExternalProcessRunner(), + NullLogger.Instance, + new FakeBootloaderProtocol([[0xA1, 0x01], [0xA1, 0x02]]), + enumerator, + options); + + var hexPath = CreateTempFile(); + FirmwareUpdateException ex; + try + { + ex = await Assert.ThrowsAsync( + () => service.UpdateFirmwareAsync(device, hexPath)); + } + finally + { + File.Delete(hexPath); + } + + var inner = Assert.IsType(ex.InnerException); + // Critical: the InvalidOperationException from attempt 1 must NOT + // be attached — the successful (false) probe in attempt 2+ cleared + // lastProbeException. InnerException.InnerException should be null. + Assert.Null(inner.InnerException); + } + + [Fact] + public async Task UpdateFirmwareAsync_NoPostReconnectProbe_PreservesLegacyBehavior() + { + // No probe configured == legacy behavior: Complete fires as + // soon as serial reopens. Belt-and-suspenders test that the + // new code path is fully opt-in. + var device = new FakeStreamingDevice("COM3"); + var hidTransport = new FakeHidTransport(); + hidTransport.EnqueueRead([0x01, 0x10]); + hidTransport.EnqueueRead([0x01, 0x02]); + hidTransport.EnqueueRead([0x01, 0x03]); + hidTransport.EnqueueRead([0x01, 0x03]); + hidTransport.EnqueueRead([0x01, 0x10]); + + var enumerator = new FakeHidDeviceEnumerator([ + Array.Empty(), + [new HidDeviceInfo(0x04D8, 0x003C, "path-1", "SN-1", "DAQiFi Bootloader")] + ]); + + var options = CreateFastOptions(); + // No PostReconnectReadinessProbe set - legacy path + + var service = new FirmwareUpdateService( + hidTransport, + new FakeFirmwareDownloadService(), + new FakeExternalProcessRunner(), + NullLogger.Instance, + new FakeBootloaderProtocol([[0xA1, 0x01], [0xA1, 0x02]]), + enumerator, + options); + + var hexPath = CreateTempFile(); + try + { + await service.UpdateFirmwareAsync(device, hexPath); + } + finally + { + File.Delete(hexPath); + } + + Assert.Equal(FirmwareUpdateState.Complete, service.CurrentState); + } + + [Fact] + public void Constructor_ProbeWithReadinessTimeoutGteJumpToApp_Throws() + { + // The whole JumpToApp step is bounded by JumpingToApplicationTimeout + // via the outer state-timeout. If the readiness budget meets or + // exceeds it, the outer timeout fires first and surfaces a generic + // JumpingToApp error instead of the readiness-specific one. Validate() + // must reject the misconfiguration up front. + var options = CreateFastOptions(); + options.PostReconnectReadinessProbe = (_, _) => Task.FromResult(true); + options.JumpingToApplicationTimeout = TimeSpan.FromSeconds(2); + options.PostReconnectReadinessTimeout = TimeSpan.FromSeconds(2); + + var ex = Assert.Throws(() => new FirmwareUpdateService( + new FakeHidTransport(), + new FakeFirmwareDownloadService(), + new FakeExternalProcessRunner(), + NullLogger.Instance, + new FakeBootloaderProtocol([[0xA1, 0x01]]), + new FakeHidDeviceEnumerator([Array.Empty()]), + options)); + + Assert.Equal(nameof(FirmwareUpdateServiceOptions.PostReconnectReadinessTimeout), ex.ParamName); + } + private static FirmwareUpdateServiceOptions CreateFastOptions() { return new FirmwareUpdateServiceOptions diff --git a/src/Daqifi.Core/Firmware/FirmwareUpdateService.cs b/src/Daqifi.Core/Firmware/FirmwareUpdateService.cs index 7118aa5..0787ebd 100644 --- a/src/Daqifi.Core/Firmware/FirmwareUpdateService.cs +++ b/src/Daqifi.Core/Firmware/FirmwareUpdateService.cs @@ -806,6 +806,142 @@ await _hidTransport await SafeDisconnectHidAsync().ConfigureAwait(false); await WaitForSerialReconnectAsync(device, cancellationToken).ConfigureAwait(false); + + // Application-readiness probe (closes #145). Serial transport + // re-enumeration succeeds well before the PIC32 application + // firmware is actually ready to answer protobuf status queries; + // if a downstream flow (LAN chip info, WiFi prep) starts before + // the app is up, those queries fail and callers reimplement + // their own retry. The probe is opt-in via options — when null, + // the legacy "serial reopened == done" semantics apply. + if (_options.PostReconnectReadinessProbe is { } probe) + { + await WaitForApplicationReadyAsync(device, probe, cancellationToken).ConfigureAwait(false); + } + } + + private async Task WaitForApplicationReadyAsync( + IStreamingDevice device, + Func> probe, + CancellationToken cancellationToken) + { + var totalTimeout = _options.PostReconnectReadinessTimeout; + var retryDelay = _options.PostReconnectReadinessRetryDelay; + + // Surface the wait at Information level so observers tailing the + // log can distinguish "stuck" from "deliberately polling". The + // wait can take up to PostReconnectReadinessTimeout (default 30s); + // without this, the JumpingToApp state appears hung beyond the + // initial transport reopen. + _logger.LogInformation( + "Waiting up to {Timeout} for device to become application-ready (post-reconnect readiness probe).", + totalTimeout); + var waitStart = DateTime.UtcNow; + + using var timeoutCts = new CancellationTokenSource(totalTimeout); + using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource( + cancellationToken, timeoutCts.Token); + var linkedToken = linkedCts.Token; + + // Capture the most recent probe-thrown exception so a TimeoutException + // can carry the underlying cause as InnerException. Without this, + // deterministic probe failures (e.g. transport says it's open but + // the device never responds to status queries) report only as + // "timed out" — losing the actual error context unless Debug logs + // are on. + Exception? lastProbeException = null; + + // Tracks how many probe invocations have actually run. Distinct from + // the loop iteration counter so the timeout messages don't claim + // "attempt N" when the timeout fired before a probe ever executed. + var probesExecuted = 0; + while (true) + { + try + { + linkedToken.ThrowIfCancellationRequested(); + } + catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested && !cancellationToken.IsCancellationRequested) + { + throw new TimeoutException( + $"Device did not become application-ready within {totalTimeout} (probes executed: {probesExecuted}). " + + "The transport reconnected but the readiness probe never returned true; the device may still be initializing or the firmware may have failed to start.", + lastProbeException); + } + + try + { + probesExecuted++; + // WaitAsync(linkedToken) enforces the timeout deadline even + // when the probe ignores its own CancellationToken and would + // otherwise hang or return after the budget elapses. When + // the deadline fires, WaitAsync throws OperationCanceledException + // immediately — we don't keep waiting for the rogue probe. + var isReady = await probe(device, linkedToken) + .WaitAsync(linkedToken) + .ConfigureAwait(false); + + // Successful probe invocation (true OR false) means the most + // recent attempt completed normally. Clear lastProbeException + // so a later timeout doesn't carry forward a stale exception + // from an earlier failed attempt as its InnerException. + lastProbeException = null; + + if (isReady) + { + var elapsed = DateTime.UtcNow - waitStart; + _logger.LogInformation( + "Device became application-ready after {Elapsed} on probe attempt {Attempt}.", + elapsed, + probesExecuted); + return; + } + _logger.LogDebug("Application-ready probe returned false on attempt {Attempt}; will retry.", probesExecuted); + } + catch (OperationCanceledException ex) when (!cancellationToken.IsCancellationRequested) + { + // Two cases reach here: + // 1. The wait deadline fired (timeoutCts canceled) — surface + // as TimeoutException so callers see the readiness budget. + // 2. The probe itself threw OperationCanceledException for + // some unrelated reason (its own internal CTS, etc). That + // must NOT crash the update loop — treat it as a probe + // failure and retry, same as any other thrown exception. + if (timeoutCts.IsCancellationRequested) + { + throw new TimeoutException( + $"Device did not become application-ready within {totalTimeout} (probes executed: {probesExecuted}). " + + "The wait for the readiness probe was canceled by the timeout — note the probe may ignore cancellation and continue running in the background.", + lastProbeException ?? ex); + } + + lastProbeException = ex; + _logger.LogDebug( + ex, + "Application-ready probe was canceled on attempt {Attempt}; treating as not-ready and retrying.", + probesExecuted); + } + catch (Exception ex) when (ex is not OperationCanceledException) + { + lastProbeException = ex; + _logger.LogDebug( + ex, + "Application-ready probe threw on attempt {Attempt}; treating as not-ready and retrying.", + probesExecuted); + } + + try + { + await Task.Delay(retryDelay, linkedToken).ConfigureAwait(false); + } + catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested && !cancellationToken.IsCancellationRequested) + { + throw new TimeoutException( + $"Device did not become application-ready within {totalTimeout} (probes executed: {probesExecuted}). " + + "The transport reconnected but the readiness probe never returned true; the device may still be initializing or the firmware may have failed to start.", + lastProbeException); + } + } } private async Task WaitForSerialReconnectAsync( diff --git a/src/Daqifi.Core/Firmware/FirmwareUpdateServiceOptions.cs b/src/Daqifi.Core/Firmware/FirmwareUpdateServiceOptions.cs index 4b75984..fd79e49 100644 --- a/src/Daqifi.Core/Firmware/FirmwareUpdateServiceOptions.cs +++ b/src/Daqifi.Core/Firmware/FirmwareUpdateServiceOptions.cs @@ -1,3 +1,5 @@ +using Daqifi.Core.Device; + namespace Daqifi.Core.Firmware; /// @@ -119,6 +121,43 @@ public sealed class FirmwareUpdateServiceOptions /// public string? WifiPortOverride { get; set; } + /// + /// Optional callback that returns true once the device is ready to + /// answer normal application commands after PIC32 firmware update + + /// reconnect (closes #145). The serial transport can re-enumerate + /// well before the application firmware is actually ready to respond + /// to protobuf status queries — without this probe, the next steps + /// in a downstream flow (LAN chip-info, WiFi prep) hit a half-started + /// device and have to retry. When set, the firmware service polls + /// the probe with bounded retry after each PIC32 reconnect; if it + /// never returns true within , + /// the update transitions to Failed with a clear timeout exception + /// rather than silently handing back a half-ready device. When null, + /// reconnect succeeds as soon as the serial port reopens (legacy + /// behavior). + /// + public Func>? PostReconnectReadinessProbe { get; set; } + + /// + /// Wall-clock budget for the post-reconnect readiness probe. Default + /// 30s covers a slow PIC32 boot and downstream-firmware initialization. + /// + /// + /// This budget runs INSIDE the JumpingToApp state, so the effective + /// upper bound is JumpingToApplicationTimeout - (serial reconnect + /// elapsed). With the defaults (45s state timeout, ~1-5s typical + /// reconnect, 30s readiness budget) the readiness probe gets its full + /// budget; if you raise this near or above the state timeout, the + /// outer state-timeout will fire first and callers will see a generic + /// JumpingToApp timeout instead of the readiness-specific message. + /// + public TimeSpan PostReconnectReadinessTimeout { get; set; } = TimeSpan.FromSeconds(30); + + /// + /// Delay between readiness-probe attempts (cancellation-aware). + /// + public TimeSpan PostReconnectReadinessRetryDelay { get; set; } = TimeSpan.FromMilliseconds(500); + /// /// Gets the configured timeout for a given firmware update state. /// @@ -160,6 +199,30 @@ public void Validate() ValidatePositive(FlashWriteRetryDelay, nameof(FlashWriteRetryDelay)); ValidatePositive(WifiProcessTimeout, nameof(WifiProcessTimeout)); + // The readiness options only matter when a probe is configured — + // gate every readiness validation behind that, both the positive + // checks and the cross-property constraint, so callers that don't + // use the probe never have to think about these timeouts. + if (PostReconnectReadinessProbe is not null) + { + ValidatePositive(PostReconnectReadinessTimeout, nameof(PostReconnectReadinessTimeout)); + ValidatePositive(PostReconnectReadinessRetryDelay, nameof(PostReconnectReadinessRetryDelay)); + + // The whole JumpToApp step is wrapped by JumpingToApplicationTimeout, + // so a probe budget that meets or exceeds it would always be cut off + // by the outer state-timeout — surfacing a generic JumpingToApp error + // instead of the readiness-specific message. Reject the configuration + // up front so misconfigurations fail fast. + if (PostReconnectReadinessTimeout >= JumpingToApplicationTimeout) + { + throw new ArgumentOutOfRangeException( + nameof(PostReconnectReadinessTimeout), + PostReconnectReadinessTimeout, + $"{nameof(PostReconnectReadinessTimeout)} must be strictly less than {nameof(JumpingToApplicationTimeout)} when {nameof(PostReconnectReadinessProbe)} is set, " + + "otherwise the outer state-timeout fires first and masks the readiness-specific timeout."); + } + } + if (HidConnectRetryCount < 1) { throw new ArgumentOutOfRangeException(