diff --git a/src/Daqifi.Core.Tests/Firmware/FirmwareUpdateServiceTests.cs b/src/Daqifi.Core.Tests/Firmware/FirmwareUpdateServiceTests.cs index 549a52e..3eb0948 100644 --- a/src/Daqifi.Core.Tests/Firmware/FirmwareUpdateServiceTests.cs +++ b/src/Daqifi.Core.Tests/Firmware/FirmwareUpdateServiceTests.cs @@ -730,6 +730,197 @@ public async Task CheckWifiFirmwareStatusAsync_ReentrantCallFromUpdateProgressCa Assert.Equal(FirmwareUpdateState.Complete, service.CurrentState); } + [Fact] + public async Task CheckWifiFirmwareStatusAsync_WhenLanChipInfoFailsTransiently_RetriesUntilSuccess() + { + // Closes #144: post-PIC32-reboot the WiFi subsystem can lag the + // application by a few seconds, so the first chip-info query + // transiently fails. Without retry, the WiFi version decision + // would short-circuit and trigger an unnecessary multi-minute + // reflash. The retry budget covers the startup window. + var wifiRelease = new FirmwareReleaseInfo + { + Version = new FirmwareVersion(19, 5, 4, null, 0), + TagName = "19.5.4", + IsPreRelease = false + }; + var device = new FakeLanChipInfoStreamingDevice( + "COM14", + chipInfo: new LanChipInfo + { + ChipId = 1234, + FwVersion = "19.5.4", + BuildDate = "Jan 8 2019" + }, + transientFailuresBeforeSuccess: 2); + + var options = CreateFastOptions(); + options.LanChipInfoMaxAttempts = 3; + options.LanChipInfoRetryDelay = TimeSpan.FromMilliseconds(5); // Keep test fast + + var service = new FirmwareUpdateService( + new FakeHidTransport(), + new FakeFirmwareDownloadService { LatestWifiRelease = wifiRelease }, + new FakeExternalProcessRunner(), + NullLogger.Instance, + new FakeBootloaderProtocol([[0x10]]), + new FakeHidDeviceEnumerator([]), + options); + + var status = await service.CheckWifiFirmwareStatusAsync(device); + + Assert.Equal(3, device.GetLanChipInfoCallCount); + Assert.True(status.IsUpToDate); + Assert.Equal(WifiFirmwareStatusReason.UpToDate, status.Reason); + } + + [Fact] + public async Task CheckWifiFirmwareStatusAsync_WhenLanChipInfoFailsAllAttempts_ReturnsChipInfoUnavailable() + { + // After exhausting LanChipInfoMaxAttempts the planning method must + // fall through to ChipInfoUnavailable, NOT hang or surface the + // raw exception. This preserves the "couldn't check, default to + // running update" semantics for genuinely-broken devices. + var device = new FakeLanChipInfoStreamingDevice( + "COM15", + chipInfo: new LanChipInfo + { + ChipId = 1234, + FwVersion = "19.5.4", + BuildDate = "Jan 8 2019" + }, + transientFailuresBeforeSuccess: 99); + + var options = CreateFastOptions(); + options.LanChipInfoMaxAttempts = 3; + options.LanChipInfoRetryDelay = TimeSpan.FromMilliseconds(5); + + var service = new FirmwareUpdateService( + new FakeHidTransport(), + new FakeFirmwareDownloadService(), + new FakeExternalProcessRunner(), + NullLogger.Instance, + new FakeBootloaderProtocol([[0x10]]), + new FakeHidDeviceEnumerator([]), + options); + + var status = await service.CheckWifiFirmwareStatusAsync(device); + + Assert.Equal(3, device.GetLanChipInfoCallCount); + Assert.False(status.IsUpToDate); + Assert.Equal(WifiFirmwareStatusReason.ChipInfoUnavailable, status.Reason); + } + + [Fact] + public async Task CheckWifiFirmwareStatusAsync_TotalTimeoutHit_ShortCircuitsToChipInfoUnavailable() + { + // Closes a Qodo follow-up on PR #199: per-attempt query timeouts + // compound with retry delays, so a high MaxAttempts × non-trivial + // per-attempt latency could block far beyond the configured retry + // budget while holding _operationLock. The total-timeout cap caps + // wall-clock time independent of attempt counts. + // + // The fake's per-attempt latency is the Task.Delay below; with + // 200ms latency × 10 max attempts × 100ms retry delay, a naive + // implementation would block ~2.9s. The 100ms total timeout + // forces an early ChipInfoUnavailable return after the first + // attempt's latency exceeds the budget. + var device = new SlowFakeLanChipInfoStreamingDevice( + "COM17", + attemptLatency: TimeSpan.FromMilliseconds(200)); + + var options = CreateFastOptions(); + options.LanChipInfoMaxAttempts = 10; + options.LanChipInfoRetryDelay = TimeSpan.FromMilliseconds(100); + options.LanChipInfoTotalTimeout = TimeSpan.FromMilliseconds(100); + + var service = new FirmwareUpdateService( + new FakeHidTransport(), + new FakeFirmwareDownloadService(), + new FakeExternalProcessRunner(), + NullLogger.Instance, + new FakeBootloaderProtocol([[0x10]]), + new FakeHidDeviceEnumerator([]), + options); + + var stopwatch = System.Diagnostics.Stopwatch.StartNew(); + var status = await service.CheckWifiFirmwareStatusAsync(device); + stopwatch.Stop(); + + Assert.False(status.IsUpToDate); + Assert.Equal(WifiFirmwareStatusReason.ChipInfoUnavailable, status.Reason); + // Should bail well before attempting all 10 × (200ms + 100ms) = 3s. + // Allowing 1500ms for CI variance / xunit overhead. + Assert.True(stopwatch.Elapsed < TimeSpan.FromMilliseconds(1500), + $"Probe took {stopwatch.ElapsedMilliseconds}ms, expected <1500ms with TotalTimeout=100ms."); + } + + private sealed class SlowFakeLanChipInfoStreamingDevice : IStreamingDevice, ILanChipInfoProvider + { + private readonly TimeSpan _attemptLatency; + public SlowFakeLanChipInfoStreamingDevice(string name, TimeSpan attemptLatency) + { + Name = name; + _attemptLatency = attemptLatency; + IsConnected = true; + } + public string Name { get; } + public IPAddress? IpAddress => null; + public bool IsConnected { get; private set; } + public ConnectionStatus Status => ConnectionStatus.Connected; + public int StreamingFrequency { get; set; } + public bool IsStreaming { get; private set; } + public event EventHandler? StatusChanged { add { } remove { } } + public event EventHandler? MessageReceived { add { } remove { } } + public void Connect() => IsConnected = true; + public void Disconnect() => IsConnected = false; + public void Send(IOutboundMessage message) { } + public void StartStreaming() => IsStreaming = true; + public void StopStreaming() => IsStreaming = false; + public async Task GetLanChipInfoAsync(CancellationToken cancellationToken = default) + { + await Task.Delay(_attemptLatency, cancellationToken).ConfigureAwait(false); + return null; + } + } + + [Fact] + public async Task CheckWifiFirmwareStatusAsync_FirstAttemptSucceeds_DoesNotRetry() + { + // Steady-state path: no retry overhead when the first call works. + var wifiRelease = new FirmwareReleaseInfo + { + Version = new FirmwareVersion(19, 5, 4, null, 0), + TagName = "19.5.4", + IsPreRelease = false + }; + var device = new FakeLanChipInfoStreamingDevice( + "COM16", + chipInfo: new LanChipInfo + { + ChipId = 1234, + FwVersion = "19.5.4", + BuildDate = "Jan 8 2019" + }); + + var options = CreateFastOptions(); + options.LanChipInfoMaxAttempts = 3; + options.LanChipInfoRetryDelay = TimeSpan.FromMilliseconds(5); + + var service = new FirmwareUpdateService( + new FakeHidTransport(), + new FakeFirmwareDownloadService { LatestWifiRelease = wifiRelease }, + new FakeExternalProcessRunner(), + NullLogger.Instance, + new FakeBootloaderProtocol([[0x10]]), + new FakeHidDeviceEnumerator([]), + options); + + await service.CheckWifiFirmwareStatusAsync(device); + + Assert.Equal(1, device.GetLanChipInfoCallCount); + } + private sealed class SyncProgress : IProgress { private readonly Action _handler; @@ -848,14 +1039,18 @@ private sealed class FakeLanChipInfoStreamingDevice : IStreamingDevice, ILanChip { private readonly LanChipInfo? _chipInfo; private ConnectionStatus _status = ConnectionStatus.Connected; + private int _remainingTransientFailures; - public FakeLanChipInfoStreamingDevice(string name, LanChipInfo? chipInfo) + public FakeLanChipInfoStreamingDevice(string name, LanChipInfo? chipInfo, int transientFailuresBeforeSuccess = 0) { Name = name; _chipInfo = chipInfo; + _remainingTransientFailures = transientFailuresBeforeSuccess; IsConnected = true; } + public int GetLanChipInfoCallCount { get; private set; } + public string Name { get; } public IPAddress? IpAddress => null; public bool IsConnected { get; private set; } @@ -899,6 +1094,16 @@ public void Send(IOutboundMessage message) public Task GetLanChipInfoAsync(CancellationToken cancellationToken = default) { + GetLanChipInfoCallCount++; + if (_remainingTransientFailures > 0) + { + _remainingTransientFailures--; + // Faulted task (not sync throw) more accurately simulates how + // a real async method surfaces failure — caller's await sees a + // genuinely-async exception path. + return Task.FromException( + new InvalidOperationException("Simulated transient post-reboot failure.")); + } return Task.FromResult(_chipInfo); } } diff --git a/src/Daqifi.Core/Firmware/FirmwareUpdateService.cs b/src/Daqifi.Core/Firmware/FirmwareUpdateService.cs index a248d36..41c2ff3 100644 --- a/src/Daqifi.Core/Firmware/FirmwareUpdateService.cs +++ b/src/Daqifi.Core/Firmware/FirmwareUpdateService.cs @@ -536,21 +536,15 @@ private async Task CheckWifiFirmwareStatusCoreAsync( }; } - LanChipInfo? chipInfo; - try - { - chipInfo = await lanChipInfoProvider.GetLanChipInfoAsync(cancellationToken).ConfigureAwait(false); - } - catch (Exception ex) when (ex is not OperationCanceledException) - { - _logger.LogDebug(ex, "Failed to query LAN chip info; reporting status as ChipInfoUnavailable."); - return new WifiFirmwareStatus - { - IsUpToDate = false, - Reason = WifiFirmwareStatusReason.ChipInfoUnavailable, - }; - } - + // Bounded retry for the LAN chip-info probe (closes #144). Right + // after a PIC32 firmware update the application is up while WiFi + // is still finishing startup, so the first chip-info query can + // transiently fail; without retry, the WiFi version decision + // would short-circuit to ChipInfoUnavailable and flow on to a + // multi-minute reflash of already-current WiFi firmware. The + // retry budget is bounded (LanChipInfoMaxAttempts × RetryDelay) + // and observes cancellation between attempts. + var chipInfo = await TryGetLanChipInfoWithRetryAsync(lanChipInfoProvider, cancellationToken).ConfigureAwait(false); if (chipInfo == null) { return new WifiFirmwareStatus @@ -613,6 +607,102 @@ private async Task CheckWifiFirmwareStatusCoreAsync( }; } + private async Task TryGetLanChipInfoWithRetryAsync( + ILanChipInfoProvider lanChipInfoProvider, + CancellationToken cancellationToken) + { + var maxAttempts = Math.Max(1, _options.LanChipInfoMaxAttempts); + var retryDelay = _options.LanChipInfoRetryDelay; + var totalTimeout = _options.LanChipInfoTotalTimeout; + + // Wall-clock budget guards against the pathological case where + // attempt-count × per-attempt-timeout + retry-delay sum vastly + // exceeds the configured retry budget (e.g., 3 × 2s device timeout + // + 2 × 2s delay = ~10s while _operationLock is held). Linking + // the caller's CT preserves cancellation semantics; the timeout + // CTS just adds a deadline. + using var timeoutCts = new CancellationTokenSource(totalTimeout); + using var linkedCts = CancellationTokenSource.CreateLinkedTokenSource( + cancellationToken, timeoutCts.Token); + var linkedToken = linkedCts.Token; + + for (var attempt = 1; attempt <= maxAttempts; attempt++) + { + try + { + linkedToken.ThrowIfCancellationRequested(); + } + catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested && !cancellationToken.IsCancellationRequested) + { + _logger.LogDebug( + "LAN chip-info probe hit total timeout ({Timeout}) before attempt {Attempt}/{Max}.", + totalTimeout, + attempt, + maxAttempts); + return null; + } + + try + { + var chipInfo = await lanChipInfoProvider.GetLanChipInfoAsync(linkedToken).ConfigureAwait(false); + if (chipInfo != null) + { + if (attempt > 1) + { + _logger.LogDebug( + "LAN chip-info query succeeded on attempt {Attempt}/{Max}.", + attempt, + maxAttempts); + } + return chipInfo; + } + _logger.LogDebug( + "LAN chip-info query returned null on attempt {Attempt}/{Max}.", + attempt, + maxAttempts); + } + catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested && !cancellationToken.IsCancellationRequested) + { + _logger.LogDebug( + "LAN chip-info probe hit total timeout ({Timeout}) during attempt {Attempt}/{Max}.", + totalTimeout, + attempt, + maxAttempts); + return null; + } + catch (Exception ex) when (ex is not OperationCanceledException) + { + _logger.LogDebug( + ex, + "LAN chip-info query failed on attempt {Attempt}/{Max}.", + attempt, + maxAttempts); + } + + if (attempt < maxAttempts) + { + try + { + await Task.Delay(retryDelay, linkedToken).ConfigureAwait(false); + } + catch (OperationCanceledException) when (timeoutCts.IsCancellationRequested && !cancellationToken.IsCancellationRequested) + { + _logger.LogDebug( + "LAN chip-info probe hit total timeout ({Timeout}) during retry delay after attempt {Attempt}/{Max}.", + totalTimeout, + attempt, + maxAttempts); + return null; + } + } + } + + _logger.LogDebug( + "LAN chip-info query exhausted {Max} attempts; reporting status as ChipInfoUnavailable.", + maxAttempts); + return null; + } + private ExternalProcessRequest BuildWifiProcessRequest( IStreamingDevice device, string firmwarePath, diff --git a/src/Daqifi.Core/Firmware/FirmwareUpdateServiceOptions.cs b/src/Daqifi.Core/Firmware/FirmwareUpdateServiceOptions.cs index 4b75984..8d920b4 100644 --- a/src/Daqifi.Core/Firmware/FirmwareUpdateServiceOptions.cs +++ b/src/Daqifi.Core/Firmware/FirmwareUpdateServiceOptions.cs @@ -119,6 +119,43 @@ public sealed class FirmwareUpdateServiceOptions /// public string? WifiPortOverride { get; set; } + /// + /// Total attempts (initial + retries) for LAN chip-info queries before + /// the WiFi version decision falls through to "couldn't check, proceed + /// with flash". Right after a PIC32 firmware update the application is + /// typically up while the WiFi subsystem is still finishing startup, so + /// the first chip-info query can transiently fail; bounded retry covers + /// that window so callers don't unnecessarily reflash up-to-date WiFi + /// firmware (closes #144). + /// + /// + /// Each attempt also incurs the per-attempt timeout from the device + /// implementation (e.g., DaqifiStreamingDevice.GetLanChipInfoAsync + /// uses 2s). Total wall-clock budget is therefore + /// sum(attempt durations) + (MaxAttempts-1) * RetryDelay; with the + /// 2s device default, 3 attempts and 2s delay sum to ~10s in the worst + /// case. The retry loop holds _operationLock, so use + /// to cap the actual wall-clock + /// time independent of attempt counts. + /// + public int LanChipInfoMaxAttempts { get; set; } = 3; + + /// + /// Delay between LAN chip-info retry attempts (cancellation-aware). + /// + public TimeSpan LanChipInfoRetryDelay { get; set; } = TimeSpan.FromSeconds(2); + + /// + /// Hard upper bound on wall-clock time spent in the LAN chip-info + /// probe (including per-attempt query timeouts and retry delays). + /// When exceeded, the loop short-circuits to ChipInfoUnavailable + /// regardless of remaining attempts. Prevents pathological multi- + /// attempt cases (e.g., 3 attempts × 2s device timeout + 2 × 2s delays + /// = ~10s) from stalling firmware flows or UI status probes that hold + /// the operation lock. + /// + public TimeSpan LanChipInfoTotalTimeout { get; set; } = TimeSpan.FromSeconds(8); + /// /// Gets the configured timeout for a given firmware update state. /// @@ -176,6 +213,17 @@ public void Validate() "Flash write retry count must be at least 1."); } + if (LanChipInfoMaxAttempts < 1) + { + throw new ArgumentOutOfRangeException( + nameof(LanChipInfoMaxAttempts), + LanChipInfoMaxAttempts, + "LAN chip-info max attempts must be at least 1."); + } + + ValidatePositive(LanChipInfoRetryDelay, nameof(LanChipInfoRetryDelay)); + ValidatePositive(LanChipInfoTotalTimeout, nameof(LanChipInfoTotalTimeout)); + if (BootloaderVendorId < 0 || BootloaderVendorId > 0xFFFF) { throw new ArgumentOutOfRangeException(