diff --git a/WebReaper/API.md b/WebReaper/API.md index c3d9734..4a7fd2c 100644 --- a/WebReaper/API.md +++ b/WebReaper/API.md @@ -11,8 +11,40 @@ - [Get(startUrls)](#M-WebReaper-Builders-ConfigBuilder-Get-System-String[]- 'WebReaper.Builders.ConfigBuilder.Get(System.String[])') - [GetWithBrowser(startUrls,pageActions)](#M-WebReaper-Builders-ConfigBuilder-GetWithBrowser-System-Collections-Generic-IEnumerable{System-String},System-Collections-Generic-List{WebReaper-Domain-PageActions-PageAction}- 'WebReaper.Builders.ConfigBuilder.GetWithBrowser(System.Collections.Generic.IEnumerable{System.String},System.Collections.Generic.List{WebReaper.Domain.PageActions.PageAction})') - [FileScraperConfigStorage](#T-WebReaper-ConfigStorage-Concrete-FileScraperConfigStorage 'WebReaper.ConfigStorage.Concrete.FileScraperConfigStorage') +- [IProxyProposalProvider](#T-WebReaper-Proxy-Abstract-IProxyProposalProvider 'WebReaper.Proxy.Abstract.IProxyProposalProvider') + - [GetProxiesAsync()](#M-WebReaper-Proxy-Abstract-IProxyProposalProvider-GetProxiesAsync-System-Threading-CancellationToken- 'WebReaper.Proxy.Abstract.IProxyProposalProvider.GetProxiesAsync(System.Threading.CancellationToken)') +- [IProxyProposalValidator](#T-WebReaper-Proxy-Abstract-IProxyProposalValidator 'WebReaper.Proxy.Abstract.IProxyProposalValidator') + - [ValidateAsync()](#M-WebReaper-Proxy-Abstract-IProxyProposalValidator-ValidateAsync-System-Net-WebProxy,System-Threading-CancellationToken- 'WebReaper.Proxy.Abstract.IProxyProposalValidator.ValidateAsync(System.Net.WebProxy,System.Threading.CancellationToken)') +- [IProxyProvider](#T-WebReaper-Proxy-Abstract-IProxyProvider 'WebReaper.Proxy.Abstract.IProxyProvider') + - [GetProxyAsync()](#M-WebReaper-Proxy-Abstract-IProxyProvider-GetProxyAsync 'WebReaper.Proxy.Abstract.IProxyProvider.GetProxyAsync') +- [IValidatedProxyListProvider](#T-WebReaper-Proxy-Abstract-IValidatedProxyListProvider 'WebReaper.Proxy.Abstract.IValidatedProxyListProvider') + - 
[GetProxiesAsync()](#M-WebReaper-Proxy-Abstract-IValidatedProxyListProvider-GetProxiesAsync-System-Threading-CancellationToken- 'WebReaper.Proxy.Abstract.IValidatedProxyListProvider.GetProxiesAsync(System.Threading.CancellationToken)') - [InMemoryCookieStorage](#T-WebReaper-Core-CookieStorage-Concrete-InMemoryCookieStorage 'WebReaper.Core.CookieStorage.Concrete.InMemoryCookieStorage') +- [PingTimeoutProxyProposalValidator](#T-WebReaper-Proxy-Concrete-PingTimeoutProxyProposalValidator 'WebReaper.Proxy.Concrete.PingTimeoutProxyProposalValidator') + - [#ctor()](#M-WebReaper-Proxy-Concrete-PingTimeoutProxyProposalValidator-#ctor-Microsoft-Extensions-Options-IOptions{WebReaper-Proxy-Concrete-PingTimeoutValidatorOptions}- 'WebReaper.Proxy.Concrete.PingTimeoutProxyProposalValidator.#ctor(Microsoft.Extensions.Options.IOptions{WebReaper.Proxy.Concrete.PingTimeoutValidatorOptions})') + - [ValidateAsync()](#M-WebReaper-Proxy-Concrete-PingTimeoutProxyProposalValidator-ValidateAsync-System-Net-WebProxy,System-Threading-CancellationToken- 'WebReaper.Proxy.Concrete.PingTimeoutProxyProposalValidator.ValidateAsync(System.Net.WebProxy,System.Threading.CancellationToken)') +- [PingTimeoutValidatorOptions](#T-WebReaper-Proxy-Concrete-PingTimeoutValidatorOptions 'WebReaper.Proxy.Concrete.PingTimeoutValidatorOptions') + - [ProbeTimeout](#P-WebReaper-Proxy-Concrete-PingTimeoutValidatorOptions-ProbeTimeout 'WebReaper.Proxy.Concrete.PingTimeoutValidatorOptions.ProbeTimeout') + - [ProbeUrl](#P-WebReaper-Proxy-Concrete-PingTimeoutValidatorOptions-ProbeUrl 'WebReaper.Proxy.Concrete.PingTimeoutValidatorOptions.ProbeUrl') +- [ProxyProposalValidationResult](#T-WebReaper-Proxy-Concrete-ProxyProposalValidationResult 'WebReaper.Proxy.Concrete.ProxyProposalValidationResult') + - [Default](#F-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-Default 'WebReaper.Proxy.Concrete.ProxyProposalValidationResult.Default') + - [Error](#P-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-Error 
'WebReaper.Proxy.Concrete.ProxyProposalValidationResult.Error') + - [IsDefault](#P-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-IsDefault 'WebReaper.Proxy.Concrete.ProxyProposalValidationResult.IsDefault') + - [IsInvalid](#P-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-IsInvalid 'WebReaper.Proxy.Concrete.ProxyProposalValidationResult.IsInvalid') + - [IsValid](#P-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-IsValid 'WebReaper.Proxy.Concrete.ProxyProposalValidationResult.IsValid') + - [Invalid()](#M-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-Invalid-System-Exception- 'WebReaper.Proxy.Concrete.ProxyProposalValidationResult.Invalid(System.Exception)') + - [Valid()](#M-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-Valid 'WebReaper.Proxy.Concrete.ProxyProposalValidationResult.Valid') +- [ProxyProposalValidatorService](#T-WebReaper-Proxy-Concrete-ProxyProposalValidatorService 'WebReaper.Proxy.Concrete.ProxyProposalValidatorService') + - [#ctor()](#M-WebReaper-Proxy-Concrete-ProxyProposalValidatorService-#ctor-Microsoft-Extensions-Options-IOptions{WebReaper-Proxy-Concrete-ProxyProposalValidatorServiceOptions},Microsoft-Extensions-Logging-ILogger{WebReaper-Proxy-Concrete-ProxyProposalValidatorService},System-Collections-Generic-IEnumerable{WebReaper-Proxy-Abstract-IProxyProposalProvider},System-Collections-Generic-IEnumerable{WebReaper-Proxy-Abstract-IProxyProposalValidator}- 'WebReaper.Proxy.Concrete.ProxyProposalValidatorService.#ctor(Microsoft.Extensions.Options.IOptions{WebReaper.Proxy.Concrete.ProxyProposalValidatorServiceOptions},Microsoft.Extensions.Logging.ILogger{WebReaper.Proxy.Concrete.ProxyProposalValidatorService},System.Collections.Generic.IEnumerable{WebReaper.Proxy.Abstract.IProxyProposalProvider},System.Collections.Generic.IEnumerable{WebReaper.Proxy.Abstract.IProxyProposalValidator})') + - 
[ExecuteAsync()](#M-WebReaper-Proxy-Concrete-ProxyProposalValidatorService-ExecuteAsync-System-Threading-CancellationToken- 'WebReaper.Proxy.Concrete.ProxyProposalValidatorService.ExecuteAsync(System.Threading.CancellationToken)') + - [GetProxiesAsync()](#M-WebReaper-Proxy-Concrete-ProxyProposalValidatorService-GetProxiesAsync-System-Threading-CancellationToken- 'WebReaper.Proxy.Concrete.ProxyProposalValidatorService.GetProxiesAsync(System.Threading.CancellationToken)') +- [ProxyProposalValidatorServiceOptions](#T-WebReaper-Proxy-Concrete-ProxyProposalValidatorServiceOptions 'WebReaper.Proxy.Concrete.ProxyProposalValidatorServiceOptions') + - [ValidationInterval](#P-WebReaper-Proxy-Concrete-ProxyProposalValidatorServiceOptions-ValidationInterval 'WebReaper.Proxy.Concrete.ProxyProposalValidatorServiceOptions.ValidationInterval') - [ScraperEngineBuilder](#T-WebReaper-Builders-ScraperEngineBuilder 'WebReaper.Builders.ScraperEngineBuilder') +- [ValidatedProxyProvider](#T-WebReaper-Proxy-Concrete-ValidatedProxyProvider 'WebReaper.Proxy.Concrete.ValidatedProxyProvider') + - [#ctor()](#M-WebReaper-Proxy-Concrete-ValidatedProxyProvider-#ctor-WebReaper-Proxy-Abstract-IValidatedProxyListProvider- 'WebReaper.Proxy.Concrete.ValidatedProxyProvider.#ctor(WebReaper.Proxy.Abstract.IValidatedProxyListProvider)') + - [GetProxyAsync()](#M-WebReaper-Proxy-Concrete-ValidatedProxyProvider-GetProxyAsync-System-Threading-CancellationToken- 'WebReaper.Proxy.Concrete.ValidatedProxyProvider.GetProxyAsync(System.Threading.CancellationToken)') + - [GetProxyAsync()](#M-WebReaper-Proxy-Concrete-ValidatedProxyProvider-GetProxyAsync 'WebReaper.Proxy.Concrete.ValidatedProxyProvider.GetProxyAsync') ## BrowserPageLoader `type` @@ -105,6 +137,98 @@ WebReaper.ConfigStorage.Concrete *Inherit from parent.* + +## IProxyProposalProvider `type` + +##### Namespace + +WebReaper.Proxy.Abstract + +##### Summary + +Supplies a list of unvalidated proxies. 
+ + +### GetProxiesAsync() `method` + +##### Summary + +Returns a list of potential proxies, which may or may not be valid. + +##### Parameters + +This method has no parameters. + + +## IProxyProposalValidator `type` + +##### Namespace + +WebReaper.Proxy.Abstract + +##### Summary + +Validates a proposed proxy. + + +### ValidateAsync() `method` + +##### Summary + +Validates a proposed proxy. + +##### Returns + +A [ProxyProposalValidationResult](#T-WebReaper-Proxy-Concrete-ProxyProposalValidationResult 'WebReaper.Proxy.Concrete.ProxyProposalValidationResult') indicating whether the proxy is valid or invalid, or the validator does not apply to the result. + +##### Parameters + +This method has no parameters. + + +## IProxyProvider `type` + +##### Namespace + +WebReaper.Proxy.Abstract + +##### Summary + +Provides a validated proxy. + + +### GetProxyAsync() `method` + +##### Summary + +Returns a validated proxy. + +##### Parameters + +This method has no parameters. + + +## IValidatedProxyListProvider `type` + +##### Namespace + +WebReaper.Proxy.Abstract + +##### Summary + +Supplies a list of validated, ready to use proxies. + + +### GetProxiesAsync() `method` + +##### Summary + +Returns a list of validated proxies. + +##### Parameters + +This method has no parameters. + ## InMemoryCookieStorage `type` @@ -116,6 +240,198 @@ WebReaper.Core.CookieStorage.Concrete *Inherit from parent.* + +## PingTimeoutProxyProposalValidator `type` + +##### Namespace + +WebReaper.Proxy.Concrete + +##### Summary + +Validates a proxy by requesting a URL and waiting for a response. + + +### #ctor() `constructor` + +##### Summary + +Initializes a new instance of the [PingTimeoutProxyProposalValidator](#T-WebReaper-Proxy-Concrete-PingTimeoutProxyProposalValidator 'WebReaper.Proxy.Concrete.PingTimeoutProxyProposalValidator') class. + +##### Parameters + +This constructor has no parameters. 
+ + +### ValidateAsync() `method` + +##### Summary + +*Inherit from parent.* + +##### Parameters + +This method has no parameters. + + +## PingTimeoutValidatorOptions `type` + +##### Namespace + +WebReaper.Proxy.Concrete + +##### Summary + +Options for [PingTimeoutProxyProposalValidator](#T-WebReaper-Proxy-Concrete-PingTimeoutProxyProposalValidator 'WebReaper.Proxy.Concrete.PingTimeoutProxyProposalValidator'). + + +### ProbeTimeout `property` + +##### Summary + +The maximum time to wait for a response from the probe URL. + + +### ProbeUrl `property` + +##### Summary + +The URL to visit to validate the proxy. + + +## ProxyProposalValidationResult `type` + +##### Namespace + +WebReaper.Proxy.Concrete + +##### Summary + +The result of validating a proxy. + +##### Remarks + +Either [IsValid](#P-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-IsValid 'WebReaper.Proxy.Concrete.ProxyProposalValidationResult.IsValid') or [IsInvalid](#P-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-IsInvalid 'WebReaper.Proxy.Concrete.ProxyProposalValidationResult.IsInvalid') will be `true` when initialized. + + +### Default `constants` + +##### Summary + +A default result. + + +### Error `property` + +##### Summary + +The error, if any. + + +### IsDefault `property` + +##### Summary + +Whether the result is the default result. + + +### IsInvalid `property` + +##### Summary + +Whether the result is invalid. + + +### IsValid `property` + +##### Summary + +Whether the result is valid. + + +### Invalid() `method` + +##### Summary + +An invalid result, with an error. + +##### Parameters + +This method has no parameters. + + +### Valid() `method` + +##### Summary + +A valid result. + +##### Parameters + +This method has no parameters. + + +## ProxyProposalValidatorService `type` + +##### Namespace + +WebReaper.Proxy.Concrete + +##### Summary + +Periodically validates proxies and supplies the most recently validated list of proxies. 
+ + +### #ctor() `constructor` + +##### Summary + +Periodically validates proxies and supplies the most recently validated list of proxies. + +##### Parameters + +This constructor has no parameters. + + +### ExecuteAsync() `method` + +##### Summary + +*Inherit from parent.* + +##### Parameters + +This method has no parameters. + + +### GetProxiesAsync() `method` + +##### Summary + +*Inherit from parent.* + +##### Parameters + +This method has no parameters. + + +## ProxyProposalValidatorServiceOptions `type` + +##### Namespace + +WebReaper.Proxy.Concrete + +##### Summary + +Options for [ProxyProposalValidatorService](#T-WebReaper-Proxy-Concrete-ProxyProposalValidatorService 'WebReaper.Proxy.Concrete.ProxyProposalValidatorService'). + + +### ValidationInterval `property` + +##### Summary + +The interval at which to validate proxies. + ## ScraperEngineBuilder `type` @@ -126,3 +442,51 @@ WebReaper.Builders ##### Summary Builds a web scraper engine responsible for creating and receiving crawling jobs and running a spider on them + + +## ValidatedProxyProvider `type` + +##### Namespace + +WebReaper.Proxy.Concrete + +##### Summary + +Provides a random validated proxy. + +##### See Also + +- [WebReaper.Proxy.Concrete.ProxyProposalValidatorService](#T-WebReaper-Proxy-Concrete-ProxyProposalValidatorService 'WebReaper.Proxy.Concrete.ProxyProposalValidatorService') + + +### #ctor() `constructor` + +##### Summary + +Initializes a new instance of the [ValidatedProxyProvider](#T-WebReaper-Proxy-Concrete-ValidatedProxyProvider 'WebReaper.Proxy.Concrete.ValidatedProxyProvider') class. + +##### Parameters + +This constructor has no parameters. + + +### GetProxyAsync() `method` + +##### Summary + +*Inherit from parent.* + +##### Parameters + +This method has no parameters. + + +### GetProxyAsync() `method` + +##### Summary + +*Inherit from parent.* + +##### Parameters + +This method has no parameters. 
diff --git a/WebReaper/API.xml b/WebReaper/API.xml index e9536b2..3bd719a 100644 --- a/WebReaper/API.xml +++ b/WebReaper/API.xml @@ -51,5 +51,160 @@ Logger + + + Supplies a list of unvalidated proxies. + + + + + Returns a list of potential proxies, which may or may not be valid. + + + + + Validates a proposed proxy. + + + + + Validates a proposed proxy. + + A ProxyProposalValidationResult indicating whether the proxy is valid or invalid, or that the validator does not apply to the proxy. + + + + Provides a validated proxy. + + + + + Returns a validated proxy. + + + + + Supplies a list of validated, ready to use proxies. + + + + + Returns a list of validated proxies. + + + + + Options for PingTimeoutProxyProposalValidator. + + + + + The URL to visit to validate the proxy. + + + + + The maximum time to wait for a response from the probe URL. + + + + + Validates a proxy by requesting a URL and waiting for a response. + + + + + Initializes a new instance of the PingTimeoutProxyProposalValidator class. + + + + + + + + The result of validating a proxy. + + + Either IsValid or IsInvalid will be true when initialized. + + + + + A default result. + + + + + A valid result. + + + + + An invalid result, with an error. + + + + + Whether the result is the default result. + + + + + Whether the result is valid. + + + + + Whether the result is invalid. + + + + + The error, if any. + + + + + Options for ProxyProposalValidatorService. + + + + + The interval at which to validate proxies. + + + + + Periodically validates proxies and supplies the most recently validated list of proxies. + + + + + Periodically validates proxies and supplies the most recently validated list of proxies. + + + + + + + + + + + Provides a random validated proxy. + + + + + + Initializes a new instance of the ValidatedProxyProvider class. 
+ + + + + + + + diff --git a/WebReaper/Extensions/EnumerableExtensions.cs b/WebReaper/Extensions/EnumerableExtensions.cs new file mode 100644 index 0000000..fc54fbd --- /dev/null +++ b/WebReaper/Extensions/EnumerableExtensions.cs @@ -0,0 +1,66 @@ +using System; + +namespace WebReaper.Extensions; + +internal static class EnumerableExtensions /* Sequence helpers: null-dropping projections and random element selection. */ +{ + public static IEnumerable SelectTruthy(this IEnumerable enumerable, Func predicate) + where U : class /* Lazily projects each item with predicate and yields only the non-null results (reference-type overload). */ + { + foreach (var item in enumerable) + { + if (predicate(item) is { } result) + { + yield return result; + } + } + } + + public static IEnumerable SelectTruthy(this IEnumerable enumerable) /* Lazily yields the non-null items of the sequence. */ + { + foreach (var item in enumerable) + { + if (item is { } result) + { + yield return result; + } + } + } + + public static IEnumerable SelectTruthy(this IEnumerable enumerable, Func predicate) where U : struct /* Value-type counterpart of the projecting overload; presumably predicate returns a nullable U and only results that have a value are yielded. */ + { + foreach (var item in enumerable) + { + if (predicate(item) is { } result) + { + yield return result; + } + } + } + + public static IEnumerable SelectTruthy(this IEnumerable enumerable) where T : struct /* Value-type counterpart of the filtering overload. */ + { + foreach (var item in enumerable) + { + if (item is { } result) + { + yield return result; + } + } + } + + public static T ChooseRandom(this IEnumerable enumerable, Random? random = null) /* Returns one randomly chosen element, using Random.Shared when no Random is supplied. NOTE(review): an empty sequence appears to fail with an out-of-range exception from ElementAt or the list indexer; confirm callers never pass one. 
*/ + { + random ??= Random.Shared; + if (enumerable.TryGetNonEnumeratedCount(out var count)) /* count is available without enumerating, so index straight into the source */ + { + var index = random.Next(count); + return enumerable.ElementAt(index); + } + else /* count unknown: copy to a list once, then index into the copy */ + { + var list = enumerable.ToList(); + return list[random.Next(list.Count)]; + } + } +} diff --git a/WebReaper/Proxy/Abstract/IProxyProposalProvider.cs b/WebReaper/Proxy/Abstract/IProxyProposalProvider.cs new file mode 100644 index 0000000..b1d7852 --- /dev/null +++ b/WebReaper/Proxy/Abstract/IProxyProposalProvider.cs @@ -0,0 +1,15 @@ +using System; +using System.Net; + +namespace WebReaper.Proxy.Abstract; + +/// +/// Supplies a list of unvalidated proxies. 
+/// +public interface IProxyProposalProvider +{ + /// + /// Returns a list of potential proxies, which may or may not be valid. + /// + Task> GetProxiesAsync(CancellationToken cancellationToken = default); +} diff --git a/WebReaper/Proxy/Abstract/IProxyProposalValidator.cs b/WebReaper/Proxy/Abstract/IProxyProposalValidator.cs new file mode 100644 index 0000000..dc4a63e --- /dev/null +++ b/WebReaper/Proxy/Abstract/IProxyProposalValidator.cs @@ -0,0 +1,17 @@ +using System; +using System.Net; +using WebReaper.Proxy.Concrete; + +namespace WebReaper.Proxy.Abstract; + +/// +/// Validates a proposed proxy. +/// +public interface IProxyProposalValidator +{ + /// + /// Validates a proposed proxy. + /// + /// A ProxyProposalValidationResult indicating whether the proxy is valid or invalid, or that the validator does not apply to the proxy. + Task ValidateAsync(WebProxy proxy, CancellationToken cancellationToken = default); +} diff --git a/WebReaper/Proxy/Abstract/IProxyProvider.cs b/WebReaper/Proxy/Abstract/IProxyProvider.cs index 62e8eab..173e23f 100644 --- a/WebReaper/Proxy/Abstract/IProxyProvider.cs +++ b/WebReaper/Proxy/Abstract/IProxyProvider.cs @@ -2,7 +2,13 @@ namespace WebReaper.Proxy.Abstract; +/// +/// Provides a validated proxy. +/// public interface IProxyProvider { + /// + /// Returns a validated proxy. + /// Task GetProxyAsync(); } \ No newline at end of file diff --git a/WebReaper/Proxy/Abstract/IValidatedProxyListProvider.cs b/WebReaper/Proxy/Abstract/IValidatedProxyListProvider.cs new file mode 100644 index 0000000..e769008 --- /dev/null +++ b/WebReaper/Proxy/Abstract/IValidatedProxyListProvider.cs @@ -0,0 +1,15 @@ +using System; +using System.Net; + +namespace WebReaper.Proxy.Abstract; + +/// +/// Supplies a list of validated, ready to use proxies. +/// +public interface IValidatedProxyListProvider +{ + /// + /// Returns a list of validated proxies. 
+ /// + Task> GetProxiesAsync(CancellationToken cancellationToken = default); +} diff --git a/WebReaper/Proxy/Concrete/PingTimeoutProxyProposalValidator.cs b/WebReaper/Proxy/Concrete/PingTimeoutProxyProposalValidator.cs new file mode 100644 index 0000000..3c32c90 --- /dev/null +++ b/WebReaper/Proxy/Concrete/PingTimeoutProxyProposalValidator.cs @@ -0,0 +1,74 @@ +using System; +using System.Net; +using Microsoft.Extensions.Options; +using WebReaper.Proxy.Abstract; + +namespace WebReaper.Proxy.Concrete; + +/// +/// Options for PingTimeoutProxyProposalValidator. +/// +public sealed class PingTimeoutValidatorOptions : IOptions +{ + /// + /// The URL to visit to validate the proxy. + /// + public Uri ProbeUrl { get; set; } = new("https://www.cloudflare.com/"); + /// + /// The maximum time to wait for a response from the probe URL. + /// + public TimeSpan ProbeTimeout { get; set; } = TimeSpan.FromSeconds(5); + PingTimeoutValidatorOptions IOptions.Value => this; /* explicit IOptions implementation: the options object is its own Value */ +} + +/// +/// Validates a proxy by requesting a URL and waiting for a response. +/// +public sealed class PingTimeoutProxyProposalValidator : IProxyProposalValidator +{ + private readonly PingTimeoutValidatorOptions _options; + + /// + /// Initializes a new instance of the PingTimeoutProxyProposalValidator class. 
+ /// + public PingTimeoutProxyProposalValidator(IOptions options) + { + _options = options.Value; + } + /// + + public async Task ValidateAsync(WebProxy proxy, CancellationToken cancellationToken = default) /* Requests ProbeUrl through the proxy: Valid on a success status, Invalid on any failure including a ProbeTimeout expiry, and the default result when the caller cancels. */ + { + using HttpMessageHandler h = new HttpClientHandler + { + Proxy = proxy, + UseProxy = true + }; + using var client = new HttpClient(h, false) + { + Timeout = _options.ProbeTimeout + }; + try + { + using var response = await client.GetAsync(_options.ProbeUrl, cancellationToken); /* disposed so the probe connection is released promptly */ + response.EnsureSuccessStatusCode(); + return ProxyProposalValidationResult.Valid(); + } + catch (AggregateException ex) + { + if (cancellationToken.IsCancellationRequested && ex.InnerExceptions.All(inner => inner is OperationCanceledException)) + { + return default; + } + return ProxyProposalValidationResult.Invalid(ex); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) /* only caller cancellation means "no verdict"; an HttpClient.Timeout expiry also surfaces as a cancellation exception and must reach the Invalid branch below */ + { + return default; + } + catch (Exception ex) + { + return ProxyProposalValidationResult.Invalid(ex); + } + } +} diff --git a/WebReaper/Proxy/Concrete/ProxyProposalValidationResult.cs b/WebReaper/Proxy/Concrete/ProxyProposalValidationResult.cs new file mode 100644 index 0000000..68ab7cf --- /dev/null +++ b/WebReaper/Proxy/Concrete/ProxyProposalValidationResult.cs @@ -0,0 +1,57 @@ +using System; + +namespace WebReaper.Proxy.Concrete; + +/// +/// The result of validating a proxy. +/// +/// +/// Either IsValid or IsInvalid will be true when initialized. +/// +public readonly struct ProxyProposalValidationResult +{ + private readonly Kind _kind; + + ProxyProposalValidationResult(Kind kind, Exception? 
error = null) + { + (_kind, Error) = (kind, error); /* keep the error as well, so results built by Invalid(...) expose it through Error */ + } + + /// + /// A default result. + /// + public static readonly ProxyProposalValidationResult Default = new ProxyProposalValidationResult(Kind.Default); + + /// + /// A valid result. + /// + public static ProxyProposalValidationResult Valid() => new ProxyProposalValidationResult(Kind.Valid); + /// + /// An invalid result, with an error. 
+ /// + public static ProxyProposalValidationResult Invalid(Exception error) => new ProxyProposalValidationResult(Kind.Invalid, error); + + /// + /// Whether the result is the default result. + /// + public bool IsDefault => _kind == Kind.Default; + /// + /// Whether the result is valid. + /// + public bool IsValid => _kind == Kind.Valid; + /// + /// Whether the result is invalid. + /// + public bool IsInvalid => _kind == Kind.Invalid; + /// + /// The error, if any. + /// + public Exception? Error { get; } + + enum Kind /* Default is listed first, so it is the zero value that default(ProxyProposalValidationResult) carries */ + { + Default, + Valid, + Invalid + } +} diff --git a/WebReaper/Proxy/Concrete/ProxyProsposalValidatorService.cs b/WebReaper/Proxy/Concrete/ProxyProsposalValidatorService.cs new file mode 100644 index 0000000..87a7ae1 --- /dev/null +++ b/WebReaper/Proxy/Concrete/ProxyProsposalValidatorService.cs @@ -0,0 +1,107 @@ +using System; +using System.Net; +using Microsoft.Extensions.Hosting; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using WebReaper.Extensions; +using WebReaper.Proxy.Abstract; + +namespace WebReaper.Proxy.Concrete; + +/// +/// Options for ProxyProposalValidatorService. +/// +public sealed class ProxyProposalValidatorServiceOptions : IOptions +{ + /// + /// The interval at which to validate proxies. + /// + public TimeSpan ValidationInterval { get; set; } = TimeSpan.FromMinutes(2); + ProxyProposalValidatorServiceOptions IOptions.Value => this; /* explicit IOptions implementation: the options object is its own Value */ +} + +/// +/// Periodically validates proxies and supplies the most recently validated list of proxies. 
+/// +public sealed class ProxyProposalValidatorService : BackgroundService, IValidatedProxyListProvider +{ + private readonly ProxyProposalValidatorServiceOptions _options; + private readonly ILogger _logger; + private readonly IEnumerable _proxySuppliers; + private readonly IEnumerable _proxyValidators; + private TaskCompletionSource> _proxiesCompletion = new(); /* starts uncompleted; GetProxiesAsync hands out its Task, so callers wait until a result has been set */ + + /// + /// Periodically validates proxies and supplies the most recently validated list of proxies. 
+ /// + public ProxyProposalValidatorService( + IOptions options, + ILogger logger, + IEnumerable proxySuppliers, + IEnumerable proxyValidators + ) + { + _options = options.Value; + _logger = logger; + _proxySuppliers = proxySuppliers; + _proxyValidators = proxyValidators; + } + + /// + public Task> GetProxiesAsync(CancellationToken cancellationToken = default) /* Completes with the most recently published list; waits while no list has been published yet. */ + { + return _proxiesCompletion.Task.WaitAsync(cancellationToken); + } + + /// + protected override async Task ExecuteAsync(CancellationToken stoppingToken) /* Loop: gather proposals from every supplier, validate them all concurrently, publish the survivors, then sleep for ValidationInterval. */ + { + while (!stoppingToken.IsCancellationRequested) + { + var proxies = await Task.WhenAll(_proxySuppliers.Select(supplier => supplier.GetProxiesAsync(stoppingToken))); + var validatedProxies = await Task.WhenAll(proxies + .SelectMany(proxy => proxy) + .Select(proxy => FilterAvailableProxy(proxy, stoppingToken)) + ); + // update the completion source + UpdateValidatedProxies(validatedProxies.SelectTruthy().ToArray()); /* materialize once: publishing the lazy iterator would re-filter on every consumer enumeration and defeat ChooseRandom's non-enumerated count path */ + + await Task.Delay(_options.ValidationInterval, stoppingToken); + stoppingToken.ThrowIfCancellationRequested(); + } + } + + private void UpdateValidatedProxies(IEnumerable validatedProxies) /* Publishes a new validated list to current and future GetProxiesAsync callers. 
*/ + { + // Try to set the uncompleted task + if (!_proxiesCompletion.TrySetResult(validatedProxies)) + { + // Replace the completed task with a new completed task + TaskCompletionSource> completion = new(); + completion.SetResult(validatedProxies); + _proxiesCompletion = completion; + } + } + + private async Task FilterAvailableProxy(WebProxy proxy, CancellationToken stoppingToken) /* Returns the proxy when no validator rejected it; otherwise logs a warning and returns null. */ + { + var result = await ValidateProxy(proxy, stoppingToken); + if (result.IsInvalid) + { + _logger.LogWarning(result.Error, "Proxy {proxy} is invalid", proxy.Address); + return null; + } + return proxy; + } + + private async Task ValidateProxy(WebProxy webProxy, CancellationToken cancellationToken = default) /* Runs every validator against the proxy concurrently and combines their verdicts. */ + { + var results = await Task.WhenAll(_proxyValidators.Select(async validator => await validator.ValidateAsync(webProxy, cancellationToken))); + if (results.All(x => 
!x.IsInvalid)) /* default (no-verdict) results count as acceptable; only an explicit Invalid rejects the proxy */ + { + return ProxyProposalValidationResult.Valid(); + } + AggregateException error = new("No valid proxy found", results.SelectTruthy(x => x.Error)); /* NOTE(review): this message is attached to a single rejected proxy, not to an empty pool; consider rewording */ + return ProxyProposalValidationResult.Invalid(error); + } +} diff --git a/WebReaper/Proxy/Concrete/ValidatedProxyProvider.cs b/WebReaper/Proxy/Concrete/ValidatedProxyProvider.cs new file mode 100644 index 0000000..bc8ed5d --- /dev/null +++ b/WebReaper/Proxy/Concrete/ValidatedProxyProvider.cs @@ -0,0 +1,36 @@ +using System; +using System.Net; +using WebReaper.Extensions; +using WebReaper.Proxy.Abstract; + +namespace WebReaper.Proxy.Concrete; + +/// +/// Provides a random validated proxy. +/// +/// +public sealed class ValidatedProxyProvider : IProxyProvider +{ + private readonly IValidatedProxyListProvider _validatedProxySource; + + /// + /// Initializes a new instance of the ValidatedProxyProvider class. + /// + public ValidatedProxyProvider(IValidatedProxyListProvider validatedProxySource) + { + _validatedProxySource = validatedProxySource; + } + + /// + public async Task GetProxyAsync(CancellationToken cancellationToken = default) /* Awaits the current validated list and returns one randomly chosen entry from it. */ + { + var proxies = await _validatedProxySource.GetProxiesAsync(cancellationToken); + return proxies.ChooseRandom(); + } + + /// + public Task GetProxyAsync() /* parameterless overload matching IProxyProvider.GetProxyAsync(); forwards with a default (non-cancellable) token */ + { + return GetProxyAsync(default); + } +} diff --git a/WebReaper/WebReaper.csproj b/WebReaper/WebReaper.csproj index 349921d..f5fb4c2 100644 --- a/WebReaper/WebReaper.csproj +++ b/WebReaper/WebReaper.csproj @@ -42,6 +42,8 @@ + +