diff --git a/WebReaper/API.md b/WebReaper/API.md index c3d9734..4a7fd2c 100644 --- a/WebReaper/API.md +++ b/WebReaper/API.md @@ -11,8 +11,40 @@ - [Get(startUrls)](#M-WebReaper-Builders-ConfigBuilder-Get-System-String[]- 'WebReaper.Builders.ConfigBuilder.Get(System.String[])') - [GetWithBrowser(startUrls,pageActions)](#M-WebReaper-Builders-ConfigBuilder-GetWithBrowser-System-Collections-Generic-IEnumerable{System-String},System-Collections-Generic-List{WebReaper-Domain-PageActions-PageAction}- 'WebReaper.Builders.ConfigBuilder.GetWithBrowser(System.Collections.Generic.IEnumerable{System.String},System.Collections.Generic.List{WebReaper.Domain.PageActions.PageAction})') - [FileScraperConfigStorage](#T-WebReaper-ConfigStorage-Concrete-FileScraperConfigStorage 'WebReaper.ConfigStorage.Concrete.FileScraperConfigStorage') +- [IProxyProposalProvider](#T-WebReaper-Proxy-Abstract-IProxyProposalProvider 'WebReaper.Proxy.Abstract.IProxyProposalProvider') + - [GetProxiesAsync()](#M-WebReaper-Proxy-Abstract-IProxyProposalProvider-GetProxiesAsync-System-Threading-CancellationToken- 'WebReaper.Proxy.Abstract.IProxyProposalProvider.GetProxiesAsync(System.Threading.CancellationToken)') +- [IProxyProposalValidator](#T-WebReaper-Proxy-Abstract-IProxyProposalValidator 'WebReaper.Proxy.Abstract.IProxyProposalValidator') + - [ValidateAsync()](#M-WebReaper-Proxy-Abstract-IProxyProposalValidator-ValidateAsync-System-Net-WebProxy,System-Threading-CancellationToken- 'WebReaper.Proxy.Abstract.IProxyProposalValidator.ValidateAsync(System.Net.WebProxy,System.Threading.CancellationToken)') +- [IProxyProvider](#T-WebReaper-Proxy-Abstract-IProxyProvider 'WebReaper.Proxy.Abstract.IProxyProvider') + - [GetProxyAsync()](#M-WebReaper-Proxy-Abstract-IProxyProvider-GetProxyAsync 'WebReaper.Proxy.Abstract.IProxyProvider.GetProxyAsync') +- [IValidatedProxyListProvider](#T-WebReaper-Proxy-Abstract-IValidatedProxyListProvider 'WebReaper.Proxy.Abstract.IValidatedProxyListProvider') + - 
[GetProxiesAsync()](#M-WebReaper-Proxy-Abstract-IValidatedProxyListProvider-GetProxiesAsync-System-Threading-CancellationToken- 'WebReaper.Proxy.Abstract.IValidatedProxyListProvider.GetProxiesAsync(System.Threading.CancellationToken)') - [InMemoryCookieStorage](#T-WebReaper-Core-CookieStorage-Concrete-InMemoryCookieStorage 'WebReaper.Core.CookieStorage.Concrete.InMemoryCookieStorage') +- [PingTimeoutProxyProposalValidator](#T-WebReaper-Proxy-Concrete-PingTimeoutProxyProposalValidator 'WebReaper.Proxy.Concrete.PingTimeoutProxyProposalValidator') + - [#ctor()](#M-WebReaper-Proxy-Concrete-PingTimeoutProxyProposalValidator-#ctor-Microsoft-Extensions-Options-IOptions{WebReaper-Proxy-Concrete-PingTimeoutValidatorOptions}- 'WebReaper.Proxy.Concrete.PingTimeoutProxyProposalValidator.#ctor(Microsoft.Extensions.Options.IOptions{WebReaper.Proxy.Concrete.PingTimeoutValidatorOptions})') + - [ValidateAsync()](#M-WebReaper-Proxy-Concrete-PingTimeoutProxyProposalValidator-ValidateAsync-System-Net-WebProxy,System-Threading-CancellationToken- 'WebReaper.Proxy.Concrete.PingTimeoutProxyProposalValidator.ValidateAsync(System.Net.WebProxy,System.Threading.CancellationToken)') +- [PingTimeoutValidatorOptions](#T-WebReaper-Proxy-Concrete-PingTimeoutValidatorOptions 'WebReaper.Proxy.Concrete.PingTimeoutValidatorOptions') + - [ProbeTimeout](#P-WebReaper-Proxy-Concrete-PingTimeoutValidatorOptions-ProbeTimeout 'WebReaper.Proxy.Concrete.PingTimeoutValidatorOptions.ProbeTimeout') + - [ProbeUrl](#P-WebReaper-Proxy-Concrete-PingTimeoutValidatorOptions-ProbeUrl 'WebReaper.Proxy.Concrete.PingTimeoutValidatorOptions.ProbeUrl') +- [ProxyProposalValidationResult](#T-WebReaper-Proxy-Concrete-ProxyProposalValidationResult 'WebReaper.Proxy.Concrete.ProxyProposalValidationResult') + - [Default](#F-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-Default 'WebReaper.Proxy.Concrete.ProxyProposalValidationResult.Default') + - [Error](#P-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-Error 
'WebReaper.Proxy.Concrete.ProxyProposalValidationResult.Error') + - [IsDefault](#P-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-IsDefault 'WebReaper.Proxy.Concrete.ProxyProposalValidationResult.IsDefault') + - [IsInvalid](#P-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-IsInvalid 'WebReaper.Proxy.Concrete.ProxyProposalValidationResult.IsInvalid') + - [IsValid](#P-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-IsValid 'WebReaper.Proxy.Concrete.ProxyProposalValidationResult.IsValid') + - [Invalid()](#M-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-Invalid-System-Exception- 'WebReaper.Proxy.Concrete.ProxyProposalValidationResult.Invalid(System.Exception)') + - [Valid()](#M-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-Valid 'WebReaper.Proxy.Concrete.ProxyProposalValidationResult.Valid') +- [ProxyProposalValidatorService](#T-WebReaper-Proxy-Concrete-ProxyProposalValidatorService 'WebReaper.Proxy.Concrete.ProxyProposalValidatorService') + - [#ctor()](#M-WebReaper-Proxy-Concrete-ProxyProposalValidatorService-#ctor-Microsoft-Extensions-Options-IOptions{WebReaper-Proxy-Concrete-ProxyProposalValidatorServiceOptions},Microsoft-Extensions-Logging-ILogger{WebReaper-Proxy-Concrete-ProxyProposalValidatorService},System-Collections-Generic-IEnumerable{WebReaper-Proxy-Abstract-IProxyProposalProvider},System-Collections-Generic-IEnumerable{WebReaper-Proxy-Abstract-IProxyProposalValidator}- 'WebReaper.Proxy.Concrete.ProxyProposalValidatorService.#ctor(Microsoft.Extensions.Options.IOptions{WebReaper.Proxy.Concrete.ProxyProposalValidatorServiceOptions},Microsoft.Extensions.Logging.ILogger{WebReaper.Proxy.Concrete.ProxyProposalValidatorService},System.Collections.Generic.IEnumerable{WebReaper.Proxy.Abstract.IProxyProposalProvider},System.Collections.Generic.IEnumerable{WebReaper.Proxy.Abstract.IProxyProposalValidator})') + - 
[ExecuteAsync()](#M-WebReaper-Proxy-Concrete-ProxyProposalValidatorService-ExecuteAsync-System-Threading-CancellationToken- 'WebReaper.Proxy.Concrete.ProxyProposalValidatorService.ExecuteAsync(System.Threading.CancellationToken)') + - [GetProxiesAsync()](#M-WebReaper-Proxy-Concrete-ProxyProposalValidatorService-GetProxiesAsync-System-Threading-CancellationToken- 'WebReaper.Proxy.Concrete.ProxyProposalValidatorService.GetProxiesAsync(System.Threading.CancellationToken)') +- [ProxyProposalValidatorServiceOptions](#T-WebReaper-Proxy-Concrete-ProxyProposalValidatorServiceOptions 'WebReaper.Proxy.Concrete.ProxyProposalValidatorServiceOptions') + - [ValidationInterval](#P-WebReaper-Proxy-Concrete-ProxyProposalValidatorServiceOptions-ValidationInterval 'WebReaper.Proxy.Concrete.ProxyProposalValidatorServiceOptions.ValidationInterval') - [ScraperEngineBuilder](#T-WebReaper-Builders-ScraperEngineBuilder 'WebReaper.Builders.ScraperEngineBuilder') +- [ValidatedProxyProvider](#T-WebReaper-Proxy-Concrete-ValidatedProxyProvider 'WebReaper.Proxy.Concrete.ValidatedProxyProvider') + - [#ctor()](#M-WebReaper-Proxy-Concrete-ValidatedProxyProvider-#ctor-WebReaper-Proxy-Abstract-IValidatedProxyListProvider- 'WebReaper.Proxy.Concrete.ValidatedProxyProvider.#ctor(WebReaper.Proxy.Abstract.IValidatedProxyListProvider)') + - [GetProxyAsync()](#M-WebReaper-Proxy-Concrete-ValidatedProxyProvider-GetProxyAsync-System-Threading-CancellationToken- 'WebReaper.Proxy.Concrete.ValidatedProxyProvider.GetProxyAsync(System.Threading.CancellationToken)') + - [GetProxyAsync()](#M-WebReaper-Proxy-Concrete-ValidatedProxyProvider-GetProxyAsync 'WebReaper.Proxy.Concrete.ValidatedProxyProvider.GetProxyAsync') <a name='T-WebReaper-Core-Loaders-Abstract-BrowserPageLoader'></a> ## BrowserPageLoader `type` @@ -105,6 +137,98 @@ WebReaper.ConfigStorage.Concrete *Inherit from parent.* +<a name='T-WebReaper-Proxy-Abstract-IProxyProposalProvider'></a> +## IProxyProposalProvider `type` + +##### Namespace + 
+WebReaper.Proxy.Abstract + +##### Summary + +Supplies a list of unvalidated proxies. + +<a name='M-WebReaper-Proxy-Abstract-IProxyProposalProvider-GetProxiesAsync-System-Threading-CancellationToken-'></a> +### GetProxiesAsync() `method` + +##### Summary + +Returns a list of potential proxies, which may or may not be valid. + +##### Parameters + +This method has no parameters. + +<a name='T-WebReaper-Proxy-Abstract-IProxyProposalValidator'></a> +## IProxyProposalValidator `type` + +##### Namespace + +WebReaper.Proxy.Abstract + +##### Summary + +Validates a proposed proxy. + +<a name='M-WebReaper-Proxy-Abstract-IProxyProposalValidator-ValidateAsync-System-Net-WebProxy,System-Threading-CancellationToken-'></a> +### ValidateAsync() `method` + +##### Summary + +Validates a proposed proxy. + +##### Returns + +A [ProxyProposalValidationResult](#T-WebReaper-Proxy-Concrete-ProxyProposalValidationResult 'WebReaper.Proxy.Concrete.ProxyProposalValidationResult') indicating whether the proxy is valid or invalid, or the validator does not apply to the result. + +##### Parameters + +This method has no parameters. + +<a name='T-WebReaper-Proxy-Abstract-IProxyProvider'></a> +## IProxyProvider `type` + +##### Namespace + +WebReaper.Proxy.Abstract + +##### Summary + +Provides a validated proxy. + +<a name='M-WebReaper-Proxy-Abstract-IProxyProvider-GetProxyAsync'></a> +### GetProxyAsync() `method` + +##### Summary + +Returns a validated proxy. + +##### Parameters + +This method has no parameters. + +<a name='T-WebReaper-Proxy-Abstract-IValidatedProxyListProvider'></a> +## IValidatedProxyListProvider `type` + +##### Namespace + +WebReaper.Proxy.Abstract + +##### Summary + +Supplies a list of validated, ready to use proxies. + +<a name='M-WebReaper-Proxy-Abstract-IValidatedProxyListProvider-GetProxiesAsync-System-Threading-CancellationToken-'></a> +### GetProxiesAsync() `method` + +##### Summary + +Returns a list of validated proxies. 
+ +##### Parameters + +This method has no parameters. + <a name='T-WebReaper-Core-CookieStorage-Concrete-InMemoryCookieStorage'></a> ## InMemoryCookieStorage `type` @@ -116,6 +240,198 @@ WebReaper.Core.CookieStorage.Concrete *Inherit from parent.* +<a name='T-WebReaper-Proxy-Concrete-PingTimeoutProxyProposalValidator'></a> +## PingTimeoutProxyProposalValidator `type` + +##### Namespace + +WebReaper.Proxy.Concrete + +##### Summary + +Validates a proxy by requesting a URL and waiting for a response. + +<a name='M-WebReaper-Proxy-Concrete-PingTimeoutProxyProposalValidator-#ctor-Microsoft-Extensions-Options-IOptions{WebReaper-Proxy-Concrete-PingTimeoutValidatorOptions}-'></a> +### #ctor() `constructor` + +##### Summary + +Initializes a new instance of the [PingTimeoutProxyProposalValidator](#T-WebReaper-Proxy-Concrete-PingTimeoutProxyProposalValidator 'WebReaper.Proxy.Concrete.PingTimeoutProxyProposalValidator') class. + +##### Parameters + +This constructor has no parameters. + +<a name='M-WebReaper-Proxy-Concrete-PingTimeoutProxyProposalValidator-ValidateAsync-System-Net-WebProxy,System-Threading-CancellationToken-'></a> +### ValidateAsync() `method` + +##### Summary + +*Inherit from parent.* + +##### Parameters + +This method has no parameters. + +<a name='T-WebReaper-Proxy-Concrete-PingTimeoutValidatorOptions'></a> +## PingTimeoutValidatorOptions `type` + +##### Namespace + +WebReaper.Proxy.Concrete + +##### Summary + +Options for [PingTimeoutProxyProposalValidator](#T-WebReaper-Proxy-Concrete-PingTimeoutProxyProposalValidator 'WebReaper.Proxy.Concrete.PingTimeoutProxyProposalValidator'). + +<a name='P-WebReaper-Proxy-Concrete-PingTimeoutValidatorOptions-ProbeTimeout'></a> +### ProbeTimeout `property` + +##### Summary + +The maximum time to wait for a response from the probe URL. + +<a name='P-WebReaper-Proxy-Concrete-PingTimeoutValidatorOptions-ProbeUrl'></a> +### ProbeUrl `property` + +##### Summary + +The URL to visit to validate the proxy. 
+ +<a name='T-WebReaper-Proxy-Concrete-ProxyProposalValidationResult'></a> +## ProxyProposalValidationResult `type` + +##### Namespace + +WebReaper.Proxy.Concrete + +##### Summary + +The result of validating a proxy. + +##### Remarks + +Either [IsValid](#P-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-IsValid 'WebReaper.Proxy.Concrete.ProxyProposalValidationResult.IsValid') or [IsInvalid](#P-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-IsInvalid 'WebReaper.Proxy.Concrete.ProxyProposalValidationResult.IsInvalid') will be `true` when initialized. + +<a name='F-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-Default'></a> +### Default `constants` + +##### Summary + +A default result. + +<a name='P-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-Error'></a> +### Error `property` + +##### Summary + +The error, if any. + +<a name='P-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-IsDefault'></a> +### IsDefault `property` + +##### Summary + +Whether the result is the default result. + +<a name='P-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-IsInvalid'></a> +### IsInvalid `property` + +##### Summary + +Whether the result is invalid. + +<a name='P-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-IsValid'></a> +### IsValid `property` + +##### Summary + +Whether the result is valid. + +<a name='M-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-Invalid-System-Exception-'></a> +### Invalid() `method` + +##### Summary + +An invalid result, with an error. + +##### Parameters + +This method has no parameters. + +<a name='M-WebReaper-Proxy-Concrete-ProxyProposalValidationResult-Valid'></a> +### Valid() `method` + +##### Summary + +A valid result. + +##### Parameters + +This method has no parameters. 
+ +<a name='T-WebReaper-Proxy-Concrete-ProxyProposalValidatorService'></a> +## ProxyProposalValidatorService `type` + +##### Namespace + +WebReaper.Proxy.Concrete + +##### Summary + +Periodically validates proxies and supplies the most recently validated list of proxies. + +<a name='M-WebReaper-Proxy-Concrete-ProxyProposalValidatorService-#ctor-Microsoft-Extensions-Options-IOptions{WebReaper-Proxy-Concrete-ProxyProposalValidatorServiceOptions},Microsoft-Extensions-Logging-ILogger{WebReaper-Proxy-Concrete-ProxyProposalValidatorService},System-Collections-Generic-IEnumerable{WebReaper-Proxy-Abstract-IProxyProposalProvider},System-Collections-Generic-IEnumerable{WebReaper-Proxy-Abstract-IProxyProposalValidator}-'></a> +### #ctor() `constructor` + +##### Summary + +Periodically validates proxies and supplies the most recently validated list of proxies. + +##### Parameters + +This constructor has no parameters. + +<a name='M-WebReaper-Proxy-Concrete-ProxyProposalValidatorService-ExecuteAsync-System-Threading-CancellationToken-'></a> +### ExecuteAsync() `method` + +##### Summary + +*Inherit from parent.* + +##### Parameters + +This method has no parameters. + +<a name='M-WebReaper-Proxy-Concrete-ProxyProposalValidatorService-GetProxiesAsync-System-Threading-CancellationToken-'></a> +### GetProxiesAsync() `method` + +##### Summary + +*Inherit from parent.* + +##### Parameters + +This method has no parameters. + +<a name='T-WebReaper-Proxy-Concrete-ProxyProposalValidatorServiceOptions'></a> +## ProxyProposalValidatorServiceOptions `type` + +##### Namespace + +WebReaper.Proxy.Concrete + +##### Summary + +Options for [ProxyProposalValidatorService](#T-WebReaper-Proxy-Concrete-ProxyProposalValidatorService 'WebReaper.Proxy.Concrete.ProxyProposalValidatorService'). + +<a name='P-WebReaper-Proxy-Concrete-ProxyProposalValidatorServiceOptions-ValidationInterval'></a> +### ValidationInterval `property` + +##### Summary + +The interval at which to validate proxies.
+ <a name='T-WebReaper-Builders-ScraperEngineBuilder'></a> ## ScraperEngineBuilder `type` @@ -126,3 +442,51 @@ WebReaper.Builders ##### Summary Builds a web scraper engine responsible for creating and receiving crawling jobs and running a spider on them + +<a name='T-WebReaper-Proxy-Concrete-ValidatedProxyProvider'></a> +## ValidatedProxyProvider `type` + +##### Namespace + +WebReaper.Proxy.Concrete + +##### Summary + +Provides a random validated proxy. + +##### See Also + +- [WebReaper.Proxy.Concrete.ProxyProposalValidatorService](#T-WebReaper-Proxy-Concrete-ProxyProposalValidatorService 'WebReaper.Proxy.Concrete.ProxyProposalValidatorService') + +<a name='M-WebReaper-Proxy-Concrete-ValidatedProxyProvider-#ctor-WebReaper-Proxy-Abstract-IValidatedProxyListProvider-'></a> +### #ctor() `constructor` + +##### Summary + +Initializes a new instance of the [ValidatedProxyProvider](#T-WebReaper-Proxy-Concrete-ValidatedProxyProvider 'WebReaper.Proxy.Concrete.ValidatedProxyProvider') class. + +##### Parameters + +This constructor has no parameters. + +<a name='M-WebReaper-Proxy-Concrete-ValidatedProxyProvider-GetProxyAsync-System-Threading-CancellationToken-'></a> +### GetProxyAsync() `method` + +##### Summary + +*Inherit from parent.* + +##### Parameters + +This method has no parameters. + +<a name='M-WebReaper-Proxy-Concrete-ValidatedProxyProvider-GetProxyAsync'></a> +### GetProxyAsync() `method` + +##### Summary + +*Inherit from parent.* + +##### Parameters + +This method has no parameters. diff --git a/WebReaper/API.xml b/WebReaper/API.xml index e9536b2..3bd719a 100644 --- a/WebReaper/API.xml +++ b/WebReaper/API.xml @@ -51,5 +51,160 @@ Logger </summary> </member> + <member name="T:WebReaper.Proxy.Abstract.IProxyProposalProvider"> + <summary> + Supplies a list of unvalidated proxies. 
+ </summary> + </member> + <member name="M:WebReaper.Proxy.Abstract.IProxyProposalProvider.GetProxiesAsync(System.Threading.CancellationToken)"> + <summary> + Returns a list of potential proxies, which may or may not be valid. + </summary> + </member> + <member name="T:WebReaper.Proxy.Abstract.IProxyProposalValidator"> + <summary> + Validates a proposed proxy. + </summary> + </member> + <member name="M:WebReaper.Proxy.Abstract.IProxyProposalValidator.ValidateAsync(System.Net.WebProxy,System.Threading.CancellationToken)"> + <summary> + Validates a proposed proxy. + </summary> + <returns>A <see cref="T:WebReaper.Proxy.Concrete.ProxyProposalValidationResult"/> indicating whether the proxy is valid or invalid, or the validator does not apply to the result.</returns> + </member> + <member name="T:WebReaper.Proxy.Abstract.IProxyProvider"> + <summary> + Provides a validated proxy. + </summary> + </member> + <member name="M:WebReaper.Proxy.Abstract.IProxyProvider.GetProxyAsync"> + <summary> + Returns a validated proxy. + </summary> + </member> + <member name="T:WebReaper.Proxy.Abstract.IValidatedProxyListProvider"> + <summary> + Supplies a list of validated, ready to use proxies. + </summary> + </member> + <member name="M:WebReaper.Proxy.Abstract.IValidatedProxyListProvider.GetProxiesAsync(System.Threading.CancellationToken)"> + <summary> + Returns a list of validated proxies. + </summary> + </member> + <member name="T:WebReaper.Proxy.Concrete.PingTimeoutValidatorOptions"> + <summary> + Options for <see cref="T:WebReaper.Proxy.Concrete.PingTimeoutProxyProposalValidator"/>. + </summary> + </member> + <member name="P:WebReaper.Proxy.Concrete.PingTimeoutValidatorOptions.ProbeUrl"> + <summary> + The URL to visit to validate the proxy. + </summary> + </member> + <member name="P:WebReaper.Proxy.Concrete.PingTimeoutValidatorOptions.ProbeTimeout"> + <summary> + The maximum time to wait for a response from the probe URL. 
+ </summary> + </member> + <member name="T:WebReaper.Proxy.Concrete.PingTimeoutProxyProposalValidator"> + <summary> + Validates a proxy by requesting a URL and waiting for a response. + </summary> + </member> + <member name="M:WebReaper.Proxy.Concrete.PingTimeoutProxyProposalValidator.#ctor(Microsoft.Extensions.Options.IOptions{WebReaper.Proxy.Concrete.PingTimeoutValidatorOptions})"> + <summary> + Initializes a new instance of the <see cref="T:WebReaper.Proxy.Concrete.PingTimeoutProxyProposalValidator"/> class. + </summary> + </member> + <member name="M:WebReaper.Proxy.Concrete.PingTimeoutProxyProposalValidator.ValidateAsync(System.Net.WebProxy,System.Threading.CancellationToken)"> + <inheritdoc/> + </member> + <member name="T:WebReaper.Proxy.Concrete.ProxyProposalValidationResult"> + <summary> + The result of validating a proxy. + </summary> + <remarks> + Either <see cref="P:WebReaper.Proxy.Concrete.ProxyProposalValidationResult.IsValid"/> or <see cref="P:WebReaper.Proxy.Concrete.ProxyProposalValidationResult.IsInvalid"/> will be <c>true</c> when initialized. + </remarks> + </member> + <member name="F:WebReaper.Proxy.Concrete.ProxyProposalValidationResult.Default"> + <summary> + A default result. + </summary> + </member> + <member name="M:WebReaper.Proxy.Concrete.ProxyProposalValidationResult.Valid"> + <summary> + A valid result. + </summary> + </member> + <member name="M:WebReaper.Proxy.Concrete.ProxyProposalValidationResult.Invalid(System.Exception)"> + <summary> + An invalid result, with an error. + </summary> + </member> + <member name="P:WebReaper.Proxy.Concrete.ProxyProposalValidationResult.IsDefault"> + <summary> + Whether the result is the default result. + </summary> + </member> + <member name="P:WebReaper.Proxy.Concrete.ProxyProposalValidationResult.IsValid"> + <summary> + Whether the result is valid. + </summary> + </member> + <member name="P:WebReaper.Proxy.Concrete.ProxyProposalValidationResult.IsInvalid"> + <summary> + Whether the result is invalid. 
+ </summary> + </member> + <member name="P:WebReaper.Proxy.Concrete.ProxyProposalValidationResult.Error"> + <summary> + The error, if any. + </summary> + </member> + <member name="T:WebReaper.Proxy.Concrete.ProxyProposalValidatorServiceOptions"> + <summary> + Options for <see cref="T:WebReaper.Proxy.Concrete.ProxyProposalValidatorService"/>. + </summary> + </member> + <member name="P:WebReaper.Proxy.Concrete.ProxyProposalValidatorServiceOptions.ValidationInterval"> + <summary> + The interval at which to validate proxies. + </summary> + </member> + <member name="T:WebReaper.Proxy.Concrete.ProxyProposalValidatorService"> + <summary> + Periodically validates proxies and supplies the most recently validated list of proxies. + </summary> + </member> + <member name="M:WebReaper.Proxy.Concrete.ProxyProposalValidatorService.#ctor(Microsoft.Extensions.Options.IOptions{WebReaper.Proxy.Concrete.ProxyProposalValidatorServiceOptions},Microsoft.Extensions.Logging.ILogger{WebReaper.Proxy.Concrete.ProxyProposalValidatorService},System.Collections.Generic.IEnumerable{WebReaper.Proxy.Abstract.IProxyProposalProvider},System.Collections.Generic.IEnumerable{WebReaper.Proxy.Abstract.IProxyProposalValidator})"> + <summary> + Periodically validates proxies and supplies the most recently validated list of proxies. + </summary> + </member> + <member name="M:WebReaper.Proxy.Concrete.ProxyProposalValidatorService.GetProxiesAsync(System.Threading.CancellationToken)"> + <inheritdoc/> + </member> + <member name="M:WebReaper.Proxy.Concrete.ProxyProposalValidatorService.ExecuteAsync(System.Threading.CancellationToken)"> + <inheritdoc/> + </member> + <member name="T:WebReaper.Proxy.Concrete.ValidatedProxyProvider"> + <summary> + Provides a random validated proxy.
+ </summary> + <seealso cref="T:WebReaper.Proxy.Concrete.ProxyProposalValidatorService"/> + </member> + <member name="M:WebReaper.Proxy.Concrete.ValidatedProxyProvider.#ctor(WebReaper.Proxy.Abstract.IValidatedProxyListProvider)"> + <summary> + Initializes a new instance of the <see cref="T:WebReaper.Proxy.Concrete.ValidatedProxyProvider"/> class. + </summary> + </member> + <member name="M:WebReaper.Proxy.Concrete.ValidatedProxyProvider.GetProxyAsync(System.Threading.CancellationToken)"> + <inheritdoc/> + </member> + <member name="M:WebReaper.Proxy.Concrete.ValidatedProxyProvider.GetProxyAsync"> + <inheritdoc/> + </member> </members> </doc> diff --git a/WebReaper/Extensions/EnumerableExtensions.cs b/WebReaper/Extensions/EnumerableExtensions.cs new file mode 100644 index 0000000..fc54fbd --- /dev/null +++ b/WebReaper/Extensions/EnumerableExtensions.cs @@ -0,0 +1,66 @@ +using System; + +namespace WebReaper.Extensions; + +internal static class EnumerableExtensions +{ + public static IEnumerable<U> SelectTruthy<T, U>(this IEnumerable<T> enumerable, Func<T, U?> predicate) + where U : class + { + foreach (var item in enumerable) + { + if (predicate(item) is { } result) + { + yield return result; + } + } + } + + public static IEnumerable<T> SelectTruthy<T>(this IEnumerable<T?> enumerable) + { + foreach (var item in enumerable) + { + if (item is { } result) + { + yield return result; + } + } + } + + public static IEnumerable<U> SelectTruthy<T, U>(this IEnumerable<T> enumerable, Func<T, U?> predicate) where U : struct + { + foreach (var item in enumerable) + { + if (predicate(item) is { } result) + { + yield return result; + } + } + } + + public static IEnumerable<T> SelectTruthy<T>(this IEnumerable<T?> enumerable) where T : struct + { + foreach (var item in enumerable) + { + if (item is { } result) + { + yield return result; + } + } + } + + public static T ChooseRandom<T>(this IEnumerable<T> enumerable, Random? 
random = null) + { + random ??= Random.Shared; + if (enumerable.TryGetNonEnumeratedCount(out var count)) + { + var index = random.Next(count); + return enumerable.ElementAt(index); + } + else + { + var list = enumerable.ToList(); + return list[random.Next(list.Count)]; + } + } +} diff --git a/WebReaper/Proxy/Abstract/IProxyProposalProvider.cs b/WebReaper/Proxy/Abstract/IProxyProposalProvider.cs new file mode 100644 index 0000000..b1d7852 --- /dev/null +++ b/WebReaper/Proxy/Abstract/IProxyProposalProvider.cs @@ -0,0 +1,15 @@ +using System; +using System.Net; + +namespace WebReaper.Proxy.Abstract; + +/// <summary> +/// Supplies a list of unvalidated proxies. +/// </summary> +public interface IProxyProposalProvider +{ + /// <summary> + /// Returns a list of potential proxies, which may or may not be valid. + /// </summary> + Task<IEnumerable<WebProxy>> GetProxiesAsync(CancellationToken cancellationToken = default); +} diff --git a/WebReaper/Proxy/Abstract/IProxyProposalValidator.cs b/WebReaper/Proxy/Abstract/IProxyProposalValidator.cs new file mode 100644 index 0000000..dc4a63e --- /dev/null +++ b/WebReaper/Proxy/Abstract/IProxyProposalValidator.cs @@ -0,0 +1,17 @@ +using System; +using System.Net; +using WebReaper.Proxy.Concrete; + +namespace WebReaper.Proxy.Abstract; + +/// <summary> +/// Validates a proposed proxy. +/// </summary> +public interface IProxyProposalValidator +{ + /// <summary> + /// Validates a proposed proxy. 
+ /// </summary> + /// <returns>A <see cref="ProxyProposalValidationResult"/> indicating whether the proxy is valid or invalid, or the validator does not apply to the result.</returns> + Task<ProxyProposalValidationResult> ValidateAsync(WebProxy proxy, CancellationToken cancellationToken = default); +} diff --git a/WebReaper/Proxy/Abstract/IProxyProvider.cs b/WebReaper/Proxy/Abstract/IProxyProvider.cs index 62e8eab..173e23f 100644 --- a/WebReaper/Proxy/Abstract/IProxyProvider.cs +++ b/WebReaper/Proxy/Abstract/IProxyProvider.cs @@ -2,7 +2,13 @@ namespace WebReaper.Proxy.Abstract; +/// <summary> +/// Provides a validated proxy. +/// </summary> public interface IProxyProvider { + /// <summary> + /// Returns a validated proxy. + /// </summary> Task<WebProxy> GetProxyAsync(); } \ No newline at end of file diff --git a/WebReaper/Proxy/Abstract/IValidatedProxyListProvider.cs b/WebReaper/Proxy/Abstract/IValidatedProxyListProvider.cs new file mode 100644 index 0000000..e769008 --- /dev/null +++ b/WebReaper/Proxy/Abstract/IValidatedProxyListProvider.cs @@ -0,0 +1,15 @@ +using System; +using System.Net; + +namespace WebReaper.Proxy.Abstract; + +/// <summary> +/// Supplies a list of validated, ready to use proxies. +/// </summary> +public interface IValidatedProxyListProvider +{ + /// <summary> + /// Returns a list of validated proxies. + /// </summary> + Task<IEnumerable<WebProxy>> GetProxiesAsync(CancellationToken cancellationToken = default); +} diff --git a/WebReaper/Proxy/Concrete/PingTimeoutProxyProposalValidator.cs b/WebReaper/Proxy/Concrete/PingTimeoutProxyProposalValidator.cs new file mode 100644 index 0000000..3c32c90 --- /dev/null +++ b/WebReaper/Proxy/Concrete/PingTimeoutProxyProposalValidator.cs @@ -0,0 +1,74 @@ +using System; +using System.Net; +using Microsoft.Extensions.Options; +using WebReaper.Proxy.Abstract; + +namespace WebReaper.Proxy.Concrete; + +/// <summary> +/// Options for <see cref="PingTimeoutProxyProposalValidator"/>. 
+/// </summary> +public sealed class PingTimeoutValidatorOptions : IOptions<PingTimeoutValidatorOptions> +{ + /// <summary> + /// The URL to visit to validate the proxy. + /// </summary> + public Uri ProbeUrl { get; set; } = new("https://www.cloudflare.com/"); + /// <summary> + /// The maximum time to wait for a response from the probe URL. + /// </summary> + public TimeSpan ProbeTimeout { get; set; } = TimeSpan.FromSeconds(5); + PingTimeoutValidatorOptions IOptions<PingTimeoutValidatorOptions>.Value => this; +} + +/// <summary> +/// Validates a proxy by requesting a URL and waiting for a response. +/// </summary> +public sealed class PingTimeoutProxyProposalValidator : IProxyProposalValidator +{ + private readonly PingTimeoutValidatorOptions _options; + + /// <summary> + /// Initializes a new instance of the <see cref="PingTimeoutProxyProposalValidator"/> class. + /// </summary> + public PingTimeoutProxyProposalValidator(IOptions<PingTimeoutValidatorOptions> options) + { + _options = options.Value; + } + + /// <inheritdoc/> + public async Task<ProxyProposalValidationResult> ValidateAsync(WebProxy proxy, CancellationToken cancellationToken = default) + { + using HttpMessageHandler h = new HttpClientHandler + { + Proxy = proxy, + UseProxy = true + }; + using var client = new HttpClient(h, false) // handler is disposed by its own using declaration above + { + Timeout = _options.ProbeTimeout + }; + try + { + using var response = await client.GetAsync(_options.ProbeUrl, cancellationToken); + response.EnsureSuccessStatusCode(); + return ProxyProposalValidationResult.Valid(); + } + catch (AggregateException ex) + { + if (cancellationToken.IsCancellationRequested && ex.InnerExceptions.All(inner => inner is OperationCanceledException)) // no verdict only when the caller cancelled + { + return default; + } + return ProxyProposalValidationResult.Invalid(ex); + } + catch (OperationCanceledException) when (cancellationToken.IsCancellationRequested) // caller cancelled: no verdict; a ProbeTimeout expiry is also an OperationCanceledException but falls through and is reported as invalid + { + return default; + } + catch (Exception ex) + { + return ProxyProposalValidationResult.Invalid(ex); + } + } +} diff --git a/WebReaper/Proxy/Concrete/ProxyProposalValidationResult.cs
b/WebReaper/Proxy/Concrete/ProxyProposalValidationResult.cs new file mode 100644 index 0000000..68ab7cf --- /dev/null +++ b/WebReaper/Proxy/Concrete/ProxyProposalValidationResult.cs @@ -0,0 +1,58 @@ +using System; + +namespace WebReaper.Proxy.Concrete; + +/// <summary> +/// The result of validating a proxy. +/// </summary> +/// <remarks> +/// Either <see cref="IsValid"/> or <see cref="IsInvalid"/> will be <c>true</c> when initialized. +/// </remarks> +public readonly struct ProxyProposalValidationResult +{ + private readonly Kind _kind; + + ProxyProposalValidationResult(Kind kind, Exception? error = null) + { + _kind = kind; + Error = error; // keep the failure cause so callers can log and aggregate it + } + + /// <summary> + /// A default result. + /// </summary> + public static readonly ProxyProposalValidationResult Default = new ProxyProposalValidationResult(Kind.Default); + + /// <summary> + /// A valid result. + /// </summary> + public static ProxyProposalValidationResult Valid() => new ProxyProposalValidationResult(Kind.Valid); + /// <summary> + /// An invalid result, with an error. + /// </summary> + public static ProxyProposalValidationResult Invalid(Exception error) => new ProxyProposalValidationResult(Kind.Invalid, error); + + /// <summary> + /// Whether the result is the default result. + /// </summary> + public bool IsDefault => _kind == Kind.Default; + /// <summary> + /// Whether the result is valid. + /// </summary> + public bool IsValid => _kind == Kind.Valid; + /// <summary> + /// Whether the result is invalid. + /// </summary> + public bool IsInvalid => _kind == Kind.Invalid; + /// <summary> + /// The error, if any. + /// </summary> + public Exception?
Error { get; } + + enum Kind + { + Default, + Valid, + Invalid + } +} diff --git a/WebReaper/Proxy/Concrete/ProxyProsposalValidatorService.cs b/WebReaper/Proxy/Concrete/ProxyProsposalValidatorService.cs new file mode 100644 index 0000000..87a7ae1 --- /dev/null +++ b/WebReaper/Proxy/Concrete/ProxyProsposalValidatorService.cs @@ -0,0 +1,107 @@ +using System; +using System.Net; +using Microsoft.Extensions.Hosting; +using Microsoft.Extensions.Logging; +using Microsoft.Extensions.Options; +using WebReaper.Extensions; +using WebReaper.Proxy.Abstract; + +namespace WebReaper.Proxy.Concrete; + +/// <summary> +/// Options for <see cref="ProxyProposalValidatorService"/>. +/// </summary> +public sealed class ProxyProposalValidatorServiceOptions : IOptions<ProxyProposalValidatorServiceOptions> +{ + /// <summary> + /// The interval at which to validate proxies. + /// </summary> + public TimeSpan ValidationInterval { get; set; } = TimeSpan.FromMinutes(2); + ProxyProposalValidatorServiceOptions IOptions<ProxyProposalValidatorServiceOptions>.Value => this; +} + +/// <summary> +/// Periodically validates proxies and supplies the most recently validated list of proxies. +/// </summary> +public sealed class ProxyProposalValidatorService : BackgroundService, IValidatedProxyListProvider +{ + private readonly ProxyProposalValidatorServiceOptions _options; + private readonly ILogger<ProxyProposalValidatorService> _logger; + private readonly IEnumerable<IProxyProposalProvider> _proxySuppliers; + private readonly IEnumerable<IProxyProposalValidator> _proxyValidators; + private TaskCompletionSource<IEnumerable<WebProxy>> _proxiesCompletion = new(TaskCreationOptions.RunContinuationsAsynchronously); // awaiting callers must not resume inline on the validation loop + + /// <summary> + /// Periodically validates proxies and supplies the most recently validated list of proxies.
+ /// </summary> + public ProxyProposalValidatorService( + IOptions<ProxyProposalValidatorServiceOptions> options, + ILogger<ProxyProposalValidatorService> logger, + IEnumerable<IProxyProposalProvider> proxySuppliers, + IEnumerable<IProxyProposalValidator> proxyValidators + ) + { + _options = options.Value; + _logger = logger; + _proxySuppliers = proxySuppliers; + _proxyValidators = proxyValidators; + } + + /// <inheritdoc/> + public Task<IEnumerable<WebProxy>> GetProxiesAsync(CancellationToken cancellationToken = default) + { + return _proxiesCompletion.Task.WaitAsync(cancellationToken); + } + + /// <inheritdoc/> + protected override async Task ExecuteAsync(CancellationToken stoppingToken) + { + while (!stoppingToken.IsCancellationRequested) + { + var proxies = await Task.WhenAll(_proxySuppliers.Select(supplier => supplier.GetProxiesAsync(stoppingToken))); + var validatedProxies = await Task.WhenAll(proxies + .SelectMany(proxy => proxy) + .Select(proxy => FilterAvailableProxy(proxy, stoppingToken)) + ); + // update the completion source + UpdateValidatedProxies(validatedProxies.SelectTruthy()); + + await Task.Delay(_options.ValidationInterval, stoppingToken); + stoppingToken.ThrowIfCancellationRequested(); + } + } + + private void UpdateValidatedProxies(IEnumerable<WebProxy> validatedProxies) + { + // Try to set the uncompleted task + if (!_proxiesCompletion.TrySetResult(validatedProxies)) + { + // Replace the completed task with a new completed task + TaskCompletionSource<IEnumerable<WebProxy>> completion = new(); + completion.SetResult(validatedProxies); + _proxiesCompletion = completion; + } + } + + private async Task<WebProxy?> FilterAvailableProxy(WebProxy proxy, CancellationToken stoppingToken) + { + var result = await ValidateProxy(proxy, stoppingToken); + if (result.IsInvalid) + { + _logger.LogWarning(result.Error, "Proxy {proxy} is invalid", proxy.Address); + return null; + } + return proxy; + } + + private async Task<ProxyProposalValidationResult> 
ValidateProxy(WebProxy webProxy, CancellationToken cancellationToken = default) + { + var results = await Task.WhenAll(_proxyValidators.Select(async validator => await validator.ValidateAsync(webProxy, cancellationToken))); + if (results.All(x => !x.IsInvalid)) + { + return ProxyProposalValidationResult.Valid(); + } + AggregateException error = new("No valid proxy found", results.SelectTruthy(x => x.Error)); + return ProxyProposalValidationResult.Invalid(error); + } +} diff --git a/WebReaper/Proxy/Concrete/ValidatedProxyProvider.cs b/WebReaper/Proxy/Concrete/ValidatedProxyProvider.cs new file mode 100644 index 0000000..bc8ed5d --- /dev/null +++ b/WebReaper/Proxy/Concrete/ValidatedProxyProvider.cs @@ -0,0 +1,36 @@ +using System; +using System.Net; +using WebReaper.Extensions; +using WebReaper.Proxy.Abstract; + +namespace WebReaper.Proxy.Concrete; + +/// <summary> +/// Provides a random validated proxy. +/// </summary> +/// <seealso cref="ProxyProposalValidatorService"/> +public sealed class ValidatedProxyProvider : IProxyProvider +{ + private readonly IValidatedProxyListProvider _validatedProxySource; + + /// <summary> + /// Initializes a new instance of the <see cref="ValidatedProxyProvider"/> class. 
+ /// </summary> + public ValidatedProxyProvider(IValidatedProxyListProvider validatedProxySource) + { + _validatedProxySource = validatedProxySource; + } + + /// <inheritdoc/> + public async Task<WebProxy> GetProxyAsync(CancellationToken cancellationToken = default) + { + var proxies = await _validatedProxySource.GetProxiesAsync(cancellationToken); + return proxies.ChooseRandom(); + } + + /// <inheritdoc/> + public Task<WebProxy> GetProxyAsync() + { + return GetProxyAsync(default); + } +} diff --git a/WebReaper/WebReaper.csproj b/WebReaper/WebReaper.csproj index 349921d..f5fb4c2 100644 --- a/WebReaper/WebReaper.csproj +++ b/WebReaper/WebReaper.csproj @@ -42,6 +42,8 @@ <ItemGroup> <PackageReference Include="AngleSharp" Version="1.0.4" /> <PackageReference Include="Microsoft.Azure.Cosmos" Version="3.35.2" /> + <PackageReference Include="Microsoft.Extensions.Configuration.Abstractions" Version="7.0.0" /> + <PackageReference Include="Microsoft.Extensions.Hosting" Version="7.0.1" /> <PackageReference Include="Microsoft.Extensions.Http" Version="7.0.0" /> <PackageReference Include="MongoDB.Driver" Version="2.20.0" /> <PackageReference Include="Newtonsoft.Json" Version="13.0.3" />