Skip to content

Commit 0c71327

Browse files
committed
Parse Crawl-delay as TimeSpan
1 parent: 76f2184 · commit: 0c71327

4 files changed

Lines changed: 19 additions & 18 deletions

File tree

src/Robots.Txt.Parser/Http/RobotWebClient.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@ public async Task<IRobotsTxt> LoadRobotsTxtAsync(Uri url, CancellationToken canc
4141
If a server status code indicates that the robots.txt file is unavailable to the crawler,
4242
then the crawler MAY access any resources on the server.
4343
*/
44-
return new RobotsTxt(this, baseUrl, new Dictionary<ProductToken, HashSet<UrlRule>>(), new Dictionary<ProductToken, int>(), null, new HashSet<Uri>());
44+
return new RobotsTxt(this, baseUrl, new Dictionary<ProductToken, HashSet<UrlRule>>(), new Dictionary<ProductToken, TimeSpan>(), null, new HashSet<Uri>());
4545
}
4646

4747
if (statusCodeNumber >= 500)
@@ -55,7 +55,7 @@ the 500-599 range.
5555
{
5656
{ ProductToken.Wildcard, new HashSet<UrlRule> { new (RuleType.Disallow, "/") } }
5757
};
58-
return new RobotsTxt(this, baseUrl, userAgentRules, new Dictionary<ProductToken, int>(), null, new HashSet<Uri>());
58+
return new RobotsTxt(this, baseUrl, userAgentRules, new Dictionary<ProductToken, TimeSpan>(), null, []);
5959
}
6060

6161
await using var stream = await response.Content.ReadAsStreamAsync(cancellationToken);

src/Robots.Txt.Parser/RobotsTxt.cs

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -23,9 +23,9 @@ public interface IRobotsTxt
2323
/// Retrieves the crawl delay specified for a User-Agent
2424
/// </summary>
2525
/// <param name="userAgent">User-Agent header to retrieve rules for</param>
26-
/// <param name="crawlDelay">The crawl delay in seconds</param>
26+
/// <param name="crawlDelay">The crawl delay duration if the directive was found; otherwise, <see cref="TimeSpan.Zero"/></param>
2727
/// <returns>True if a crawl delay directive exists; otherwise false</returns>
28-
bool TryGetCrawlDelay(ProductToken userAgent, out int crawlDelay);
28+
bool TryGetCrawlDelay(ProductToken userAgent, out TimeSpan crawlDelay);
2929

3030
/// <summary>
3131
/// Retrieves the website host
@@ -52,15 +52,15 @@ public class RobotsTxt : IRobotsTxt
5252
private readonly Uri _baseUrl;
5353

5454
private readonly IReadOnlyDictionary<ProductToken, HashSet<UrlRule>> _userAgentRules;
55-
private readonly IReadOnlyDictionary<ProductToken, int> _userAgentCrawlDirectives;
55+
private readonly IReadOnlyDictionary<ProductToken, TimeSpan> _userAgentCrawlDirectives;
5656
private readonly HashSet<ProductToken> _userAgents;
5757
private readonly string? _host;
5858
private readonly HashSet<Uri> _sitemapUrls;
5959

6060
internal RobotsTxt(IRobotClient client,
6161
Uri baseUrl,
6262
IReadOnlyDictionary<ProductToken, HashSet<UrlRule>> userAgentRules,
63-
IReadOnlyDictionary<ProductToken, int> userAgentCrawlDirectives,
63+
IReadOnlyDictionary<ProductToken, TimeSpan> userAgentCrawlDirectives,
6464
string? host,
6565
HashSet<Uri> sitemapUrls)
6666
{
@@ -87,7 +87,7 @@ public async IAsyncEnumerable<UrlSetItem> LoadSitemapAsync(DateTime? modifiedSin
8787
}
8888

8989
/// <inheritdoc />
90-
public bool TryGetCrawlDelay(ProductToken userAgent, out int crawlDelay)
90+
public bool TryGetCrawlDelay(ProductToken userAgent, out TimeSpan crawlDelay)
9191
{
9292
var userAgentMatch = _userAgentCrawlDirectives.TryGetValue(userAgent, out crawlDelay);
9393
if (!userAgentMatch)

src/Robots.Txt.Parser/RobotsTxtParser.cs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -49,7 +49,7 @@ Crawlers MUST use case-insensitive matching to find the group that matches the p
4949
*/
5050
var currentUserAgents = new HashSet<ProductToken>();
5151
var userAgentRules = new Dictionary<ProductToken, HashSet<UrlRule>>();
52-
var userAgentCrawlDirectives = new Dictionary<ProductToken, int>();
52+
var userAgentCrawlDirectives = new Dictionary<ProductToken, TimeSpan>();
5353

5454
try
5555
{
@@ -107,7 +107,7 @@ The file MUST be UTF-8 encoded
107107
var crawlDelayValue = GetValueOfDirective(line, CrawlDelayDirective);
108108
if (int.TryParse(crawlDelayValue, out var parsedCrawlDelay))
109109
{
110-
foreach (var userAgent in currentUserAgents) userAgentCrawlDirectives.TryAdd(userAgent, parsedCrawlDelay);
110+
foreach (var userAgent in currentUserAgents) userAgentCrawlDirectives.TryAdd(userAgent, TimeSpan.FromSeconds(parsedCrawlDelay));
111111
}
112112
}
113113
}

tests/Robots.Txt.Parser.Tests.Unit/RobotTxtCrawlDelayTests.cs

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
using System;
12
using System.IO;
23
using System.Text;
34
using System.Threading.Tasks;
@@ -24,7 +25,7 @@ public async Task NoMatchedRules_CrawlDelayNotSpecified_DefaultCrawlDelay()
2425
// Assert
2526
robotsTxt.Should().NotBe(null);
2627
robotsTxt.TryGetCrawlDelay(ProductToken.Parse("SomeBot"), out var crawlDelay).Should().Be(false);
27-
crawlDelay.Should().Be(0);
28+
crawlDelay.Should().Be(TimeSpan.Zero);
2829
}
2930

3031
[Fact]
@@ -43,7 +44,7 @@ public async Task WildcardUserAgent_CrawlDelayNotSpecified_DefaultCrawlDelay()
4344
// Assert
4445
robotsTxt.Should().NotBe(null);
4546
robotsTxt.TryGetCrawlDelay(ProductToken.Parse("SomeBot"), out var crawlDelay).Should().Be(false);
46-
crawlDelay.Should().Be(0);
47+
crawlDelay.Should().Be(TimeSpan.Zero);
4748
}
4849

4950
[Fact]
@@ -62,7 +63,7 @@ public async Task WildcardUserAgent_CrawlDelaySpecified_ReturnCrawlDelay()
6263
// Assert
6364
robotsTxt.Should().NotBe(null);
6465
robotsTxt.TryGetCrawlDelay(ProductToken.Parse("SomeBot"), out var crawlDelay).Should().Be(true);
65-
crawlDelay.Should().Be(10);
66+
crawlDelay.Should().Be(TimeSpan.FromSeconds(10));
6667
}
6768

6869
[Fact]
@@ -81,7 +82,7 @@ public async Task WildcardUserAgent_NonStandardCaseCrawlDelaySpecified_ReturnCra
8182
// Assert
8283
robotsTxt.Should().NotBe(null);
8384
robotsTxt.TryGetCrawlDelay(ProductToken.Parse("SomeBot"), out var crawlDelay).Should().Be(true);
84-
crawlDelay.Should().Be(10);
85+
crawlDelay.Should().Be(TimeSpan.FromSeconds(10));
8586
}
8687

8788
[Fact]
@@ -103,7 +104,7 @@ public async Task MatchedUserAgent_NoCrawlDelaySpecified_DefaultCrawlDelay()
103104
// Assert
104105
robotsTxt.Should().NotBe(null);
105106
robotsTxt.TryGetCrawlDelay(ProductToken.Parse("SomeBot"), out var crawlDelay).Should().Be(false);
106-
crawlDelay.Should().Be(0);
107+
crawlDelay.Should().Be(TimeSpan.Zero);
107108
}
108109

109110
[Fact]
@@ -125,7 +126,7 @@ public async Task MatchedUserAgent_CrawlDelaySpecified_ReturnCrawlDelay()
125126
// Assert
126127
robotsTxt.Should().NotBe(null);
127128
robotsTxt.TryGetCrawlDelay(ProductToken.Parse("SomeBot"), out var crawlDelay).Should().Be(true);
128-
crawlDelay.Should().Be(5);
129+
crawlDelay.Should().Be(TimeSpan.FromSeconds(5));
129130
}
130131

131132
[Fact]
@@ -148,7 +149,7 @@ public async Task MatchedMultiLineUserAgent_NoCrawlDelaySpecified_DefaultCrawlDe
148149
// Assert
149150
robotsTxt.Should().NotBe(null);
150151
robotsTxt.TryGetCrawlDelay(ProductToken.Parse("SomeBot"), out var crawlDelay).Should().Be(false);
151-
crawlDelay.Should().Be(0);
152+
crawlDelay.Should().Be(TimeSpan.Zero);
152153
}
153154

154155
[Fact]
@@ -171,7 +172,7 @@ public async Task MatchedMultiLineUserAgent_CrawlDelaySpecified_ReturnCrawlDelay
171172
// Assert
172173
robotsTxt.Should().NotBe(null);
173174
robotsTxt.TryGetCrawlDelay(ProductToken.Parse("SomeBot"), out var crawlDelay).Should().Be(true);
174-
crawlDelay.Should().Be(5);
175+
crawlDelay.Should().Be(TimeSpan.FromSeconds(5));
175176
}
176177

177178
[Fact]
@@ -196,6 +197,6 @@ public async Task MatchedDuplicateGroupUserAgent_CrawlDelaySpecified_ReturnFirst
196197
// Assert
197198
robotsTxt.Should().NotBe(null);
198199
robotsTxt.TryGetCrawlDelay(ProductToken.Parse("SomeBot"), out var crawlDelay).Should().Be(true);
199-
crawlDelay.Should().Be(15);
200+
crawlDelay.Should().Be(TimeSpan.FromSeconds(15));
200201
}
201202
}

0 commit comments

Comments (0)