Skip to content

Commit a46cb6d

Browse files
committed
feat: ability to filter sitemaps by location
1 parent 0c71327 commit a46cb6d

4 files changed

Lines changed: 133 additions & 16 deletions

File tree

src/Robots.Txt.Parser/Http/RobotWebClient.cs

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -62,7 +62,7 @@ the 500-599 range.
6262
return await new RobotsTxtParser(this, baseUrl).ReadFromStreamAsync(stream, cancellationToken);
6363
}
6464

65-
async IAsyncEnumerable<UrlSetItem> IRobotClient.LoadSitemapsAsync(Uri uri, DateTime? modifiedSince, [EnumeratorCancellation] CancellationToken cancellationToken)
65+
async IAsyncEnumerable<UrlSetItem> IRobotClient.LoadSitemapsAsync(Uri uri, DateTime? modifiedSince, Func<Uri, bool>? sitemapLocationFilter, [EnumeratorCancellation] CancellationToken cancellationToken)
6666
{
6767
var request = new HttpRequestMessage(HttpMethod.Get, uri);
6868
request.Headers.Add("Accept", "application/xml,text/plain,text/xml,*/*");
@@ -86,7 +86,8 @@ async IAsyncEnumerable<UrlSetItem> IRobotClient.LoadSitemapsAsync(Uri uri, DateT
8686
{
8787
await foreach (var location in index.SitemapUris)
8888
{
89-
await foreach (var item in (this as IRobotClient).LoadSitemapsAsync(location, modifiedSince, cancellationToken))
89+
if (sitemapLocationFilter is not null && !sitemapLocationFilter.Invoke(location)) continue;
90+
await foreach (var item in (this as IRobotClient).LoadSitemapsAsync(location, modifiedSince, sitemapLocationFilter, cancellationToken))
9091
{
9192
yield return item;
9293
}

src/Robots.Txt.Parser/IRobotClient.cs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,5 +17,5 @@ public interface IRobotClient
1717
/// <exception cref="HttpRequestException">Thrown if a status code that cannot be handled is returned.</exception>
1818
Task<IRobotsTxt> LoadRobotsTxtAsync(Uri url, CancellationToken cancellationToken = default);
1919

20-
protected internal IAsyncEnumerable<UrlSetItem> LoadSitemapsAsync(Uri uri, DateTime? modifiedSince = null, CancellationToken cancellationToken = default);
20+
protected internal IAsyncEnumerable<UrlSetItem> LoadSitemapsAsync(Uri uri, DateTime? modifiedSince = null, Func<Uri, bool>? sitemapLocationFilter = null, CancellationToken cancellationToken = default);
2121
}

src/Robots.Txt.Parser/RobotsTxt.cs

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -14,10 +14,11 @@ public interface IRobotsTxt
1414
/// <summary>
1515
/// Retrieves the sitemap
1616
/// </summary>
17-
/// <param name="modifiedSince">Filter to retrieve site maps modified after this date</param>
17+
/// <param name="modifiedSince">Filter to retrieve sitemaps modified after this date</param>
18+
/// <param name="sitemapLocationFilter">Predicate that decides whether to retrieve a sitemap based on its location</param>
1819
/// <param name="cancellationToken">Cancellation token</param>
1920
/// <returns>A sitemap, or null if no sitemap is found</returns>
20-
IAsyncEnumerable<UrlSetItem> LoadSitemapAsync(DateTime? modifiedSince = default, CancellationToken cancellationToken = default);
21+
IAsyncEnumerable<UrlSetItem> LoadSitemapAsync(DateTime? modifiedSince = default, Func<Uri, bool>? sitemapLocationFilter = null, CancellationToken cancellationToken = default);
2122

2223
/// <summary>
2324
/// Retrieves the crawl delay specified for a User-Agent
@@ -74,12 +75,13 @@ internal RobotsTxt(IRobotClient client,
7475
}
7576

7677
/// <inheritdoc />
77-
public async IAsyncEnumerable<UrlSetItem> LoadSitemapAsync(DateTime? modifiedSince = default, [EnumeratorCancellation] CancellationToken cancellationToken = default)
78+
public async IAsyncEnumerable<UrlSetItem> LoadSitemapAsync(DateTime? modifiedSince = default, Func<Uri, bool>? sitemapLocationFilter = null, [EnumeratorCancellation] CancellationToken cancellationToken = default)
7879
{
7980
var urls = _sitemapUrls.Count != 0 ? _sitemapUrls.AsEnumerable() : [new Uri(_baseUrl, "/sitemap.xml")];
8081
foreach (var url in urls)
8182
{
82-
await foreach (var item in _client.LoadSitemapsAsync(url, modifiedSince, cancellationToken))
83+
if (sitemapLocationFilter is not null && !sitemapLocationFilter.Invoke(url)) continue;
84+
await foreach (var item in _client.LoadSitemapsAsync(url, modifiedSince, sitemapLocationFilter, cancellationToken))
8385
{
8486
yield return item;
8587
}

tests/Robots.Txt.Parser.Tests.Unit/RobotTxtSitemapTests.cs

Lines changed: 123 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ public async Task LoadSitemapAsync_SitemapDirectiveExists_LoadSitemapDirective()
2424
";
2525
await using var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));
2626

27-
_robotsClientMock.Setup(callTo => callTo.LoadSitemapsAsync(It.IsAny<Uri>(), It.IsAny<DateTime?>(), It.IsAny<CancellationToken>()))
27+
_robotsClientMock.Setup(callTo => callTo.LoadSitemapsAsync(It.IsAny<Uri>(), It.IsAny<DateTime?>(), It.IsAny<Func<Uri, bool>>(), It.IsAny<CancellationToken>()))
2828
.Returns(Enumerable.Empty<UrlSetItem>().ToAsyncEnumerable());
2929

3030
var robotsTxt = await _parser.ReadFromStreamAsync(stream);
@@ -37,6 +37,7 @@ public async Task LoadSitemapAsync_SitemapDirectiveExists_LoadSitemapDirective()
3737
_robotsClientMock.Verify(callTo => callTo.LoadSitemapsAsync(
3838
new Uri("https://www.github.com/sitemap.xml"),
3939
null,
40+
null,
4041
default), Times.Once);
4142
}
4243

@@ -53,7 +54,7 @@ public async Task LoadSitemapAsync_MultipleSitemapDirectivesTopOfFile_LoadMultip
5354
";
5455
await using var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));
5556

56-
_robotsClientMock.Setup(callTo => callTo.LoadSitemapsAsync(It.IsAny<Uri>(), It.IsAny<DateTime?>(), It.IsAny<CancellationToken>()))
57+
_robotsClientMock.Setup(callTo => callTo.LoadSitemapsAsync(It.IsAny<Uri>(), It.IsAny<DateTime?>(), It.IsAny<Func<Uri, bool>>(), It.IsAny<CancellationToken>()))
5758
.Returns(Enumerable.Empty<UrlSetItem>().ToAsyncEnumerable());
5859

5960
var robotsTxt = await _parser.ReadFromStreamAsync(stream);
@@ -66,10 +67,12 @@ public async Task LoadSitemapAsync_MultipleSitemapDirectivesTopOfFile_LoadMultip
6667
_robotsClientMock.Verify(callTo => callTo.LoadSitemapsAsync(
6768
new Uri("https://www.github.com/sitemap.xml"),
6869
null,
70+
null,
6971
default), Times.Once);
7072
_robotsClientMock.Verify(callTo => callTo.LoadSitemapsAsync(
7173
new Uri("https://www.github.com/sitemap-2.xml"),
7274
null,
75+
null,
7376
default), Times.Once);
7477
}
7578

@@ -85,7 +88,7 @@ public async Task LoadSitemapAsync_MultipleSitemapDirectivesUnderUserAgent_LoadM
8588
Sitemap: https://www.github.com/sitemap-2.xml";
8689
await using var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));
8790

88-
_robotsClientMock.Setup(callTo => callTo.LoadSitemapsAsync(It.IsAny<Uri>(), It.IsAny<DateTime?>(), It.IsAny<CancellationToken>()))
91+
_robotsClientMock.Setup(callTo => callTo.LoadSitemapsAsync(It.IsAny<Uri>(), It.IsAny<DateTime?>(), It.IsAny<Func<Uri, bool>>(), It.IsAny<CancellationToken>()))
8992
.Returns(Enumerable.Empty<UrlSetItem>().ToAsyncEnumerable());
9093

9194
var robotsTxt = await _parser.ReadFromStreamAsync(stream);
@@ -98,10 +101,12 @@ public async Task LoadSitemapAsync_MultipleSitemapDirectivesUnderUserAgent_LoadM
98101
_robotsClientMock.Verify(callTo => callTo.LoadSitemapsAsync(
99102
new Uri("https://www.github.com/sitemap.xml"),
100103
null,
104+
null,
101105
default), Times.Once);
102106
_robotsClientMock.Verify(callTo => callTo.LoadSitemapsAsync(
103107
new Uri("https://www.github.com/sitemap-2.xml"),
104108
null,
109+
null,
105110
default), Times.Once);
106111
}
107112

@@ -118,7 +123,7 @@ public async Task LoadSitemapAsync_MultipleSitemapDirectives_RetrieveOneIfDuplic
118123
";
119124
await using var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));
120125

121-
_robotsClientMock.Setup(callTo => callTo.LoadSitemapsAsync(It.IsAny<Uri>(), It.IsAny<DateTime?>(), It.IsAny<CancellationToken>()))
126+
_robotsClientMock.Setup(callTo => callTo.LoadSitemapsAsync(It.IsAny<Uri>(), It.IsAny<DateTime?>(), It.IsAny<Func<Uri, bool>>(), It.IsAny<CancellationToken>()))
122127
.Returns(Enumerable.Empty<UrlSetItem>().ToAsyncEnumerable());
123128

124129
var robotsTxt = await _parser.ReadFromStreamAsync(stream);
@@ -131,6 +136,50 @@ public async Task LoadSitemapAsync_MultipleSitemapDirectives_RetrieveOneIfDuplic
131136
_robotsClientMock.Verify(callTo => callTo.LoadSitemapsAsync(
132137
new Uri("https://www.github.com/sitemap.xml"),
133138
null,
139+
null,
140+
default), Times.Once);
141+
}
142+
143+
[Fact]
144+
public async Task LoadSitemapAsync_MultipleSitemapDirectives_OnlyLoadDirectivesMatchingFilter()
145+
{
146+
// Arrange
147+
var file =
148+
@"Sitemap: https://www.github.com/sitemap-products.xml
149+
Sitemap: https://www.github.com/sitemap-categories.xml
150+
Sitemap: https://www.github.com/sitemap-brands.xml
151+
152+
User-agent: *
153+
Disallow: /
154+
";
155+
await using var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));
156+
157+
_robotsClientMock.Setup(callTo => callTo.LoadSitemapsAsync(It.IsAny<Uri>(), It.IsAny<DateTime?>(), It.IsAny<Func<Uri, bool>>(), It.IsAny<CancellationToken>()))
158+
.Returns(Enumerable.Empty<UrlSetItem>().ToAsyncEnumerable());
159+
160+
Func<Uri, bool> sitemapLocationFilter = location => location.AbsolutePath.Contains("brands");
161+
162+
var robotsTxt = await _parser.ReadFromStreamAsync(stream);
163+
164+
// Act
165+
await robotsTxt.LoadSitemapAsync(sitemapLocationFilter: sitemapLocationFilter).ToListAsync();
166+
167+
// Assert
168+
robotsTxt.Should().NotBe(null);
169+
_robotsClientMock.Verify(callTo => callTo.LoadSitemapsAsync(
170+
new Uri("https://www.github.com/sitemap-products.xml"),
171+
It.IsAny<DateTime?>(),
172+
It.IsAny<Func<Uri, bool>>(),
173+
It.IsAny<CancellationToken>()), Times.Never);
174+
_robotsClientMock.Verify(callTo => callTo.LoadSitemapsAsync(
175+
new Uri("https://www.github.com/sitemap-categories.xml"),
176+
It.IsAny<DateTime?>(),
177+
It.IsAny<Func<Uri, bool>>(),
178+
It.IsAny<CancellationToken>()), Times.Never);
179+
_robotsClientMock.Verify(callTo => callTo.LoadSitemapsAsync(
180+
new Uri("https://www.github.com/sitemap-brands.xml"),
181+
null,
182+
sitemapLocationFilter,
134183
default), Times.Once);
135184
}
136185

@@ -146,7 +195,7 @@ public async Task LoadSitemapAsync_SitemapDirectiveExists_PassModifiedDate()
146195
";
147196
await using var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));
148197

149-
_robotsClientMock.Setup(callTo => callTo.LoadSitemapsAsync(It.IsAny<Uri>(), It.IsAny<DateTime?>(), It.IsAny<CancellationToken>()))
198+
_robotsClientMock.Setup(callTo => callTo.LoadSitemapsAsync(It.IsAny<Uri>(), It.IsAny<DateTime?>(), It.IsAny<Func<Uri, bool>>(), It.IsAny<CancellationToken>()))
150199
.Returns(Enumerable.Empty<UrlSetItem>().ToAsyncEnumerable());
151200

152201
var modifiedDate = new DateTime(2023, 01, 01);
@@ -161,6 +210,38 @@ public async Task LoadSitemapAsync_SitemapDirectiveExists_PassModifiedDate()
161210
_robotsClientMock.Verify(callTo => callTo.LoadSitemapsAsync(
162211
It.IsAny<Uri>(),
163212
modifiedDate,
213+
null,
214+
default), Times.Once);
215+
}
216+
217+
[Fact]
218+
public async Task LoadSitemapAsync_SitemapDirectiveExists_PassSitemapLocationFilter()
219+
{
220+
// Arrange
221+
var file =
222+
@"Sitemap: https://www.github.com/sitemap.xml
223+
224+
User-agent: *
225+
Disallow: /
226+
";
227+
await using var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));
228+
229+
_robotsClientMock.Setup(callTo => callTo.LoadSitemapsAsync(It.IsAny<Uri>(), It.IsAny<DateTime?>(), It.IsAny<Func<Uri, bool>>(), It.IsAny<CancellationToken>()))
230+
.Returns(Enumerable.Empty<UrlSetItem>().ToAsyncEnumerable());
231+
232+
Func<Uri, bool> sitemapLocationFilter = location => location.AbsolutePath == "/sitemap.xml" || location.AbsolutePath.Contains("product");
233+
234+
var robotsTxt = await _parser.ReadFromStreamAsync(stream);
235+
236+
// Act
237+
await robotsTxt.LoadSitemapAsync(sitemapLocationFilter: sitemapLocationFilter).ToListAsync();
238+
239+
// Assert
240+
robotsTxt.Should().NotBe(null);
241+
_robotsClientMock.Verify(callTo => callTo.LoadSitemapsAsync(
242+
It.IsAny<Uri>(),
243+
null,
244+
sitemapLocationFilter,
164245
default), Times.Once);
165246
}
166247

@@ -176,7 +257,7 @@ public async Task LoadSitemapAsync_SitemapDirectiveExists_PassCancellationToken(
176257
";
177258
await using var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));
178259

179-
_robotsClientMock.Setup(callTo => callTo.LoadSitemapsAsync(It.IsAny<Uri>(), It.IsAny<DateTime?>(), It.IsAny<CancellationToken>()))
260+
_robotsClientMock.Setup(callTo => callTo.LoadSitemapsAsync(It.IsAny<Uri>(), It.IsAny<DateTime?>(), It.IsAny<Func<Uri, bool>>(), It.IsAny<CancellationToken>()))
180261
.Returns(Enumerable.Empty<UrlSetItem>().ToAsyncEnumerable());
181262

182263
using var cancellationTokenSource = new CancellationTokenSource();
@@ -192,6 +273,7 @@ public async Task LoadSitemapAsync_SitemapDirectiveExists_PassCancellationToken(
192273
_robotsClientMock.Verify(callTo => callTo.LoadSitemapsAsync(
193274
It.IsAny<Uri>(),
194275
null,
276+
null,
195277
cancellationToken), Times.Once);
196278
}
197279

@@ -205,7 +287,7 @@ public async Task LoadSitemapAsync_NoSitemapDirective_TryLoadDefaultSitemapIfNon
205287
";
206288
await using var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));
207289

208-
_robotsClientMock.Setup(callTo => callTo.LoadSitemapsAsync(It.IsAny<Uri>(), It.IsAny<DateTime?>(), It.IsAny<CancellationToken>()))
290+
_robotsClientMock.Setup(callTo => callTo.LoadSitemapsAsync(It.IsAny<Uri>(), It.IsAny<DateTime?>(), It.IsAny<Func<Uri, bool>>(), It.IsAny<CancellationToken>()))
209291
.Returns(Enumerable.Empty<UrlSetItem>().ToAsyncEnumerable());
210292

211293
var robotsTxt = await _parser.ReadFromStreamAsync(stream);
@@ -218,6 +300,7 @@ public async Task LoadSitemapAsync_NoSitemapDirective_TryLoadDefaultSitemapIfNon
218300
_robotsClientMock.Verify(callTo => callTo.LoadSitemapsAsync(
219301
new Uri("https://www.github.com/sitemap.xml"),
220302
null,
303+
null,
221304
default), Times.Once);
222305
}
223306

@@ -231,7 +314,7 @@ public async Task LoadSitemapAsync_NoSitemapDirective_PassModifiedDate()
231314
";
232315
await using var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));
233316

234-
_robotsClientMock.Setup(callTo => callTo.LoadSitemapsAsync(It.IsAny<Uri>(), It.IsAny<DateTime?>(), It.IsAny<CancellationToken>()))
317+
_robotsClientMock.Setup(callTo => callTo.LoadSitemapsAsync(It.IsAny<Uri>(), It.IsAny<DateTime?>(), It.IsAny<Func<Uri, bool>>(), It.IsAny<CancellationToken>()))
235318
.Returns(Enumerable.Empty<UrlSetItem>().ToAsyncEnumerable());
236319

237320
var modifiedDate = new DateTime(2023, 01, 01);
@@ -246,6 +329,36 @@ public async Task LoadSitemapAsync_NoSitemapDirective_PassModifiedDate()
246329
_robotsClientMock.Verify(callTo => callTo.LoadSitemapsAsync(
247330
It.IsAny<Uri>(),
248331
modifiedDate,
332+
null,
333+
default), Times.Once);
334+
}
335+
336+
[Fact]
337+
public async Task LoadSitemapAsync_NoSitemapDirective_PassSitemapLocationFilter()
338+
{
339+
// Arrange
340+
var file =
341+
@"User-agent: *
342+
Disallow: /
343+
";
344+
await using var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));
345+
346+
_robotsClientMock.Setup(callTo => callTo.LoadSitemapsAsync(It.IsAny<Uri>(), It.IsAny<DateTime?>(), It.IsAny<Func<Uri, bool>>(), It.IsAny<CancellationToken>()))
347+
.Returns(Enumerable.Empty<UrlSetItem>().ToAsyncEnumerable());
348+
349+
Func<Uri, bool> sitemapLocationFilter = location => location.AbsolutePath == "/sitemap.xml" || location.AbsolutePath.Contains("product");
350+
351+
var robotsTxt = await _parser.ReadFromStreamAsync(stream);
352+
353+
// Act
354+
await robotsTxt.LoadSitemapAsync(sitemapLocationFilter: sitemapLocationFilter).ToListAsync();
355+
356+
// Assert
357+
robotsTxt.Should().NotBe(null);
358+
_robotsClientMock.Verify(callTo => callTo.LoadSitemapsAsync(
359+
It.IsAny<Uri>(),
360+
null,
361+
sitemapLocationFilter,
249362
default), Times.Once);
250363
}
251364

@@ -259,7 +372,7 @@ public async Task LoadSitemapAsync_NoSitemapDirective_PassCancellationToken()
259372
";
260373
await using var stream = new MemoryStream(Encoding.UTF8.GetBytes(file));
261374

262-
_robotsClientMock.Setup(callTo => callTo.LoadSitemapsAsync(It.IsAny<Uri>(), It.IsAny<DateTime?>(), It.IsAny<CancellationToken>()))
375+
_robotsClientMock.Setup(callTo => callTo.LoadSitemapsAsync(It.IsAny<Uri>(), It.IsAny<DateTime?>(), It.IsAny<Func<Uri, bool>>(), It.IsAny<CancellationToken>()))
263376
.Returns(Enumerable.Empty<UrlSetItem>().ToAsyncEnumerable());
264377

265378
using var cancellationTokenSource = new CancellationTokenSource();
@@ -275,6 +388,7 @@ public async Task LoadSitemapAsync_NoSitemapDirective_PassCancellationToken()
275388
_robotsClientMock.Verify(callTo => callTo.LoadSitemapsAsync(
276389
It.IsAny<Uri>(),
277390
null,
391+
null,
278392
cancellationToken), Times.Once);
279393
}
280394
}

0 commit comments

Comments
 (0)