Skip to content

Commit a22adb0

Browse files
authored
Replace lone surrogates option (#2780)
1 parent 72d0541 commit a22adb0

File tree

7 files changed

+98
-17
lines changed

7 files changed

+98
-17
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
<!DOCTYPE html>
2+
<html lang="en">
3+
<head>
4+
<meta charset="UTF-8">
5+
<meta http-equiv="X-UA-Compatible" content="IE=edge">
6+
<meta name="viewport" content="width=device-width, initial-scale=1.0">
7+
<title>Lone Surrogate via JavaScript</title>
8+
</head>
9+
<body>
10+
<h1>Lone Surrogate Example</h1>
11+
<p id="surrogate-text">This paragraph will be updated with a lone surrogate.</p>
12+
13+
<script>
14+
// Create a lone surrogate
15+
var loneSurrogate = String.fromCharCode(0xD83D); // High surrogate without its pair
16+
document.getElementById('surrogate-text').textContent = "This paragraph contains a lone surrogate: " + loneSurrogate;
17+
</script>
18+
</body>
19+
</html>
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
using System;
2+
using System.Threading.Tasks;
3+
using NUnit.Framework;
4+
using PuppeteerSharp.Nunit;
5+
6+
namespace PuppeteerSharp.Tests.PageTests
7+
{
8+
public class GetContentTests : PuppeteerPageBaseTest
9+
{
10+
11+
[Test, Retry(2)]
12+
public async Task ShouldWorkWithLoneSurrogate()
13+
{
14+
await Page.GoToAsync(TestConstants.ServerUrl + "/lone-surrogate.html");
15+
var result = await Page.GetContentAsync(new GetContentOptions { ReplaceLoneSurrogates = true });
16+
17+
Assert.That(result, Contains.Substring("This paragraph contains a lone surrogate"));
18+
}
19+
}
20+
}

lib/PuppeteerSharp/Frame.cs

+16-14
Original file line numberDiff line numberDiff line change
@@ -205,22 +205,24 @@ public Task<DeviceRequestPrompt> WaitForDevicePromptAsync(WaitForOptions options
205205
public abstract Task<IElementHandle> AddScriptTagAsync(AddTagOptions options);
206206

207207
/// <inheritdoc/>
208-
public Task<string> GetContentAsync()
209-
=> EvaluateFunctionAsync<string>(@"() => {
210-
let content = '';
211-
for (const node of document.childNodes) {
212-
switch (node) {
213-
case document.documentElement:
214-
content += document.documentElement.outerHTML;
215-
break;
216-
default:
217-
content += new XMLSerializer().serializeToString(node);
218-
break;
208+
public Task<string> GetContentAsync(GetContentOptions options = null)
209+
=> EvaluateFunctionAsync<string>(
210+
@"(replaceLoneSurrogates) => {
211+
let content = '';
212+
for (const node of document.childNodes) {
213+
switch (node) {
214+
case document.documentElement:
215+
content += document.documentElement.outerHTML;
216+
break;
217+
default:
218+
content += new XMLSerializer().serializeToString(node);
219+
break;
220+
}
219221
}
220-
}
221222
222-
return content;
223-
}");
223+
return replaceLoneSurrogates ? content.toWellFormed() : content;
224+
}",
225+
options?.ReplaceLoneSurrogates ?? false);
224226

225227
/// <inheritdoc/>
226228
public abstract Task SetContentAsync(string html, NavigationOptions options = null);
+38
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,38 @@
1+
// * MIT License
2+
// *
3+
// * Copyright (c) Darío Kondratiuk
4+
// *
5+
// * Permission is hereby granted, free of charge, to any person obtaining a copy
6+
// * of this software and associated documentation files (the "Software"), to deal
7+
// * in the Software without restriction, including without limitation the rights
8+
// * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9+
// * copies of the Software, and to permit persons to whom the Software is
10+
// * furnished to do so, subject to the following conditions:
11+
// *
12+
// * The above copyright notice and this permission notice shall be included in all
13+
// * copies or substantial portions of the Software.
14+
// *
15+
// * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16+
// * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17+
// * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18+
// * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19+
// * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20+
// * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21+
// * SOFTWARE.
22+
23+
namespace PuppeteerSharp;
24+
25+
/// <summary>
26+
/// Options for <see cref="IPage.GetContentAsync(GetContentOptions)"/> and <see cref="IFrame.GetContentAsync(GetContentOptions)"/>.
27+
/// </summary>
28+
public class GetContentOptions
29+
{
30+
/// <summary>
31+
/// Replaces lone surrogates with the U+FFFD replacement character.
32+
/// </summary>
33+
/// <remarks>
34+
/// This functionality relies on the JavaScript <c>String.prototype.toWellFormed</c> function. See the <a href="https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Global_Objects/String/toWellFormed">MDN documentation</a>.
35+
/// It's set to false by default to prevent extra processing when not needed.
36+
/// </remarks>
37+
public bool ReplaceLoneSurrogates { get; set; } = false;
38+
}

lib/PuppeteerSharp/IFrame.cs

+2-1
Original file line numberDiff line numberDiff line change
@@ -210,9 +210,10 @@ public interface IFrame
210210
/// <summary>
211211
/// Gets the full HTML contents of the page, including the doctype.
212212
/// </summary>
213+
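/// <param name="options">Optional settings for retrieving the content, such as whether lone surrogates are replaced. May be <c>null</c>.</param>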
213214
/// <returns>Task which resolves to the HTML content.</returns>
214215
/// <seealso cref="IPage.GetContentAsync"/>
215-
Task<string> GetContentAsync();
216+
Task<string> GetContentAsync(GetContentOptions options = null);
216217

217218
/// <summary>
218219
/// Returns page's title.

lib/PuppeteerSharp/IPage.cs

+2-1
Original file line numberDiff line numberDiff line change
@@ -749,9 +749,10 @@ public interface IPage : IDisposable, IAsyncDisposable
749749
/// <summary>
750750
/// Gets the full HTML contents of the page, including the doctype.
751751
/// </summary>
752+
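/// <param name="options">Optional settings for retrieving the content, such as whether lone surrogates are replaced. May be <c>null</c>.</param>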
752753
/// <returns>Task which resolves to the HTML content.</returns>
753754
/// <seealso cref="IFrame.GetContentAsync"/>
754-
Task<string> GetContentAsync();
755+
Task<string> GetContentAsync(GetContentOptions options = null);
755756

756757
/// <summary>
757758
/// Returns the page's cookies.

lib/PuppeteerSharp/Page.cs

+1-1
Original file line numberDiff line numberDiff line change
@@ -349,7 +349,7 @@ public Task ExposeFunctionAsync<T1, T2, T3, T4, TResult>(
349349
public abstract Task RemoveExposedFunctionAsync(string name);
350350

351351
/// <inheritdoc/>
352-
public Task<string> GetContentAsync() => MainFrame.GetContentAsync();
352+
public Task<string> GetContentAsync(GetContentOptions options = null) => MainFrame.GetContentAsync(options);
353353

354354
/// <inheritdoc/>
355355
public Task SetContentAsync(string html, NavigationOptions options = null)

0 commit comments

Comments
 (0)