Skip to content

Commit 6543857

Browse files
committed
Fix regression in read_html
Oh C string handling how I loathe you
1 parent b53cac7 commit 6543857

File tree

3 files changed

+16
-2
lines changed

3 files changed

+16
-2
lines changed

NEWS.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
# xml2 (development version)
22

3+
* `read_html()` and `read_xml()` once again respect the `encoding` argument when reading from a file, so HTML files with non-ASCII encodings work again (#293).
4+
35
# xml2 1.3.0
46

57
* Removes the Rcpp dependency

src/xml2_doc.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -194,13 +194,13 @@ extern "C" SEXP doc_parse_file(
194194
if (as_html) {
195195
pDoc = htmlReadFile(
196196
path,
197-
strncmp(encoding, "", 0) == 0 ? NULL : encoding,
197+
encoding[0] == '\0' ? NULL : encoding,
198198
options
199199
);
200200
} else {
201201
pDoc = xmlReadFile(
202202
path,
203-
strncmp(encoding, "", 0) == 0 ? NULL : encoding,
203+
encoding[0] == '\0' ? NULL : encoding,
204204
options
205205
);
206206
}

tests/testthat/test-read-xml.R

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -85,3 +85,15 @@ test_that("read_xml and read_html fail for bad status codes", {
8585
class = "http_404"
8686
)
8787
})
88+
89+
# Regression test for #293: a file read with an explicit `encoding` must be
# decoded using that encoding instead of having the argument ignored.
test_that("read_html works with non-ASCII encodings", {
  tmp <- tempfile()
  on.exit(unlink(tmp), add = TRUE)

  # "\U2019" is a UTF-8 string. `useBytes = TRUE` writes its bytes verbatim
  # instead of re-encoding to the session's native encoding, so the file on
  # disk is UTF-8 regardless of the locale the tests run in.
  writeLines("<html><body>\U2019</body></html>", tmp, useBytes = TRUE)
  res <- read_html(tmp, encoding = "UTF-8")

  # The non-ASCII character must round-trip through parse + serialise.
  expected <- paste0(
    "<!DOCTYPE html PUBLIC \"-//W3C//DTD HTML 4.0 Transitional//EN\" ",
    "\"http://www.w3.org/TR/REC-html40/loose.dtd\">\n",
    "<html><body>\U2019</body></html>\n"
  )
  expect_equal(as.character(res, options = ""), expected)
})
99+

0 commit comments

Comments
 (0)