diff --git a/CMakeLists.txt b/CMakeLists.txt index d309509d..ed1541fa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,6 +36,7 @@ cmake_dependent_option(PUGIXML_BUILD_SHARED_AND_STATIC_LIBS # Expose options from the pugiconfig.hpp option(PUGIXML_WCHAR_MODE "Enable wchar_t mode" OFF) +option(PUGIXML_CHAR8_MODE "Enable char8_t mode" OFF) option(PUGIXML_COMPACT "Enable compact mode" OFF) # Advanced options from pugiconfig.hpp @@ -51,6 +52,7 @@ endif() set(PUGIXML_PUBLIC_DEFINITIONS $<$:PUGIXML_WCHAR_MODE> + $<$:PUGIXML_CHAR8_MODE> $<$:PUGIXML_COMPACT> $<$:PUGIXML_NO_XPATH> $<$:PUGIXML_NO_STL> diff --git a/docs/manual.adoc b/docs/manual.adoc index ac515358..4e83e5b4 100644 --- a/docs/manual.adoc +++ b/docs/manual.adoc @@ -228,6 +228,8 @@ pugixml uses several defines to control the compilation process. There are two w [[PUGIXML_WCHAR_MODE]]`PUGIXML_WCHAR_MODE` define toggles between UTF-8 style interface (the in-memory text encoding is assumed to be UTF-8, most functions use `char` as character type) and UTF-16/32 style interface (the in-memory text encoding is assumed to be UTF-16/32, depending on `wchar_t` size, most functions use `wchar_t` as character type). See <> for more details. +[[PUGIXML_CHAR8_MODE]]`PUGIXML_CHAR8_MODE` define makes the UTF-8 style interface use `char8_t` instead of `char`. + [[PUGIXML_COMPACT]]`PUGIXML_COMPACT` define activates a different internal representation of document storage that is much more memory efficient for documents with a lot of markup (i.e. nodes and attributes), but is slightly slower to parse and access. For details see <>. [[PUGIXML_NO_XPATH]]`PUGIXML_NO_XPATH` define disables XPath. Both XPath interfaces and XPath implementation are excluded from compilation. This option is provided in case you do not need XPath functionality and need to save code space. 
@@ -399,7 +401,7 @@ Nodes and attributes do not exist without a document tree, so you can't create t [[dom.unicode]] === Unicode interface -There are two choices of interface and internal representation when configuring pugixml: you can either choose the UTF-8 (also called char) interface or UTF-16/32 (also called wchar_t) one. The choice is controlled via <> define; you can set it via `pugiconfig.hpp` or via preprocessor options, as discussed in <>. If this define is set, the wchar_t interface is used; otherwise (by default) the char interface is used. The exact wide character encoding is assumed to be either UTF-16 or UTF-32 and is determined based on the size of `wchar_t` type. +There are three choices of interface and internal representation when configuring pugixml: the UTF-8 interface using `char` (also called char; this is the default), the UTF-8 interface using `char8_t` (also called char8_t), and the UTF-16/32 interface (also called wchar_t). The choice is controlled via the <<PUGIXML_WCHAR_MODE>> and <<PUGIXML_CHAR8_MODE>> defines; you can set them via `pugiconfig.hpp` or via preprocessor options, as discussed in <>. If `PUGIXML_WCHAR_MODE` is set, the wchar_t interface is used; otherwise, if `PUGIXML_CHAR8_MODE` is set, the char8_t interface is used; otherwise (by default) the char interface is used. The exact wide character encoding is assumed to be either UTF-16 or UTF-32 and is determined based on the size of `wchar_t` type. NOTE: If the size of `wchar_t` is 2, pugixml assumes UTF-16 encoding instead of UCS-2, which means that some characters are represented as two code points. 
@@ -411,6 +413,14 @@ const char* xml_node::name() const; bool xml_node::set_name(const char* value); ---- +like this in char8_t mode: + +[source] +---- +const char8_t* xml_node::name() const; +bool xml_node::set_name(const char8_t* value); +---- + and like this in wchar_t mode: [source] @@ -420,7 +430,7 @@ bool xml_node::set_name(const wchar_t* value); ---- [[char_t]][[string_t]] -There is a special type, `pugi::char_t`, that is defined as the character type and depends on the library configuration; it will be also used in the documentation hereafter. There is also a type `pugi::string_t`, which is defined as the STL string of the character type; it corresponds to `std::string` in char mode and to `std::wstring` in wchar_t mode. +There is a special type, `pugi::char_t`, that is defined as the character type and depends on the library configuration; it will be also used in the documentation hereafter. There is also a type `pugi::string_t`, which is defined as the STL string of the character type; it corresponds to `std::string` in char mode, `std::u8string` in char8_t mode, and to `std::wstring` in wchar_t mode. In addition to the interface, the internal implementation changes to store XML data as `pugi::char_t`; this means that these two modes have different memory usage characteristics - generally UTF-8 mode is more memory and performance efficient, especially if `sizeof(wchar_t)` is 4. The conversion to `pugi::char_t` upon document loading and from `pugi::char_t` upon document saving happen automatically, which also carries minor performance penalty. The general advice however is to select the character mode based on usage scenario, i.e. if UTF-8 is inconvenient to process and most of your XML data is non-ASCII, wchar_t mode is probably a better choice. @@ -443,13 +453,15 @@ std::wstring as_wide(const std::string& str); [NOTE] ==== -Most examples in this documentation assume char interface and therefore will not compile with <>. 
This is done to simplify the documentation; usually the only changes you'll have to make is to pass `wchar_t` string literals, i.e. instead of +Most examples in this documentation assume char interface and therefore will not compile with <> or <>. This is done to simplify the documentation; usually the only changes you'll have to make is to pass the appropriate string literals, i.e. instead of `xml_node node = doc.child("bookstore").find_child_by_attribute("book", "id", "12345");` you'll have to use `xml_node node = doc.child(L"bookstore").find_child_by_attribute(L"book", L"id", L"12345");` + +in wchar_t mode. ==== [[dom.thread]] diff --git a/src/pugiconfig.hpp b/src/pugiconfig.hpp index 0713b0ef..379db305 100644 --- a/src/pugiconfig.hpp +++ b/src/pugiconfig.hpp @@ -17,6 +17,9 @@ // Uncomment this to enable wchar_t mode // #define PUGIXML_WCHAR_MODE +// Uncomment this to enable char8_t mode +//#define PUGIXML_CHAR8_MODE + // Uncomment this to enable compact mode // #define PUGIXML_COMPACT diff --git a/src/pugixml.cpp b/src/pugixml.cpp index 7c840385..18d8e914 100644 --- a/src/pugixml.cpp +++ b/src/pugixml.cpp @@ -217,6 +217,8 @@ PUGI__NS_BEGIN #ifdef PUGIXML_WCHAR_MODE return wcslen(s); + #elif defined(PUGIXML_CHAR8_MODE) + return strlen(reinterpret_cast(s)); #else return strlen(s); #endif @@ -229,6 +231,8 @@ PUGI__NS_BEGIN #ifdef PUGIXML_WCHAR_MODE return wcscmp(src, dst) == 0; + #elif defined(PUGIXML_CHAR8_MODE) + return strcmp(reinterpret_cast(src), reinterpret_cast(dst)) == 0; #else return strcmp(src, dst) == 0; #endif @@ -2300,7 +2304,7 @@ PUGI__NS_BEGIN return wchar_decoder::process(str, length, 0, utf8_counter()); } - PUGI__FN void as_utf8_end(char* buffer, size_t size, const wchar_t* str, size_t length) + PUGI__FN void as_utf8_end(u8char_t* buffer, size_t size, const wchar_t* str, size_t length) { // convert to utf8 uint8_t* begin = reinterpret_cast(buffer); @@ -2312,13 +2316,13 @@ PUGI__NS_BEGIN } #ifndef PUGIXML_NO_STL - PUGI__FN std::string 
as_utf8_impl(const wchar_t* str, size_t length) + PUGI__FN std::basic_string as_utf8_impl(const wchar_t* str, size_t length) { // first pass: get length in utf8 characters size_t size = as_utf8_begin(str, length); // allocate resulting string - std::string result; + std::basic_string result; result.resize(size); // second pass: convert to utf8 @@ -3503,7 +3507,7 @@ PUGI__NS_BEGIN #else static char_t* parse_skip_bom(char_t* s) { - return (s[0] == '\xef' && s[1] == '\xbb' && s[2] == '\xbf') ? s + 3 : s; + return (s[0] == char_t('\xef') && s[1] == char_t('\xbb') && s[2] == char_t('\xbf')) ? s + 3 : s; } #endif @@ -4607,6 +4611,8 @@ PUGI__NS_BEGIN { #ifdef PUGIXML_WCHAR_MODE return wcstod(value, 0); + #elif defined(PUGIXML_CHAR8_MODE) + return strtod(reinterpret_cast(value), 0); #else return strtod(value, 0); #endif @@ -4616,6 +4622,8 @@ PUGI__NS_BEGIN { #ifdef PUGIXML_WCHAR_MODE return static_cast(wcstod(value, 0)); + #elif defined(PUGIXML_CHAR8_MODE) + return static_cast(strtod(reinterpret_cast(value), 0)); #else return static_cast(strtod(value, 0)); #endif @@ -4674,6 +4682,8 @@ PUGI__NS_BEGIN for (; buf[offset]; ++offset) wbuf[offset] = buf[offset]; return strcpy_insitu(dest, header, header_mask, wbuf, offset); + #elif defined(PUGIXML_CHAR8_MODE) + return strcpy_insitu(dest, header, header_mask, reinterpret_cast(buf), strlen(reinterpret_cast(buf))); #else return strcpy_insitu(dest, header, header_mask, buf, strlen(buf)); #endif @@ -5104,12 +5114,24 @@ namespace pugi #ifndef PUGIXML_NO_STL PUGI__FN xml_writer_stream::xml_writer_stream(std::basic_ostream >& stream): narrow_stream(&stream), wide_stream(0) + #ifdef PUGIXML_CHAR8_MODE + , utf8_stream(0) + #endif { } PUGI__FN xml_writer_stream::xml_writer_stream(std::basic_ostream >& stream): narrow_stream(0), wide_stream(&stream) + #ifdef PUGIXML_CHAR8_MODE + , utf8_stream(0) + #endif + { + } + + #ifdef PUGIXML_CHAR8_MODE + PUGI__FN xml_writer_stream::xml_writer_stream(std::basic_ostream >& stream): narrow_stream(0), 
wide_stream(0), utf8_stream(&stream) { } + #endif PUGI__FN void xml_writer_stream::write(const void* data, size_t size) { @@ -5118,6 +5140,13 @@ namespace pugi assert(!wide_stream); narrow_stream->write(reinterpret_cast(data), static_cast(size)); } + #ifdef PUGIXML_CHAR8_MODE + else if (utf8_stream) + { + assert(!wide_stream); + utf8_stream->write(reinterpret_cast(data), static_cast(size)); + } + #endif else { assert(wide_stream); @@ -6492,6 +6521,15 @@ namespace pugi print(writer, indent, flags, encoding_wchar, depth); } + + #ifdef PUGIXML_CHAR8_MODE + PUGI__FN void xml_node::print(std::basic_ostream >& stream, const char_t* indent, unsigned int flags, unsigned int depth) const + { + xml_writer_stream writer(stream); + + print(writer, indent, flags, encoding_utf8, depth); + } + #endif #endif PUGI__FN ptrdiff_t xml_node::offset_debug() const @@ -7314,6 +7352,15 @@ namespace pugi return impl::load_stream_impl(static_cast(_root), stream, options, encoding_wchar, &_buffer); } + + #ifdef PUGIXML_CHAR8_MODE + PUGI__FN xml_parse_result xml_document::load(std::basic_istream >& stream, unsigned int options) + { + reset(); + + return impl::load_stream_impl(static_cast(_root), stream, options, encoding_utf8, &_buffer); + } + #endif #endif PUGI__FN xml_parse_result xml_document::load_string(const char_t* contents, unsigned int options) @@ -7416,6 +7463,15 @@ namespace pugi save(writer, indent, flags, encoding_wchar); } + + #ifdef PUGIXML_CHAR8_MODE + PUGI__FN void xml_document::save(std::basic_ostream >& stream, const char_t* indent, unsigned int flags) const + { + xml_writer_stream writer(stream); + + save(writer, indent, flags, encoding_utf8); + } + #endif #endif PUGI__FN bool xml_document::save_file(const char* path_, const char_t* indent, unsigned int flags, xml_encoding encoding) const @@ -7446,14 +7502,14 @@ namespace pugi } #ifndef PUGIXML_NO_STL - PUGI__FN std::string PUGIXML_FUNCTION as_utf8(const wchar_t* str) + PUGI__FN std::basic_string PUGIXML_FUNCTION 
as_utf8(const wchar_t* str) { assert(str); return impl::as_utf8_impl(str, impl::strlength_wide(str)); } - PUGI__FN std::string PUGIXML_FUNCTION as_utf8(const std::basic_string& str) + PUGI__FN std::basic_string PUGIXML_FUNCTION as_utf8(const std::basic_string& str) { return impl::as_utf8_impl(str.c_str(), str.size()); } @@ -8094,6 +8150,9 @@ PUGI__NS_BEGIN { #ifdef PUGIXML_WCHAR_MODE return wcschr(s, c); + #elif defined(PUGIXML_CHAR8_MODE) + return reinterpret_cast( + strchr(reinterpret_cast(s), static_cast(c))); #else return strchr(s, c); #endif @@ -8104,6 +8163,9 @@ PUGI__NS_BEGIN #ifdef PUGIXML_WCHAR_MODE // MSVC6 wcsstr bug workaround (if s is empty it always returns 0) return (*p == 0) ? s : wcsstr(s, p); + #elif defined(PUGIXML_CHAR8_MODE) + return reinterpret_cast( + strstr(reinterpret_cast(s), reinterpret_cast(p))); #else return strstr(s, p); #endif @@ -8550,6 +8612,8 @@ PUGI__NS_BEGIN // parse string #ifdef PUGIXML_WCHAR_MODE return wcstod(string, 0); + #elif defined(PUGIXML_CHAR8_MODE) + return strtod(reinterpret_cast(string), 0); #else return strtod(string, 0); #endif diff --git a/src/pugixml.hpp b/src/pugixml.hpp index 398eec8d..199005de 100644 --- a/src/pugixml.hpp +++ b/src/pugixml.hpp @@ -122,10 +122,17 @@ # endif #endif +#if defined(PUGIXML_CHAR8_MODE) && !defined(__cpp_char8_t) +# error "char8_t mode requires C++20 or later" +#endif + // Character interface macros #ifdef PUGIXML_WCHAR_MODE # define PUGIXML_TEXT(t) L ## t # define PUGIXML_CHAR wchar_t +#elif defined(PUGIXML_CHAR8_MODE) +# define PUGIXML_TEXT(t) u8 ## t +# define PUGIXML_CHAR char8_t #else # define PUGIXML_TEXT(t) t # define PUGIXML_CHAR char @@ -136,6 +143,13 @@ namespace pugi // Character type used for all internal storage and operations; depends on PUGIXML_WCHAR_MODE typedef PUGIXML_CHAR char_t; + // Character type used for UTF-8; depends on PUGIXML_CHAR8_MODE +#ifdef PUGIXML_CHAR8_MODE + typedef char8_t u8char_t; +#else + typedef char u8char_t; +#endif + #ifndef PUGIXML_NO_STL // 
String type used for operations that work with STL string; depends on PUGIXML_WCHAR_MODE typedef std::basic_string, std::allocator > string_t; @@ -351,12 +365,18 @@ namespace pugi // Construct writer from an output stream object xml_writer_stream(std::basic_ostream >& stream); xml_writer_stream(std::basic_ostream >& stream); + #ifdef PUGIXML_CHAR8_MODE + xml_writer_stream(std::basic_ostream >& stream); + #endif virtual void write(const void* data, size_t size) PUGIXML_OVERRIDE; private: std::basic_ostream >* narrow_stream; std::basic_ostream >* wide_stream; + #ifdef PUGIXML_CHAR8_MODE + std::basic_ostream >* utf8_stream; + #endif }; #endif @@ -696,6 +716,9 @@ namespace pugi // Print subtree to stream void print(std::basic_ostream >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto, unsigned int depth = 0) const; void print(std::basic_ostream >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, unsigned int depth = 0) const; + #ifdef PUGIXML_CHAR8_MODE + void print(std::basic_ostream >& os, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, unsigned int depth = 0) const; + #endif #endif // Child nodes iterators @@ -1064,6 +1087,9 @@ namespace pugi // Load document from stream. xml_parse_result load(std::basic_istream >& stream, unsigned int options = parse_default, xml_encoding encoding = encoding_auto); xml_parse_result load(std::basic_istream >& stream, unsigned int options = parse_default); + #ifdef PUGIXML_CHAR8_MODE + xml_parse_result load(std::basic_istream >& stream, unsigned int options = parse_default); + #endif #endif // (deprecated: use load_string instead) Load document from zero-terminated string. No encoding conversions are applied. @@ -1094,6 +1120,9 @@ namespace pugi // Save XML document to stream (semantics is slightly different from xml_node::print, see documentation for details). 
void save(std::basic_ostream >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default, xml_encoding encoding = encoding_auto) const; void save(std::basic_ostream >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default) const; + #ifdef PUGIXML_CHAR8_MODE + void save(std::basic_ostream >& stream, const char_t* indent = PUGIXML_TEXT("\t"), unsigned int flags = format_default) const; + #endif #endif // Save XML to file @@ -1429,8 +1458,8 @@ namespace pugi #ifndef PUGIXML_NO_STL // Convert wide string to UTF8 - std::basic_string, std::allocator > PUGIXML_FUNCTION as_utf8(const wchar_t* str); - std::basic_string, std::allocator > PUGIXML_FUNCTION as_utf8(const std::basic_string, std::allocator >& str); + std::basic_string, std::allocator > PUGIXML_FUNCTION as_utf8(const wchar_t* str); + std::basic_string, std::allocator > PUGIXML_FUNCTION as_utf8(const std::basic_string, std::allocator >& str); // Convert UTF8 to wide string std::basic_string, std::allocator > PUGIXML_FUNCTION as_wide(const char* str); diff --git a/tests/test.cpp b/tests/test.cpp index a97116ea..acfe24b3 100644 --- a/tests/test.cpp +++ b/tests/test.cpp @@ -44,12 +44,11 @@ static void build_document_order(std::vector& result, pugi::xm bool test_string_equal(const pugi::char_t* lhs, const pugi::char_t* rhs) { - return (!lhs || !rhs) ? 
lhs == rhs : - #ifdef PUGIXML_WCHAR_MODE - wcscmp(lhs, rhs) == 0; - #else - strcmp(lhs, rhs) == 0; - #endif + if (!lhs || !rhs) return lhs == rhs; + typedef std::char_traits traits; + const size_t lhs_len = traits::length(lhs); + const size_t rhs_len = traits::length(rhs); + return lhs_len == rhs_len && traits::compare(lhs, rhs, lhs_len) == 0; } bool test_node(const pugi::xml_node& node, const pugi::char_t* contents, const pugi::char_t* indent, unsigned int flags) @@ -73,11 +72,7 @@ bool test_double_nan(double value) #ifndef PUGIXML_NO_XPATH static size_t strlength(const pugi::char_t* s) { -#ifdef PUGIXML_WCHAR_MODE - return wcslen(s); -#else - return strlen(s); -#endif + return std::char_traits::length(s); } bool test_xpath_string(const pugi::xpath_node& node, const pugi::char_t* query, pugi::xpath_variable_set* variables, const pugi::char_t* expected) diff --git a/tests/test.hpp b/tests/test.hpp index dd14af68..4d13bed4 100644 --- a/tests/test.hpp +++ b/tests/test.hpp @@ -9,6 +9,8 @@ #include #endif +#include + struct test_runner { test_runner(const char* name) @@ -154,6 +156,39 @@ struct dummy_fixture {}; #define STR(text) PUGIXML_TEXT(text) +#ifdef PUGIXML_CHAR8_MODE +# if defined(__clang__) || defined(__GNUC__) +# define ALIASING_BARRIER(ptr) asm volatile("" : : "rm"(ptr) : "memory") +# else +# define ALIASING_BARRIER(ptr) +# endif +inline const char8_t* char_cast(const char* bytes) +{ + ALIASING_BARRIER(bytes); + return reinterpret_cast(bytes); +} +#endif + +#ifdef PUGIXML_WCHAR_MODE +#define RAW(text) L ## text +#elif defined(PUGIXML_CHAR8_MODE) +#define RAW(text) char_cast(text) +#else +#define RAW(text) text +#endif + +#if defined(PUGIXML_CHAR8_MODE) +#define U8RAW(text) char_cast(text) +#else +#define U8RAW(text) text +#endif + +#ifdef PUGIXML_CHAR8_MODE +#define U8STR(text) u8 ## text +#else +#define U8STR(text) text +#endif + #if defined(__DMC__) || defined(__BORLANDC__) #define U_LITERALS // DMC does not understand \x01234 (it parses first three 
digits), but understands \u01234 #endif diff --git a/tests/test_document.cpp b/tests/test_document.cpp index fca6bd9f..29dab2c5 100644 --- a/tests/test_document.cpp +++ b/tests/test_document.cpp @@ -919,6 +919,8 @@ inline void check_utftest_document(const xml_document& doc) size_t wcharsize = sizeof(wchar_t); CHECK(wcharsize == 2 ? (v[7] == wchar_cast(0xd852) && v[8] == wchar_cast(0xdf62)) : (v[7] == wchar_cast(0x24b62))); +#elif defined(PUGIXML_CHAR8_MODE) + CHECK_STRING(v, u8"\u4E16\u754C\u6709\u5F88\u591A\u8BED\u8A00\U00024B62"); #else // unicode string CHECK_STRING(v, "\xe4\xb8\x96\xe7\x95\x8c\xe6\x9c\x89\xe5\xbe\x88\xe5\xa4\x9a\xe8\xaf\xad\xe8\xa8\x80\xf0\xa4\xad\xa2"); @@ -1524,6 +1526,8 @@ TEST(document_load_buffer_utf_truncated) #ifdef PUGIXML_WCHAR_MODE CHECK(name[0] == 0x20ac && name[1] == 0); + #elif defined(PUGIXML_CHAR8_MODE) + CHECK_STRING(name, u8"\u20AC"); #else CHECK_STRING(name, "\xe2\x82\xac"); #endif @@ -1569,6 +1573,8 @@ TEST(document_load_stream_truncated) #ifdef PUGIXML_WCHAR_MODE CHECK(name[0] == 0x20ac && name[1] == 0); + #elif defined(PUGIXML_CHAR8_MODE) + CHECK_STRING(name, u8"\u20AC"); #else CHECK_STRING(name, "\xe2\x82\xac"); #endif diff --git a/tests/test_dom_traverse.cpp b/tests/test_dom_traverse.cpp index 29b4dfd5..aeb33221 100644 --- a/tests/test_dom_traverse.cpp +++ b/tests/test_dom_traverse.cpp @@ -682,6 +682,8 @@ struct find_predicate_prefix #ifdef PUGIXML_WCHAR_MODE // can't use wcsncmp here because of a bug in DMC return std::basic_string(obj.name()).compare(0, wcslen(prefix), prefix) == 0; + #elif defined(PUGIXML_CHAR8_MODE) + return strncmp(reinterpret_cast(obj.name()), reinterpret_cast(prefix), strlen(reinterpret_cast(prefix))) == 0; #else return strncmp(obj.name(), prefix, strlen(prefix)) == 0; #endif @@ -807,6 +809,8 @@ struct test_walker: xml_tree_walker std::copy(buf, buf + strlen(buf) + 1, &wbuf[0]); return std::basic_string(wbuf); + #elif defined(PUGIXML_CHAR8_MODE) + return std::basic_string(char_cast(buf)); #else 
return std::basic_string(buf); #endif diff --git a/tests/test_parse.cpp b/tests/test_parse.cpp index 180c70ad..cd83a680 100644 --- a/tests/test_parse.cpp +++ b/tests/test_parse.cpp @@ -568,6 +568,8 @@ TEST(parse_escapes_unicode) size_t wcharsize = sizeof(wchar_t); CHECK(v[0] == 0x3b3 && v[1] == 0x3b3 && (wcharsize == 2 ? v[2] == wchar_cast(0xd852) && v[3] == wchar_cast(0xdf62) : v[2] == wchar_cast(0x24b62))); +#elif defined(PUGIXML_CHAR8_MODE) + CHECK_STRING(doc.child_value(STR("node")), u8"\u03B3\u03B3\U00024B62"); #else CHECK_STRING(doc.child_value(STR("node")), "\xce\xb3\xce\xb3\xf0\xa4\xad\xa2"); #endif @@ -1104,6 +1106,8 @@ TEST(parse_bom_fragment_invalid_utf8) #ifdef PUGIXML_WCHAR_MODE CHECK(value[0] == wchar_cast(0xfefb) && value[1] == 0); +#elif defined(PUGIXML_CHAR8_MODE) + CHECK(value[0] == 0xef && value[1] == 0xbb && value[2] == 0xbb && value[3] == 0); #else CHECK_STRING(value, "\xef\xbb\xbb"); #endif @@ -1119,6 +1123,8 @@ TEST(parse_bom_fragment_invalid_utf16) #ifdef PUGIXML_WCHAR_MODE CHECK(value[0] == wchar_cast(0xfffe) && value[1] == 0); +#elif defined(PUGIXML_CHAR8_MODE) + CHECK(value[0] == 0xef && value[1] == 0xbf && value[2] == 0xbe && value[3] == 0); #else CHECK_STRING(value, "\xef\xbf\xbe"); #endif @@ -1134,6 +1140,8 @@ TEST(parse_bom_fragment_invalid_utf32) #ifdef PUGIXML_WCHAR_MODE CHECK(value[0] == wchar_cast(0xffff) && value[1] == 0); +#elif defined(PUGIXML_CHAR8_MODE) + CHECK(value[0] == 0xef && value[1] == 0xbf && value[2] == 0xbf && value[3] == 0); #else CHECK_STRING(value, "\xef\xbf\xbf"); #endif diff --git a/tests/test_parse_doctype.cpp b/tests/test_parse_doctype.cpp index 649170d0..02eefb92 100644 --- a/tests/test_parse_doctype.cpp +++ b/tests/test_parse_doctype.cpp @@ -16,6 +16,10 @@ static xml_parse_result load_concat(xml_document& doc, const char_t* a, const ch wcscpy(buffer, a); wcscat(buffer, b); wcscat(buffer, c); +#elif defined(PUGIXML_CHAR8_MODE) + strcpy(reinterpret_cast(buffer), reinterpret_cast(a)); + strcat(reinterpret_cast(buffer), reinterpret_cast(b)); + 
strcat(reinterpret_cast(buffer), reinterpret_cast(c)); #else strcpy(buffer, a); strcat(buffer, b); diff --git a/tests/test_unicode.cpp b/tests/test_unicode.cpp index 4cb61142..e45e30a0 100644 --- a/tests/test_unicode.cpp +++ b/tests/test_unicode.cpp @@ -80,16 +80,16 @@ TEST(as_wide_string) TEST(as_utf8_empty) { - CHECK(as_utf8(L"") == ""); + CHECK(as_utf8(L"") == U8STR("")); } TEST(as_utf8_valid_basic) { // valid 1-byte, 2-byte and 3-byte outputs #ifdef U_LITERALS - CHECK(as_utf8(L"?\u0400\u203D") == "?\xd0\x80\xe2\x80\xbd"); + CHECK(as_utf8(L"?\u0400\u203D") == U8RAW("?\xd0\x80\xe2\x80\xbd")); #else - CHECK(as_utf8(L"?\x0400\x203D") == "?\xd0\x80\xe2\x80\xbd"); + CHECK(as_utf8(L"?\x0400\x203D") == U8RAW("?\xd0\x80\xe2\x80\xbd")); #endif } @@ -106,14 +106,14 @@ TEST(as_utf8_valid_astral) s[1] = ' '; s[2] = wchar_cast(0x1003ff); - CHECK(as_utf8(s.c_str()) == "\xf2\x97\x98\xa4 \xf4\x80\x8f\xbf"); + CHECK(as_utf8(s.c_str()) == U8RAW("\xf2\x97\x98\xa4 \xf4\x80\x8f\xbf")); } else { #ifdef U_LITERALS - CHECK(as_utf8(L"\uda1d\ude24 \udbc0\udfff") == "\xf2\x97\x98\xa4 \xf4\x80\x8f\xbf"); + CHECK(as_utf8(L"\uda1d\ude24 \udbc0\udfff") == U8RAW("\xf2\x97\x98\xa4 \xf4\x80\x8f\xbf")); #else - CHECK(as_utf8(L"\xda1d\xde24 \xdbc0\xdfff") == "\xf2\x97\x98\xa4 \xf4\x80\x8f\xbf"); + CHECK(as_utf8(L"\xda1d\xde24 \xdbc0\xdfff") == U8RAW("\xf2\x97\x98\xa4 \xf4\x80\x8f\xbf")); #endif } } @@ -129,17 +129,17 @@ TEST(as_utf8_invalid) CHECK(as_utf8(L"a\uda1d") == "a"); CHECK(as_utf8(L"a\uda1d_") == "a_"); #else - CHECK(as_utf8(L"a\xda1d") == "a"); - CHECK(as_utf8(L"a\xda1d_") == "a_"); + CHECK(as_utf8(L"a\xda1d") == U8STR("a")); + CHECK(as_utf8(L"a\xda1d_") == U8STR("a_")); #endif // check incorrect leading code #ifdef U_LITERALS - CHECK(as_utf8(L"a\ude24") == "a"); - CHECK(as_utf8(L"a\ude24_") == "a_"); + CHECK(as_utf8(L"a\ude24") == U8STR("a")); + CHECK(as_utf8(L"a\ude24_") == U8STR("a_")); #else - 
CHECK(as_utf8(L"a\xde24") == U8STR("a")); + CHECK(as_utf8(L"a\xde24_") == U8STR("a_")); #endif } } @@ -148,6 +148,6 @@ TEST(as_utf8_string) { std::basic_string s = L"abcd"; - CHECK(as_utf8(s) == "abcd"); + CHECK(as_utf8(s) == U8STR("abcd")); } #endif diff --git a/tests/test_write.cpp b/tests/test_write.cpp index 0410e82e..d4a9aca9 100644 --- a/tests/test_write.cpp +++ b/tests/test_write.cpp @@ -220,6 +220,8 @@ TEST_XML(write_escape_unicode, "") #else CHECK_NODE(doc, STR("")); #endif +#elif defined(PUGIXML_CHAR8_MODE) + CHECK_NODE(doc, STR("")); #else CHECK_NODE(doc, STR("")); #endif @@ -370,11 +372,11 @@ TEST(write_encoding_huge) const unsigned int N = 16000; // make a large utf8 name consisting of 3-byte chars (3 does not divide internal buffer size, so will need split correction) - std::string s_utf8 = "<"; + std::basic_string s_utf8 = STR("<"); - for (unsigned int i = 0; i < N; ++i) s_utf8 += "\xE2\x82\xAC"; + for (unsigned int i = 0; i < N; ++i) s_utf8 += RAW("\xE2\x82\xAC"); - s_utf8 += "/>"; + s_utf8 += STR("/>"); xml_document doc; CHECK(doc.load_buffer(&s_utf8[0], s_utf8.length(), parse_default, encoding_utf8)); @@ -393,9 +395,9 @@ TEST(write_encoding_huge_invalid) const unsigned int N = 16000; // make a large utf8 name consisting of non-leading chars - std::string s_utf8; + std::basic_string s_utf8; - for (unsigned int i = 0; i < N; ++i) s_utf8 += "\x82"; + for (unsigned int i = 0; i < N; ++i) s_utf8 += RAW("\x82"); xml_document doc; doc.append_child().set_name(s_utf8.c_str()); @@ -451,7 +453,7 @@ TEST(write_unicode_invalid_utf16) } } #else -static bool test_write_unicode_invalid(const char* name, const wchar_t* expected) +static bool test_write_unicode_invalid(const char_t* name, const wchar_t* expected) { xml_document doc; doc.append_child(node_pcdata).set_value(name); @@ -462,31 +464,31 @@ static bool test_write_unicode_invalid(const char* name, const wchar_t* expected TEST(write_unicode_invalid_utf8) { // invalid 1-byte input - 
CHECK(test_write_unicode_invalid("a\xb0", L"a")); - CHECK(test_write_unicode_invalid("a\xb0_", L"a_")); + CHECK(test_write_unicode_invalid(RAW("a\xb0"), L"a")); + CHECK(test_write_unicode_invalid(RAW("a\xb0_"), L"a_")); // invalid 2-byte input - CHECK(test_write_unicode_invalid("a\xc0", L"a")); - CHECK(test_write_unicode_invalid("a\xd0", L"a")); - CHECK(test_write_unicode_invalid("a\xc0_", L"a_")); - CHECK(test_write_unicode_invalid("a\xd0_", L"a_")); + CHECK(test_write_unicode_invalid(RAW("a\xc0"), L"a")); + CHECK(test_write_unicode_invalid(RAW("a\xd0"), L"a")); + CHECK(test_write_unicode_invalid(RAW("a\xc0_"), L"a_")); + CHECK(test_write_unicode_invalid(RAW("a\xd0_"), L"a_")); // invalid 3-byte input - CHECK(test_write_unicode_invalid("a\xe2\x80", L"a")); - CHECK(test_write_unicode_invalid("a\xe2", L"a")); - CHECK(test_write_unicode_invalid("a\xe2\x80_", L"a_")); - CHECK(test_write_unicode_invalid("a\xe2_", L"a_")); + CHECK(test_write_unicode_invalid(RAW("a\xe2\x80"), L"a")); + CHECK(test_write_unicode_invalid(RAW("a\xe2"), L"a")); + CHECK(test_write_unicode_invalid(RAW("a\xe2\x80_"), L"a_")); + CHECK(test_write_unicode_invalid(RAW("a\xe2_"), L"a_")); // invalid 4-byte input - CHECK(test_write_unicode_invalid("a\xf2\x97\x98", L"a")); - CHECK(test_write_unicode_invalid("a\xf2\x97", L"a")); - CHECK(test_write_unicode_invalid("a\xf2", L"a")); - CHECK(test_write_unicode_invalid("a\xf2\x97\x98_", L"a_")); - CHECK(test_write_unicode_invalid("a\xf2\x97_", L"a_")); - CHECK(test_write_unicode_invalid("a\xf2_", L"a_")); + CHECK(test_write_unicode_invalid(RAW("a\xf2\x97\x98"), L"a")); + CHECK(test_write_unicode_invalid(RAW("a\xf2\x97"), L"a")); + CHECK(test_write_unicode_invalid(RAW("a\xf2"), L"a")); + CHECK(test_write_unicode_invalid(RAW("a\xf2\x97\x98_"), L"a_")); + CHECK(test_write_unicode_invalid(RAW("a\xf2\x97_"), L"a_")); + CHECK(test_write_unicode_invalid(RAW("a\xf2_"), L"a_")); // invalid 5-byte input - CHECK(test_write_unicode_invalid("a\xf8_", L"a_")); + 
CHECK(test_write_unicode_invalid(RAW("a\xf8_"), L"a_")); } #endif diff --git a/tests/test_xpath_functions.cpp b/tests/test_xpath_functions.cpp index 0d1d6307..75777819 100644 --- a/tests/test_xpath_functions.cpp +++ b/tests/test_xpath_functions.cpp @@ -570,7 +570,7 @@ TEST(xpath_string_translate_table) CHECK_XPATH_STRING(c, STR("translate('abcd\xe9 ', 'abc', 'ABC')"), STR("ABCd\xe9 ")); CHECK_XPATH_STRING(c, STR("translate('abcd\xe9 ', 'abc\xe9', 'ABC!')"), STR("ABCd! ")); - CHECK_XPATH_STRING(c, STR("translate('abcd! ', 'abc!', 'ABC\xe9')"), STR("ABCd\xe9 ")); + CHECK_XPATH_STRING(c, RAW("translate('abcd! ', 'abc!', 'ABC\xe9')"), RAW("ABCd\xe9 ")); CHECK_XPATH_STRING(c, STR("translate('abcde', concat('abc', 'd'), 'ABCD')"), STR("ABCDe")); CHECK_XPATH_STRING(c, STR("translate('abcde', 'abcd', concat('ABC', 'D'))"), STR("ABCDe")); } diff --git a/tests/test_xpath_parse.cpp b/tests/test_xpath_parse.cpp index bfa59ff3..0b640a61 100644 --- a/tests/test_xpath_parse.cpp +++ b/tests/test_xpath_parse.cpp @@ -224,7 +224,7 @@ TEST(xpath_parse_paths_valid_unicode) #if defined(PUGIXML_WCHAR_MODE) xpath_query q(paths[i]); #elif !defined(PUGIXML_NO_STL) - std::basic_string path_utf8 = as_utf8(paths[i]); + std::basic_string path_utf8 = as_utf8(paths[i]); xpath_query q(path_utf8.c_str()); #endif } diff --git a/tests/test_xpath_variables.cpp b/tests/test_xpath_variables.cpp index 1a1fc19f..56894862 100644 --- a/tests/test_xpath_variables.cpp +++ b/tests/test_xpath_variables.cpp @@ -410,7 +410,7 @@ TEST(xpath_variables_name_unicode) const char_t* name = L"\x0400\x203D"; #endif #else - const char_t* name = "\xd0\x80\xe2\x80\xbd"; + const char_t* name = STR("\xd0\x80\xe2\x80\xbd"); #endif xpath_variable_set set;