diff --git a/README-en.md b/README-en.md index 3f66c2c..6c6917a 100644 --- a/README-en.md +++ b/README-en.md @@ -11,10 +11,10 @@ Function: - Check whether characters are lost to ensure that the conversion process is reversible - Support command line (use $ ./SmartCharsetConverter --help for details) - Multi-language support (click the "hammer" button in the bottom right corner to switch languages) +- Support Vietnamese charsets (VNI/VPS/VISCII/TCVN3). These charsets cannot be auto-detected yet; please use the "No File Filter" mode. Supported Platform: -- Win11 x64 - Win10 x64 - Win7 x64 (haven’t tried it yet) @@ -74,6 +74,8 @@ v0.81 Add Spanish language pack support (thanks to [Carlos Sánchez](https://git v0.82 Check if characters will be lost when specifying encoding manually. +v0.9 Support converting multiple Vietnamese charsets: VNI/VPS/VISCII/TCVN3 + # Build 1. Confirm the compilation environment: win10+ x64, Visual Studio 2019+, cmake. diff --git a/README.md b/README.md index acd0dee..c0c4a9b 100644 --- a/README.md +++ b/README.md @@ -11,10 +11,10 @@ - 转换时会检查是否丢失字符,确保转换过程可逆 - 支持命令行(使用 $ ./SmartCharsetConverter --help 查看) - 多语言支持(点击右下角“锤子”按钮切换语言) +- 支持越南语字符集(VNI/VPS/VISCII/TCVN3)和其他字符集互转(目前还不能自动探测这几个字符集,请使用“不过滤”模式) 运行要求: -- Win11 x64 - Win10 x64 - Win7 x64(理论上可以,没尝试) @@ -70,8 +70,11 @@ v0.8 重新编排界面(感谢[Carlos Sánchez](https://github.com/c-sanchez)) 支持多语言(内置简体中文和英文)。增加多语言选择。 v0.81 增加西班牙语支持(感谢[Carlos Sánchez](https://github.com/c-sanchez))。 + v0.82 现在手动指定字符集会检查是否丢失字符。 +v0.9 支持多个越南语字符集的转换:VNI/VPS/VISCII/TCVN3 + # 构建方法 1. 
确认编译环境:win10+ x64, Visual Studio 2019+, cmake。 diff --git a/src/CLIHandler.cpp b/src/CLIHandler.cpp index d418579..7a8a56b 100644 --- a/src/CLIHandler.cpp +++ b/src/CLIHandler.cpp @@ -302,7 +302,7 @@ int CLIMain(const std::vector &args) noexcept { return; } - Core::ConvertResult ret = core.Convert(inputFilename, addedItem.srcCharset, addedItem.srcLineBreak); + Core::ConvertFileResult ret = core.Convert(inputFilename, addedItem.srcCharset, addedItem.srcLineBreak); if (ret.errInfo.has_value()) { wcout << L" 大小: " << FileSizeToTString(addedItem.filesize) << L"\n"; wcout << L" 字符集: " << ToViewCharsetName(addedItem.srcCharset) << L"\n"; diff --git a/src/Core/CharsetCode.h b/src/Core/CharsetCode.h index 33f27c6..034515b 100644 --- a/src/Core/CharsetCode.h +++ b/src/Core/CharsetCode.h @@ -33,6 +33,11 @@ enum class CharsetCode { WINDOWS_1258, // Vietnamese ISO_8859_1, + VNI, // Vietnamese + VPS, // Vietnamese + VISCII, // Vietnamese + TCVN3, // Vietnamese + CHARSET_CODE_END // 添加字符集需要同步修改:charsetCodeMap @@ -43,32 +48,39 @@ struct MyCharset { std::string icuName; // the name used by icu std::unordered_set icuNames; // if icu detected these charset names, map all of them to be the main charset + bool isVietnameseLocalCharset; }; // 字符集code到名称的映射表 const std::unordered_map charsetCodeMap = { // CharsetCode枚举值, viewName显示名称, icuName, 可能的别名 - {CharsetCode::UNKNOWN, MyCharset{TEXT("未知"), "-", {}}}, - {CharsetCode::EMPTY, MyCharset{TEXT("空"), "-", {}}}, - {CharsetCode::NOT_SUPPORTED, MyCharset{TEXT("不支持"), "-", {}}}, - {CharsetCode::UTF8, MyCharset{TEXT("UTF-8"), "UTF-8", {"ASCII", "ANSI", "UTF8"}}}, - {CharsetCode::UTF8BOM, MyCharset{TEXT("UTF-8 BOM"), "UTF-8", {}}}, - {CharsetCode::GB18030, MyCharset{TEXT("GB18030"), "GB18030", {"GB"}}}, - - {CharsetCode::UTF16LE, MyCharset{TEXT("UTF-16LE"), "UTF-16LE", {}}}, - {CharsetCode::UTF16LEBOM, MyCharset{TEXT("UTF-16LE BOM"), "UTF-16LE", {}}}, - {CharsetCode::UTF16BE, MyCharset{TEXT("UTF-16BE"), "UTF-16BE", {}}}, - 
{CharsetCode::UTF16BEBOM, MyCharset{TEXT("UTF-16BE BOM"), "UTF-16BE", {}}}, - {CharsetCode::UTF32LE, MyCharset{TEXT("UTF-32LE"), "UTF-32LE", {}}}, - {CharsetCode::UTF32LEBOM, MyCharset{TEXT("UTF-32LE BOM"), "UTF-32LE", {}}}, - {CharsetCode::UTF32BE, MyCharset{TEXT("UTF-32BE"), "UTF-32BE", {}}}, - {CharsetCode::UTF32BEBOM, MyCharset{TEXT("UTF-32BE BOM"), "UTF-32BE", {}}}, - {CharsetCode::BIG5, MyCharset{TEXT("BIG5"), "Big5", {"Big5"}}}, - {CharsetCode::SHIFT_JIS, MyCharset{TEXT("SHIFT-JIS"), "SHIFT-JIS", {"SHIFT_JIS"}}}, - {CharsetCode::EUC_JP, MyCharset{TEXT("EUC-JP"), "EUC-JP", {"EUC-JP"}}}, - {CharsetCode::WINDOWS_1252, MyCharset{TEXT("WINDOWS-1252"), "WINDOWS-1252", {}}}, - {CharsetCode::WINDOWS_1258, MyCharset{TEXT("WINDOWS-1258"), "WINDOWS-1258", {}}}, - {CharsetCode::ISO_8859_1, MyCharset{TEXT("ISO-8859-1"), "ISO-8859-1", {}}}}; + {CharsetCode::UNKNOWN, MyCharset{TEXT("未知"), "-", {}, false}}, + {CharsetCode::EMPTY, MyCharset{TEXT("空"), "-", {}, false}}, + {CharsetCode::NOT_SUPPORTED, MyCharset{TEXT("不支持"), "-", {}, false}}, + {CharsetCode::UTF8, MyCharset{TEXT("UTF-8"), "UTF-8", {"ASCII", "ANSI", "UTF8"}, false}}, + {CharsetCode::UTF8BOM, MyCharset{TEXT("UTF-8 BOM"), "UTF-8", {}, false}}, + {CharsetCode::GB18030, MyCharset{TEXT("GB18030"), "GB18030", {"GB"}, false}}, + + {CharsetCode::UTF16LE, MyCharset{TEXT("UTF-16LE"), "UTF-16LE", {}, false}}, + {CharsetCode::UTF16LEBOM, MyCharset{TEXT("UTF-16LE BOM"), "UTF-16LE", {}, false}}, + {CharsetCode::UTF16BE, MyCharset{TEXT("UTF-16BE"), "UTF-16BE", {}, false}}, + {CharsetCode::UTF16BEBOM, MyCharset{TEXT("UTF-16BE BOM"), "UTF-16BE", {}, false}}, + {CharsetCode::UTF32LE, MyCharset{TEXT("UTF-32LE"), "UTF-32LE", {}, false}}, + {CharsetCode::UTF32LEBOM, MyCharset{TEXT("UTF-32LE BOM"), "UTF-32LE", {}, false}}, + {CharsetCode::UTF32BE, MyCharset{TEXT("UTF-32BE"), "UTF-32BE", {}, false}}, + {CharsetCode::UTF32BEBOM, MyCharset{TEXT("UTF-32BE BOM"), "UTF-32BE", {}, false}}, + {CharsetCode::BIG5, MyCharset{TEXT("BIG5"), 
"Big5", {"Big5"}, false}}, + {CharsetCode::SHIFT_JIS, MyCharset{TEXT("SHIFT-JIS"), "SHIFT-JIS", {"SHIFT_JIS"}, false}}, + {CharsetCode::EUC_JP, MyCharset{TEXT("EUC-JP"), "EUC-JP", {"EUC-JP"}, false}}, + {CharsetCode::WINDOWS_1252, MyCharset{TEXT("WINDOWS-1252"), "WINDOWS-1252", {}, false}}, + {CharsetCode::WINDOWS_1258, MyCharset{TEXT("WINDOWS-1258"), "WINDOWS-1258", {}, false}}, + {CharsetCode::ISO_8859_1, MyCharset{TEXT("ISO-8859-1"), "ISO-8859-1", {}, false}}, + + {CharsetCode::VNI, MyCharset{TEXT("VNI"), "", {}, true}}, + {CharsetCode::VPS, MyCharset{TEXT("VPS"), "", {}, true}}, + {CharsetCode::VISCII, MyCharset{TEXT("VISCII"), "", {}, true}}, + {CharsetCode::TCVN3, MyCharset{TEXT("TCVN3"), "", {}, true}}, +}; std::tstring ToViewCharsetName(CharsetCode code) noexcept; diff --git a/src/Core/Core.cpp b/src/Core/Core.cpp index cde9005..f0e1827 100644 --- a/src/Core/Core.cpp +++ b/src/Core/Core.cpp @@ -23,6 +23,11 @@ std::u16string Decode(std::string_view src, CharsetCode code) { return {}; } + if (charsetCodeMap.at(code).isVietnameseLocalCharset) { + viet::Init(); + return viet::ConvertToUtf16LE(src, CharsetCodeToVietEncoding(code)); + } + // 从code转换到icu的字符集名称 auto icuCharsetName = ToICUCharsetName(code); @@ -57,7 +62,7 @@ struct FromUFLAGContext { UConverterFromUCallback subCallback; const void *subContext; std::vector unassigned; // 是否出现了不能转换的字符 - FromUFLAGContext() : subCallback(nullptr), subContext(nullptr), unassigned(false) {} + FromUFLAGContext() : subCallback(nullptr), subContext(nullptr) {} }; /** @@ -133,6 +138,11 @@ U_CAPI void U_EXPORT2 flagCB_fromU(const void *context, UConverterFromUnicodeArg } std::string Encode(std::u16string_view src, CharsetCode targetCode) { + if (charsetCodeMap.at(targetCode).isVietnameseLocalCharset) { + viet::Init(); + return viet::ConvertFromUtf16LE(src, CharsetCodeToVietEncoding(targetCode)); + } + // 从code转换到icu的字符集名称 auto icuCharsetName = ToICUCharsetName(targetCode); @@ -183,12 +193,29 @@ std::string 
Encode(std::u16string_view src, CharsetCode targetCode) { // UTF16LE -> UTF8 std::string ret = Encode(temp, CharsetCode::UTF8); - throw runtime_error(GetLanguageService().GetUtf8String(StringId::WILL_LOST_CHARACTERS) + ret); + throw UnassignedCharError(ret); } return target; } +std::string Convert(std::string_view src, ConvertParam inputParam) { + // 根据原编码得到Unicode字符串 + std::u16string buf = Decode(src, inputParam.originCode); + + // 如果需要转换换行符 + if (inputParam.doConvertLineBreaks) { + ChangeLineBreaks(buf, inputParam.targetLineBreak); + } + + // 转到目标编码 + return Encode(buf, inputParam.targetCode); +} + +viet::Encoding CharsetCodeToVietEncoding(CharsetCode code) noexcept { + return viet::to_encoding(to_utf8(ToViewCharsetName(code))); +} + Core::Core(std::tstring configFileName, CoreInitOption opt) : configFileName(configFileName), opt(opt) { // 读ini ReadConfigFromFile(); @@ -199,14 +226,17 @@ Core::Core(std::tstring configFileName, CoreInitOption opt) : configFileName(con }); #ifndef NDEBUG - UErrorCode err; - auto allNames = ucnv_openAllNames(&err); - while (1) { - auto name = uenum_next(allNames, nullptr, &err); - if (name == nullptr) { - break; - } - } + // ================================= + // ==== will detect memory leak ==== + // UErrorCode err; + // UEnumeration *allNames = ucnv_openAllNames(&err); + // while (1) { + // auto name = uenum_next(allNames, nullptr, &err); + // if (name == nullptr) { + // break; + // } + //} + // ================================ #endif } @@ -378,11 +408,11 @@ void Core::Clear() { listFileNames.clear(); } -Core::ConvertResult Core::Convert(const std::tstring &inputFilename, CharsetCode originCode, - LineBreaks originLineBreak) noexcept { +Core::ConvertFileResult Core::Convert(const std::tstring &inputFilename, CharsetCode originCode, + LineBreaks originLineBreak) noexcept { CharsetCode targetCode = config.outputCharset; - ConvertResult ret; + ConvertFileResult ret; try { ret.outputFileName = inputFilename; ret.targetLineBreaks = 
originLineBreak; @@ -464,17 +494,26 @@ Core::ConvertResult Core::Convert(const std::tstring &inputFilename, CharsetCode rawSize -= bomSize; } - // 根据原编码得到Unicode字符串 - auto buf = Decode(std::string_view(rawStart, rawSize), originCode); + ConvertParam param; + param.originCode = originCode; + param.targetCode = targetCode; + param.doConvertLineBreaks = + GetConfig().enableConvertLineBreaks && GetConfig().lineBreak != originLineBreak; + param.targetLineBreak = GetConfig().lineBreak; - // 如果需要转换换行符 - if (GetConfig().enableConvertLineBreaks && GetConfig().lineBreak != originLineBreak) { - ChangeLineBreaks(buf, GetConfig().lineBreak); - ret.targetLineBreaks = GetConfig().lineBreak; + // 转到目标编码 + std::string outputBuf; + try { + outputBuf = ::Convert(std::string_view(rawStart, rawSize), param); + } catch (const UnassignedCharError &err) { + throw std::runtime_error(GetLanguageService().GetUtf8String(StringId::WILL_LOST_CHARACTERS) + + err.what()); + }; + + if (param.doConvertLineBreaks) { + ret.targetLineBreaks = param.targetLineBreak; } - // 转到目标编码 - auto outputBuf = Encode(buf, targetCode); ret.outputFileSize = 0; // 写入文件 diff --git a/src/Core/Core.h b/src/Core/Core.h index 42ecde7..363ac80 100644 --- a/src/Core/Core.h +++ b/src/Core/Core.h @@ -4,6 +4,7 @@ #include "CharsetCode.h" #include "LineBreaks.h" #include "Config.h" +#include "Vietnamese.h" #include @@ -30,15 +31,44 @@ */ std::u16string Decode(std::string_view src, CharsetCode code); +/** + * 不可分配字符错误 + * 用于转换时出现不能转换到指定编码的情形。 + * err.what()方法会返回不能转换的字符组成的字符串(utf-8编码)。 + */ +class UnassignedCharError : public std::runtime_error { +public: + UnassignedCharError(const std::string &unassignedChars) : std::runtime_error(unassignedChars) {} +}; + /** * @brief 把unicode串编码为指定字符集 * @param src u16string(UTF-16LE) * @return std::string CAUTION: this string is only as a container of char[] with the charset of targetCode. * NOT mean its charset is ASCII or ANSI or others. 
- * @exception runtime_error ucnv出错/出现了不能转换的字符 + * @exception viet::ConvertError + * @exception UnassignedCharError 出现了不能转换的字符 + * @exception std::runtime_error ucnv出错 */ std::string Encode(std::u16string_view src, CharsetCode targetCode); +struct ConvertParam { + CharsetCode originCode = CharsetCode::UNKNOWN; /* charset of the input bytes */ + CharsetCode targetCode = CharsetCode::UNKNOWN; /* charset of the returned bytes */ + bool doConvertLineBreaks = false; /* defaults keep a partially-filled param well-defined */ + LineBreaks targetLineBreak{}; // target line break. if doConvertLineBreaks is false, this variable will be ignored. +}; + +/** + * Convert encoding. + * @exception viet::ConvertError + * @exception UnassignedCharError 出现了不能转换的字符 + * @exception std::runtime_error ucnv出错 + */ +std::string Convert(std::string_view src, ConvertParam inputParam); + +viet::Encoding CharsetCodeToVietEncoding(CharsetCode code) noexcept; + class io_error_ignore : public std::runtime_error { public: io_error_ignore() : runtime_error("ignored") {} @@ -99,7 +129,7 @@ class Core { void Clear(); - struct ConvertResult { + struct ConvertFileResult { std::tstring outputFileName; std::optional errInfo; LineBreaks targetLineBreaks; @@ -110,8 +140,8 @@ class Core { * @brief 转换一个文件。 * @return <输出文件的文件名, 出错信息> */ - ConvertResult Convert(const std::tstring &inputFilename, CharsetCode originCode, - LineBreaks originLineBreak) noexcept; + ConvertFileResult Convert(const std::tstring &inputFilename, CharsetCode originCode, + LineBreaks originLineBreak) noexcept; private: std::tstring configFileName; diff --git a/src/Core/Detect.cpp b/src/Core/Detect.cpp index 1458c62..a33cf76 100644 --- a/src/Core/Detect.cpp +++ b/src/Core/Detect.cpp @@ -37,13 +37,16 @@ std::tuple DetectByUCharDet(uchardet *det, const char *buf, in std::tuple DetectByUCSDet(const char *buf, int bufSize) { UErrorCode status = U_ZERO_ERROR; - UCharsetDetector *csd = ucsdet_open(&status); + auto csd = + std::unique_ptr(ucsdet_open(&status), [](UCharsetDetector *p) { + ucsdet_close(p); + }); DealWithUCNVError(status); - ucsdet_setText(csd, buf, bufSize, &status); + ucsdet_setText(csd.get(), buf, bufSize, 
&status); DealWithUCNVError(status); - const UCharsetMatch *ucm = ucsdet_detect(csd, &status); + const UCharsetMatch *ucm = ucsdet_detect(csd.get(), &status); DealWithUCNVError(status); int32_t confidence = ucsdet_getConfidence(ucm, &status); diff --git a/src/Core/Vietnamese.h b/src/Core/Vietnamese.h index 0fc3437..3898bed 100644 --- a/src/Core/Vietnamese.h +++ b/src/Core/Vietnamese.h @@ -28,7 +28,7 @@ inline std::string_view to_string(Encoding encoding) noexcept { case Encoding::UTF8: return "UTF8"; case Encoding::UTF16LE: - return "UTF16LE"; + return "UTF-16LE"; case Encoding::VNI: return "VNI"; case Encoding::VPS: @@ -43,6 +43,29 @@ inline std::string_view to_string(Encoding encoding) noexcept { return ""; } +inline Encoding to_encoding(std::string_view sv) noexcept { /* inverse of to_string(Encoding); asserts on an unrecognised name */ + if (sv == "UTF8") { + return Encoding::UTF8; + } + if (sv == "UTF-16LE") { /* must mirror to_string(Encoding::UTF16LE) */ + return Encoding::UTF16LE; + } + if (sv == "VNI") { + return Encoding::VNI; + } + if (sv == "VPS") { + return Encoding::VPS; + } + if (sv == "VISCII") { + return Encoding::VISCII; + } + if (sv == "TCVN3") { + return Encoding::TCVN3; + } + assert(0); + return Encoding::UTF8; +} + class ConvertError : public std::runtime_error { public: ConvertError(std::string content, int position, Encoding srcEncoding, Encoding destEncoding) noexcept; diff --git a/src/DialogMain.cpp b/src/DialogMain.cpp index 18b1aac..4c917e5 100644 --- a/src/DialogMain.cpp +++ b/src/DialogMain.cpp @@ -18,7 +18,7 @@ #undef min #undef max -const std::tstring appTitle = TEXT("SmartCharsetConverter v0.82 by Tom Willow"); +const std::tstring appTitle = TEXT("SmartCharsetConverter v0.9 by Tom Willow"); const std::tstring configFileName = TEXT("SmartCharsetConverter.json"); @@ -899,10 +899,10 @@ LRESULT DialogMain::OnDropFiles(UINT uMsg, WPARAM wParam, LPARAM lParam, BOOL &b UINT nFileNum = DragQueryFile(hDrop, 0xFFFFFFFF, NULL, 0); // 拖拽文件个数 TCHAR strFileName[MAX_PATH]; for (UINT i = 0; i < nFileNum; i++) { - DragQueryFile(hDrop, i, strFileName, MAX_PATH); 
//获得拖曳的文件名 + DragQueryFile(hDrop, i, strFileName, MAX_PATH); // 获得拖曳的文件名 filenames.push_back(strFileName); } - DragFinish(hDrop); //释放hDrop + DragFinish(hDrop); // 释放hDrop AddItemsAsync(filenames); diff --git a/src/test/Core_Vietnamese_test.cpp b/src/test/Core_Vietnamese_test.cpp new file mode 100644 index 0000000..a9f4f53 --- /dev/null +++ b/src/test/Core_Vietnamese_test.cpp @@ -0,0 +1,139 @@ +#include "config.h" + +#include "Core/Core.h" + +#include +#include + +#include + +#include +#include +#include + +TEST(CoreVietnamese, ConvertToUtf8) { + SetConsoleOutputCP(65001); // 设置代码页为UTF-8 + + std::wstring inputFilename = utf8_to_wstring(SmartCharsetConverter_TEST_DIR) + L"/tcvn/demo1-tcvn.txt"; + auto [buf, bufSize] = ReadFileToBuffer(inputFilename); + + ConvertParam param; + param.originCode = CharsetCode::TCVN3; + param.targetCode = CharsetCode::UTF8; + param.doConvertLineBreaks = false; + std::string utf8Str = Convert(std::string_view(buf.get(), bufSize), param); + // WriteFileFromBuffer(utf8_to_wstring(SmartCharsetConverter_TEST_DIR) + L"/tcvn/demo1-got.txt", utf8Str.c_str(), + // utf8Str.size()); + + std::wstring expectFilename = utf8_to_wstring(SmartCharsetConverter_TEST_DIR) + L"/tcvn/demo1-utf8.txt"; + auto [utf8Buf, utf8BufSize] = ReadFileToBuffer(expectFilename); + std::string utf8ExpectStr(utf8Buf.get(), utf8BufSize); + + ASSERT_EQ(utf8Str.size(), utf8BufSize); + ASSERT_EQ(utf8Str, utf8ExpectStr); +} + +TEST(CoreVietnamese, ConvertToUtf16LE) { + SetConsoleOutputCP(65001); // 设置代码页为UTF-8 + + std::wstring inputFilename = utf8_to_wstring(SmartCharsetConverter_TEST_DIR) + L"/tcvn/demo1-tcvn.txt"; + auto [buf, bufSize] = ReadFileToBuffer(inputFilename); + + ConvertParam param; + param.originCode = CharsetCode::TCVN3; + param.targetCode = CharsetCode::UTF16LE; + param.doConvertLineBreaks = false; + std::string ret = Convert(std::string_view(buf.get(), bufSize), param); + std::u16string utf16LEStr; + utf16LEStr.resize(ret.size() / sizeof(char16_t)); + 
memcpy(utf16LEStr.data(), ret.data(), ret.size()); + + // WriteFileFromBuffer(utf8_to_wstring(SmartCharsetConverter_TEST_DIR) + L"/tcvn/demo1-got.txt", utf8Str.c_str(), + // utf8Str.size()); + + std::wstring expectFilename = utf8_to_wstring(SmartCharsetConverter_TEST_DIR) + L"/tcvn/demo1-utf16le.txt"; + auto [utf16LEBuf, utf16LEBufSize] = ReadFileToBuffer(expectFilename); + std::size_t utf16LEBufPsudoCharNums = utf16LEBufSize / sizeof(char16_t); + std::u16string utf16LEExpectStr(reinterpret_cast(utf16LEBuf.get()), utf16LEBufPsudoCharNums); + + ASSERT_EQ(utf16LEStr.size(), utf16LEBufPsudoCharNums); + ASSERT_EQ(utf16LEStr, utf16LEExpectStr); +} + +TEST(CoreVietnamese, ConvertFromUtf8) { + SetConsoleOutputCP(65001); // 设置代码页为UTF-8 + + std::wstring inputFilename = utf8_to_wstring(SmartCharsetConverter_TEST_DIR) + L"/tcvn/demo1-utf8.txt"; + auto [buf, bufSize] = ReadFileToBuffer(inputFilename); + + ConvertParam param; + param.originCode = CharsetCode::UTF8; + param.targetCode = CharsetCode::TCVN3; + param.doConvertLineBreaks = false; + std::string tcvn3StrGot = Convert(std::string_view(buf.get(), bufSize), param); + // WriteFileFromBuffer(utf8_to_wstring(SmartCharsetConverter_TEST_DIR) + L"/tcvn/demo1-got.txt", utf8Str.c_str(), + // utf8Str.size()); + + std::wstring expectFilename = utf8_to_wstring(SmartCharsetConverter_TEST_DIR) + L"/tcvn/demo1-tcvn.txt"; + auto [tcvn3BufExpected, tcvn3BufExpectedSize] = ReadFileToBuffer(expectFilename); + std::string tcvn3StrExpected(tcvn3BufExpected.get(), tcvn3BufExpectedSize); + + ASSERT_EQ(tcvn3StrGot.size(), tcvn3BufExpectedSize); + ASSERT_EQ(tcvn3StrGot, tcvn3StrExpected); +} + +TEST(CoreVietnamese, ConvertFromUtf16LE) { + SetConsoleOutputCP(65001); // 设置代码页为UTF-8 + + std::wstring inputFilename = utf8_to_wstring(SmartCharsetConverter_TEST_DIR) + L"/tcvn/demo1-utf16le.txt"; + auto [buf, bufSize] = ReadFileToBuffer(inputFilename); + + ConvertParam param; + param.originCode = CharsetCode::UTF16LE; + param.targetCode = 
CharsetCode::TCVN3; + param.doConvertLineBreaks = false; + std::string tcvn3StrGot = Convert(std::string_view((buf.get()), bufSize), param); + // WriteFileFromBuffer(utf8_to_wstring(SmartCharsetConverter_TEST_DIR) + L"/tcvn/demo1-got.txt", utf8Str.c_str(), + // utf8Str.size()); + + std::wstring expectFilename = utf8_to_wstring(SmartCharsetConverter_TEST_DIR) + L"/tcvn/demo1-tcvn.txt"; + auto [tcvn3BufExpected, tcvn3BufExpectedSize] = ReadFileToBuffer(expectFilename); + std::string tcvn3StrExpected(tcvn3BufExpected.get(), tcvn3BufExpectedSize); + + ASSERT_EQ(tcvn3StrGot.size(), tcvn3BufExpectedSize); + ASSERT_EQ(tcvn3StrGot, tcvn3StrExpected); +} + +/** + * @exception file_io_error + * ConvertError + */ +void TestBuiltinConvertOtherToOther(CharsetCode middleEncoding) { + + std::wstring inputFilename = utf8_to_wstring(SmartCharsetConverter_TEST_DIR) + L"/tcvn/demo1-tcvn.txt"; + auto [buf, bufSize] = ReadFileToBuffer(inputFilename); + + std::string vniStr; + + ConvertParam param; + param.originCode = CharsetCode::TCVN3; + param.targetCode = middleEncoding; + param.doConvertLineBreaks = false; + EXPECT_NO_THROW(vniStr = Convert(std::string_view(buf.get(), bufSize), param)); + + std::string tcvnStrGot; + param.originCode = middleEncoding; + param.targetCode = CharsetCode::TCVN3; + EXPECT_NO_THROW(tcvnStrGot = Convert(vniStr, param)); + + EXPECT_EQ(bufSize, tcvnStrGot.size()); + EXPECT_EQ(std::string(buf.get(), bufSize), tcvnStrGot); +} + +TEST(CoreVietnamese, ConvertOtherToOther) { + SetConsoleOutputCP(65001); // 设置代码页为UTF-8 + + TestBuiltinConvertOtherToOther(CharsetCode::VNI); + TestBuiltinConvertOtherToOther(CharsetCode::VPS); + TestBuiltinConvertOtherToOther(CharsetCode::VISCII); +} diff --git a/src/test/Core_test.cpp b/src/test/Core_test.cpp index 6e9bef5..8d54267 100644 --- a/src/test/Core_test.cpp +++ b/src/test/Core_test.cpp @@ -1,5 +1,8 @@ #include "config.h" +#include "memory_leak_detection.h" + +#include #include #include #include @@ -10,7 +13,7 @@ #include 
#include -TEST(Core, DetectEncoding) { +void fun() { SetConsoleOutputCP(65001); // 设置代码页为UTF-8 std::string filename = std::string(SmartCharsetConverter_TEST_DIR) + "/expected.txt"; @@ -28,14 +31,14 @@ TEST(Core, DetectEncoding) { auto lineBreak = GetLineBreaks(utf16leStr.data(), utf16leStr.size()); core.SetOutputCharset(CharsetCode::GB18030); - Core::ConvertResult ret = core.Convert(utf8_to_wstring(filename), code, lineBreak); + Core::ConvertFileResult ret = core.Convert(utf8_to_wstring(filename), code, lineBreak); ASSERT_FALSE(ret.errInfo.has_value()); std::filesystem::rename("./expected.txt", "expected-out.txt"); { core.SetOutputCharset(CharsetCode::UTF8); - Core::ConvertResult ret = + Core::ConvertFileResult ret = core.Convert(utf8_to_wstring(u8"./expected-out.txt"), CharsetCode::GB18030, lineBreak); ASSERT_FALSE(ret.errInfo.has_value()); @@ -45,6 +48,24 @@ TEST(Core, DetectEncoding) { } } +TEST(Core, EncodeWithUnassignedChars) { + SetConsoleOutputCP(65001); // 设置代码页为UTF-8 + // MemoryLeakDetection mld; + + try { + Encode(u"abcdefg小舟从此逝,江海寄余生。asdfghjkl", CharsetCode::WINDOWS_1252); + FAIL(); + } catch (const UnassignedCharError &err) { + ASSERT_EQ(std::string(err.what()), std::string(u8"小舟从此逝,江海寄余生。")); + } +} + +TEST(Core, DetectEncoding) { + // MemoryLeakDetection mld; + + fun(); +} + TEST(Core, DetectEncodingMulti) { SetConsoleOutputCP(65001); // 设置代码页为UTF-8 diff --git a/src/test/memory_leak_detection.h b/src/test/memory_leak_detection.h new file mode 100644 index 0000000..b768f55 --- /dev/null +++ b/src/test/memory_leak_detection.h @@ -0,0 +1,10 @@ +#pragma once + +#ifdef WIN32 +#include "memory_leak_detection_win.h" +#else +class MemoryLeakDetection final { +public: + MemoryLeakDetection() {} +}; +#endif diff --git a/src/test/memory_leak_detection_win.h b/src/test/memory_leak_detection_win.h new file mode 100644 index 0000000..938cbde --- /dev/null +++ b/src/test/memory_leak_detection_win.h @@ -0,0 +1,51 @@ +#pragma once + +#ifdef WIN32 + +#include + +#undef 
max +#undef min + +#define _CRTDBG_MAP_ALLOC // to get more details +#include +#include //for malloc and free + +#include + +#include +#include + +class MemoryLeakDetection final { +public: + MemoryLeakDetection() { + _CrtMemCheckpoint(&sOld); // take a snapshot + } + + ~MemoryLeakDetection() { + _CrtMemCheckpoint(&sNew); // take a snapshot + if (_CrtMemDifference(&sDiff, &sOld, &sNew)) // if there is a difference + { + // OutputDebugString(TEXT("-----------_CrtMemDumpStatistics ---------")); + //_CrtMemDumpStatistics(&sDiff); + // OutputDebugString(TEXT("-----------_CrtMemDumpAllObjectsSince ---------")); + //_CrtMemDumpAllObjectsSince(&sOld); + // OutputDebugString(TEXT("-----------_CrtDumpMemoryLeaks ---------")); + _CrtDumpMemoryLeaks(); + + EXPECT_TRUE(0 && "Memory leak is detected! See debug output for detail."); + } + } + + void SetBreakAlloc(long index) const noexcept { + (index); + _CrtSetBreakAlloc(index); + } + +private: + _CrtMemState sOld; + _CrtMemState sNew; + _CrtMemState sDiff; +}; + +#endif \ No newline at end of file