Merge branch 'dev'

tomwillow · Mar 6, 2024 · 8953f5b · 8953f5b
2 parents 5a41bf9 + a7fdd3a
commit 8953f5b
Show file tree

Hide file tree

Showing 13 changed files with 392 additions and 59 deletions.
diff --git a/README-en.md b/README-en.md
@@ -11,10 +11,10 @@ Function:
 - Check whether characters are lost to ensure that the conversion process is reversible
 - Support command line (use $ ./SmartCharsetConverter --help for details)
 - Multi-language support (click the "hammer" button in the bottom right corner to switch languages)
+- Support Vietnamese charsets (VNI/VPS/VISCII/TCVN3). These charsets cannot be detected automatically yet; please use the "No File Filter" mode.
 
 Supported Platform:
 
-- Win11 x64
 - Win10 x64
 - Win7 x64 (haven’t tried it yet)
 
@@ -74,6 +74,8 @@ v0.81 Add Spanish language pack support (thanks to [Carlos Sánchez](https://git
 
 v0.82 Check if characters will be lost when specifying encoding manually.
 
+v0.9 Support converting multiple Vietnamese charsets: VNI/VPS/VISCII/TCVN3
+
 # Build
 
 1. Confirm the compilation environment: win10+ x64, Visual Studio 2019+, cmake.

diff --git a/README.md b/README.md
@@ -11,10 +11,10 @@
 - 转换时会检查是否丢失字符，确保转换过程可逆
 - 支持命令行（使用 $ ./SmartCharsetConverter --help 查看）
 - 多语言支持（点击右下角“锤子”按钮切换语言）
+- 支持越南语字符集（VNI/VPS/VISCII/TCVN3）和其他字符集互转（目前还不能自动探测这几个字符集，请使用“不过滤”模式）
 
 运行要求：
 
-- Win11 x64
 - Win10 x64
 - Win7 x64（理论上可以，没尝试）
 
@@ -70,8 +70,11 @@ v0.8 重新编排界面(感谢[Carlos Sánchez](https://github.com/c-sanchez))
 支持多语言（内置简体中文和英文）。增加多语言选择。
 
 v0.81 增加西班牙语支持(感谢[Carlos Sánchez](https://github.com/c-sanchez))。
+
 v0.82 现在手动指定字符集会检查是否丢失字符。
 
+v0.9 支持多个越南语字符集的转换：VNI/VPS/VISCII/TCVN3
+
 # 构建方法
 
 1. 确认编译环境：win10+ x64, Visual Studio 2019+, cmake。

diff --git a/src/CLIHandler.cpp b/src/CLIHandler.cpp
@@ -302,7 +302,7 @@ int CLIMain(const std::vector<std::wstring> &args) noexcept {
             return;
         }
 
-        Core::ConvertResult ret = core.Convert(inputFilename, addedItem.srcCharset, addedItem.srcLineBreak);
+        Core::ConvertFileResult ret = core.Convert(inputFilename, addedItem.srcCharset, addedItem.srcLineBreak);
         if (ret.errInfo.has_value()) {
             wcout << L"  大小: " << FileSizeToTString(addedItem.filesize) << L"\n";
             wcout << L"  字符集: " << ToViewCharsetName(addedItem.srcCharset) << L"\n";

diff --git a/src/Core/CharsetCode.h b/src/Core/CharsetCode.h
@@ -33,6 +33,11 @@ enum class CharsetCode {
     WINDOWS_1258, // Vietnamese
     ISO_8859_1,
 
+    VNI,    // Vietnamese
+    VPS,    // Vietnamese
+    VISCII, // Vietnamese
+    TCVN3,  // Vietnamese
+
     CHARSET_CODE_END
 
     // 添加字符集需要同步修改：charsetCodeMap
@@ -43,32 +48,39 @@ struct MyCharset {
     std::string icuName;   // the name used by icu
     std::unordered_set<std::string>
         icuNames; // if icu detected these charset names, map all of them to be the main charset
+    bool isVietnameseLocalCharset;
 };
 
 // 字符集code到名称的映射表
 const std::unordered_map<CharsetCode, MyCharset> charsetCodeMap = {
     // CharsetCode枚举值, viewName显示名称, icuName, 可能的别名
-    {CharsetCode::UNKNOWN, MyCharset{TEXT("未知"), "-", {}}},
-    {CharsetCode::EMPTY, MyCharset{TEXT("空"), "-", {}}},
-    {CharsetCode::NOT_SUPPORTED, MyCharset{TEXT("不支持"), "-", {}}},
-    {CharsetCode::UTF8, MyCharset{TEXT("UTF-8"), "UTF-8", {"ASCII", "ANSI", "UTF8"}}},
-    {CharsetCode::UTF8BOM, MyCharset{TEXT("UTF-8 BOM"), "UTF-8", {}}},
-    {CharsetCode::GB18030, MyCharset{TEXT("GB18030"), "GB18030", {"GB"}}},
-
-    {CharsetCode::UTF16LE, MyCharset{TEXT("UTF-16LE"), "UTF-16LE", {}}},
-    {CharsetCode::UTF16LEBOM, MyCharset{TEXT("UTF-16LE BOM"), "UTF-16LE", {}}},
-    {CharsetCode::UTF16BE, MyCharset{TEXT("UTF-16BE"), "UTF-16BE", {}}},
-    {CharsetCode::UTF16BEBOM, MyCharset{TEXT("UTF-16BE BOM"), "UTF-16BE", {}}},
-    {CharsetCode::UTF32LE, MyCharset{TEXT("UTF-32LE"), "UTF-32LE", {}}},
-    {CharsetCode::UTF32LEBOM, MyCharset{TEXT("UTF-32LE BOM"), "UTF-32LE", {}}},
-    {CharsetCode::UTF32BE, MyCharset{TEXT("UTF-32BE"), "UTF-32BE", {}}},
-    {CharsetCode::UTF32BEBOM, MyCharset{TEXT("UTF-32BE BOM"), "UTF-32BE", {}}},
-    {CharsetCode::BIG5, MyCharset{TEXT("BIG5"), "Big5", {"Big5"}}},
-    {CharsetCode::SHIFT_JIS, MyCharset{TEXT("SHIFT-JIS"), "SHIFT-JIS", {"SHIFT_JIS"}}},
-    {CharsetCode::EUC_JP, MyCharset{TEXT("EUC-JP"), "EUC-JP", {"EUC-JP"}}},
-    {CharsetCode::WINDOWS_1252, MyCharset{TEXT("WINDOWS-1252"), "WINDOWS-1252", {}}},
-    {CharsetCode::WINDOWS_1258, MyCharset{TEXT("WINDOWS-1258"), "WINDOWS-1258", {}}},
-    {CharsetCode::ISO_8859_1, MyCharset{TEXT("ISO-8859-1"), "ISO-8859-1", {}}}};
+    {CharsetCode::UNKNOWN, MyCharset{TEXT("未知"), "-", {}, false}},
+    {CharsetCode::EMPTY, MyCharset{TEXT("空"), "-", {}, false}},
+    {CharsetCode::NOT_SUPPORTED, MyCharset{TEXT("不支持"), "-", {}, false}},
+    {CharsetCode::UTF8, MyCharset{TEXT("UTF-8"), "UTF-8", {"ASCII", "ANSI", "UTF8"}, false}},
+    {CharsetCode::UTF8BOM, MyCharset{TEXT("UTF-8 BOM"), "UTF-8", {}, false}},
+    {CharsetCode::GB18030, MyCharset{TEXT("GB18030"), "GB18030", {"GB"}, false}},
+
+    {CharsetCode::UTF16LE, MyCharset{TEXT("UTF-16LE"), "UTF-16LE", {}, false}},
+    {CharsetCode::UTF16LEBOM, MyCharset{TEXT("UTF-16LE BOM"), "UTF-16LE", {}, false}},
+    {CharsetCode::UTF16BE, MyCharset{TEXT("UTF-16BE"), "UTF-16BE", {}, false}},
+    {CharsetCode::UTF16BEBOM, MyCharset{TEXT("UTF-16BE BOM"), "UTF-16BE", {}, false}},
+    {CharsetCode::UTF32LE, MyCharset{TEXT("UTF-32LE"), "UTF-32LE", {}, false}},
+    {CharsetCode::UTF32LEBOM, MyCharset{TEXT("UTF-32LE BOM"), "UTF-32LE", {}, false}},
+    {CharsetCode::UTF32BE, MyCharset{TEXT("UTF-32BE"), "UTF-32BE", {}, false}},
+    {CharsetCode::UTF32BEBOM, MyCharset{TEXT("UTF-32BE BOM"), "UTF-32BE", {}, false}},
+    {CharsetCode::BIG5, MyCharset{TEXT("BIG5"), "Big5", {"Big5"}, false}},
+    {CharsetCode::SHIFT_JIS, MyCharset{TEXT("SHIFT-JIS"), "SHIFT-JIS", {"SHIFT_JIS"}, false}},
+    {CharsetCode::EUC_JP, MyCharset{TEXT("EUC-JP"), "EUC-JP", {"EUC-JP"}, false}},
+    {CharsetCode::WINDOWS_1252, MyCharset{TEXT("WINDOWS-1252"), "WINDOWS-1252", {}, false}},
+    {CharsetCode::WINDOWS_1258, MyCharset{TEXT("WINDOWS-1258"), "WINDOWS-1258", {}, false}},
+    {CharsetCode::ISO_8859_1, MyCharset{TEXT("ISO-8859-1"), "ISO-8859-1", {}, false}},
+
+    {CharsetCode::VNI, MyCharset{TEXT("VNI"), "", {}, true}},
+    {CharsetCode::VPS, MyCharset{TEXT("VPS"), "", {}, true}},
+    {CharsetCode::VISCII, MyCharset{TEXT("VISCII"), "", {}, true}},
+    {CharsetCode::TCVN3, MyCharset{TEXT("TCVN3"), "", {}, true}},
+};
 
 std::tstring ToViewCharsetName(CharsetCode code) noexcept;
 

diff --git a/src/Core/Core.cpp b/src/Core/Core.cpp
@@ -23,6 +23,11 @@ std::u16string Decode(std::string_view src, CharsetCode code) {
         return {};
     }
 
+    if (charsetCodeMap.at(code).isVietnameseLocalCharset) {
+        viet::Init();
+        return viet::ConvertToUtf16LE(src, CharsetCodeToVietEncoding(code));
+    }
+
     // 从code转换到icu的字符集名称
     auto icuCharsetName = ToICUCharsetName(code);
 
@@ -57,7 +62,7 @@ struct FromUFLAGContext {
     UConverterFromUCallback subCallback;
     const void *subContext;
     std::vector<UChar32> unassigned; // 是否出现了不能转换的字符
-    FromUFLAGContext() : subCallback(nullptr), subContext(nullptr), unassigned(false) {}
+    FromUFLAGContext() : subCallback(nullptr), subContext(nullptr) {}
 };
 
 /**
@@ -133,6 +138,11 @@ U_CAPI void U_EXPORT2 flagCB_fromU(const void *context, UConverterFromUnicodeArg
 }
 
 std::string Encode(std::u16string_view src, CharsetCode targetCode) {
+    if (charsetCodeMap.at(targetCode).isVietnameseLocalCharset) {
+        viet::Init();
+        return viet::ConvertFromUtf16LE(src, CharsetCodeToVietEncoding(targetCode));
+    }
+
     // 从code转换到icu的字符集名称
     auto icuCharsetName = ToICUCharsetName(targetCode);
 
@@ -183,12 +193,29 @@ std::string Encode(std::u16string_view src, CharsetCode targetCode) {
         // UTF16LE -> UTF8
         std::string ret = Encode(temp, CharsetCode::UTF8);
 
-        throw runtime_error(GetLanguageService().GetUtf8String(StringId::WILL_LOST_CHARACTERS) + ret);
+        throw UnassignedCharError(ret);
     }
 
     return target;
 }
 
+/**
+ * Re-encode src from inputParam.originCode into inputParam.targetCode.
+ * Line breaks are rewritten to inputParam.targetLineBreak on the way, but only
+ * when inputParam.doConvertLineBreaks is set.
+ */
+std::string Convert(std::string_view src, ConvertParam inputParam) {
+    // Step 1: decode the raw bytes of the origin charset into a Unicode string.
+    auto unicodeText = Decode(src, inputParam.originCode);
+
+    // Step 2: rewrite the line breaks, if the caller asked for it.
+    const bool wantsLineBreakChange = inputParam.doConvertLineBreaks;
+    if (wantsLineBreakChange) {
+        ChangeLineBreaks(unicodeText, inputParam.targetLineBreak);
+    }
+
+    // Step 3: encode the Unicode string into the target charset.
+    std::string encoded = Encode(unicodeText, inputParam.targetCode);
+    return encoded;
+}
+
+// Translate a CharsetCode into a viet::Encoding by feeding the charset's view
+// name (converted to UTF-8) to viet::to_encoding.
+viet::Encoding CharsetCodeToVietEncoding(CharsetCode code) noexcept {
+    const auto viewName = ToViewCharsetName(code);
+    const auto viewNameUtf8 = to_utf8(viewName);
+    return viet::to_encoding(viewNameUtf8);
+}
+
 Core::Core(std::tstring configFileName, CoreInitOption opt) : configFileName(configFileName), opt(opt) {
     // 读ini
     ReadConfigFromFile();
@@ -199,14 +226,17 @@ Core::Core(std::tstring configFileName, CoreInitOption opt) : configFileName(con
     });
 
 #ifndef NDEBUG
-    UErrorCode err;
-    auto allNames = ucnv_openAllNames(&err);
-    while (1) {
-        auto name = uenum_next(allNames, nullptr, &err);
-        if (name == nullptr) {
-            break;
-        }
-    }
+    // =================================
+    // ==== will detect memory leak ====
+    // UErrorCode err;
+    // UEnumeration *allNames = ucnv_openAllNames(&err);
+    // while (1) {
+    //    auto name = uenum_next(allNames, nullptr, &err);
+    //    if (name == nullptr) {
+    //        break;
+    //    }
+    //}
+    // ================================
 #endif
 }
 
@@ -378,11 +408,11 @@ void Core::Clear() {
     listFileNames.clear();
 }
 
-Core::ConvertResult Core::Convert(const std::tstring &inputFilename, CharsetCode originCode,
-                                  LineBreaks originLineBreak) noexcept {
+Core::ConvertFileResult Core::Convert(const std::tstring &inputFilename, CharsetCode originCode,
+                                      LineBreaks originLineBreak) noexcept {
     CharsetCode targetCode = config.outputCharset;
 
-    ConvertResult ret;
+    ConvertFileResult ret;
     try {
         ret.outputFileName = inputFilename;
         ret.targetLineBreaks = originLineBreak;
@@ -464,17 +494,26 @@ Core::ConvertResult Core::Convert(const std::tstring &inputFilename, CharsetCode
                     rawSize -= bomSize;
                 }
 
-                // 根据原编码得到Unicode字符串
-                auto buf = Decode(std::string_view(rawStart, rawSize), originCode);
+                ConvertParam param;
+                param.originCode = originCode;
+                param.targetCode = targetCode;
+                param.doConvertLineBreaks =
+                    GetConfig().enableConvertLineBreaks && GetConfig().lineBreak != originLineBreak;
+                param.targetLineBreak = GetConfig().lineBreak;
 
-                // 如果需要转换换行符
-                if (GetConfig().enableConvertLineBreaks && GetConfig().lineBreak != originLineBreak) {
-                    ChangeLineBreaks(buf, GetConfig().lineBreak);
-                    ret.targetLineBreaks = GetConfig().lineBreak;
+                // 转到目标编码
+                std::string outputBuf;
+                try {
+                    outputBuf = ::Convert(std::string_view(rawStart, rawSize), param);
+                } catch (const UnassignedCharError &err) {
+                    throw std::runtime_error(GetLanguageService().GetUtf8String(StringId::WILL_LOST_CHARACTERS) +
+                                             err.what());
+                };
+
+                if (param.doConvertLineBreaks) {
+                    ret.targetLineBreaks = param.targetLineBreak;
                 }
 
-                // 转到目标编码
-                auto outputBuf = Encode(buf, targetCode);
                 ret.outputFileSize = 0;
 
                 // 写入文件

diff --git a/src/Core/Core.h b/src/Core/Core.h
@@ -4,6 +4,7 @@
 #include "CharsetCode.h"
 #include "LineBreaks.h"
 #include "Config.h"
+#include "Vietnamese.h"
 
 #include <tstring.h>
 
@@ -30,15 +31,44 @@
  */
 std::u16string Decode(std::string_view src, CharsetCode code);
 
+/**
+ * Unassigned-character error.
+ * Thrown when a conversion meets characters that cannot be converted to the
+ * requested charset.
+ * what() returns a UTF-8 string made up of the characters that could not be
+ * converted.
+ */
+class UnassignedCharError : public std::runtime_error {
+public:
+    // explicit: a plain std::string must never silently turn into an exception object.
+    explicit UnassignedCharError(const std::string &unassignedChars) : std::runtime_error(unassignedChars) {}
+};
+
 /**
  * @brief 把unicode串编码为指定字符集
  * @param src u16string(UTF-16LE)
  * @return std::string CAUTION: this string is only as a container of char[] with the charset of targetCode.
  *          NOT mean its charset is ASCII or ANSI or others.
- * @exception runtime_error ucnv出错/出现了不能转换的字符
+ * @exception viet::ConvertError
+ * @exception UnassignedCharError 出现了不能转换的字符
+ * @exception std::runtime_error ucnv出错
  */
 std::string Encode(std::u16string_view src, CharsetCode targetCode);
 
+/**
+ * Parameters for Convert().
+ * Every member has a default, so a default-constructed ConvertParam never holds
+ * indeterminate values even if a caller forgets to assign one of the fields.
+ */
+struct ConvertParam {
+    CharsetCode originCode = CharsetCode::UNKNOWN; // charset the input bytes are decoded with
+    CharsetCode targetCode = CharsetCode::UNKNOWN; // charset the output bytes are encoded with
+    bool doConvertLineBreaks = false;              // whether line breaks are rewritten
+    LineBreaks targetLineBreak{}; // target line break. if doConvertLineBreaks is false, this variable will be ignored.
+};
+
+/**
+ * Convert encoding.
+ * Decodes src with inputParam.originCode, optionally rewrites the line breaks
+ * (when inputParam.doConvertLineBreaks is set) and encodes the result with
+ * inputParam.targetCode.
+ * @param src raw bytes in the origin charset
+ * @param inputParam origin/target charsets and line-break options
+ * @return the converted bytes, held in a std::string
+ * @exception viet::ConvertError
+ * @exception UnassignedCharError characters that cannot be converted were found
+ * @exception std::runtime_error ucnv reported an error
+ */
+std::string Convert(std::string_view src, ConvertParam inputParam);
+
+/**
+ * Map a CharsetCode to the viet::Encoding that has the same name as the
+ * charset's view name.
+ */
+viet::Encoding CharsetCodeToVietEncoding(CharsetCode code) noexcept;
+
 class io_error_ignore : public std::runtime_error {
 public:
     io_error_ignore() : runtime_error("ignored") {}
@@ -99,7 +129,7 @@ class Core {
 
     void Clear();
 
-    struct ConvertResult {
+    struct ConvertFileResult {
         std::tstring outputFileName;
         std::optional<std::string> errInfo;
         LineBreaks targetLineBreaks;
@@ -110,8 +140,8 @@ class Core {
      * @brief 转换一个文件。
      * @return <输出文件的文件名, 出错信息>
      */
-    ConvertResult Convert(const std::tstring &inputFilename, CharsetCode originCode,
-                          LineBreaks originLineBreak) noexcept;
+    ConvertFileResult Convert(const std::tstring &inputFilename, CharsetCode originCode,
+                              LineBreaks originLineBreak) noexcept;
 
 private:
     std::tstring configFileName;

diff --git a/src/Core/Detect.cpp b/src/Core/Detect.cpp
@@ -37,13 +37,16 @@ std::tuple<std::string, int> DetectByUCharDet(uchardet *det, const char *buf, in
 
 std::tuple<std::string, int> DetectByUCSDet(const char *buf, int bufSize) {
     UErrorCode status = U_ZERO_ERROR;
-    UCharsetDetector *csd = ucsdet_open(&status);
+    auto csd =
+        std::unique_ptr<UCharsetDetector, void (*)(UCharsetDetector *)>(ucsdet_open(&status), [](UCharsetDetector *p) {
+            ucsdet_close(p);
+        });
     DealWithUCNVError(status);
 
-    ucsdet_setText(csd, buf, bufSize, &status);
+    ucsdet_setText(csd.get(), buf, bufSize, &status);
     DealWithUCNVError(status);
 
-    const UCharsetMatch *ucm = ucsdet_detect(csd, &status);
+    const UCharsetMatch *ucm = ucsdet_detect(csd.get(), &status);
     DealWithUCNVError(status);
 
     int32_t confidence = ucsdet_getConfidence(ucm, &status);

diff --git a/src/Core/Vietnamese.h b/src/Core/Vietnamese.h
@@ -28,7 +28,7 @@ inline std::string_view to_string(Encoding encoding) noexcept {
     case Encoding::UTF8:
         return "UTF8";
     case Encoding::UTF16LE:
-        return "UTF16LE";
+        return "UTF-16LE";
     case Encoding::VNI:
         return "VNI";
     case Encoding::VPS:
@@ -43,6 +43,29 @@ inline std::string_view to_string(Encoding encoding) noexcept {
     return "";
 }
 
+/**
+ * Parse an encoding name into its Encoding value.
+ * Recognised names: "UTF8", "UTF-16LE", "VNI", "VPS", "VISCII", "TCVN3".
+ * Any other name trips assert(0); when assertions are compiled out the
+ * function falls back to Encoding::UTF8.
+ */
+inline Encoding to_encoding(std::string_view sv) noexcept {
+    if (sv == "UTF8") {
+        return Encoding::UTF8;
+    }
+    if (sv == "UTF-16LE") {
+        // Previously returned Encoding::UTF8, so "UTF-16LE" did not round-trip with to_string().
+        return Encoding::UTF16LE;
+    }
+    if (sv == "VNI") {
+        return Encoding::VNI;
+    }
+    if (sv == "VPS") {
+        return Encoding::VPS;
+    }
+    if (sv == "VISCII") {
+        return Encoding::VISCII;
+    }
+    if (sv == "TCVN3") {
+        return Encoding::TCVN3;
+    }
+    assert(0);
+    return Encoding::UTF8;
+}
+
 class ConvertError : public std::runtime_error {
 public:
     ConvertError(std::string content, int position, Encoding srcEncoding, Encoding destEncoding) noexcept;