Skip to content

Commit

Permalink
Merge branch 'dev'
Browse files Browse the repository at this point in the history
  • Loading branch information
tomwillow committed Mar 6, 2024
2 parents 5a41bf9 + a7fdd3a commit 8953f5b
Show file tree
Hide file tree
Showing 13 changed files with 392 additions and 59 deletions.
4 changes: 3 additions & 1 deletion README-en.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ Function:
- Check whether characters are lost to ensure that the conversion process is reversible
- Support command line (use $ ./SmartCharsetConverter --help for details)
- Multi-language support (click the "hammer" button in the bottom right corner to switch languages)
- Support Vietnamese charsets (VNI/VPS/VISCII/TCVN3). These charsets cannot be detected automatically yet; please use the "No File Filter" mode.

Supported Platform:

- Win11 x64
- Win10 x64
- Win7 x64 (haven’t tried it yet)

Expand Down Expand Up @@ -74,6 +74,8 @@ v0.81 Add Spanish language pack support (thanks to [Carlos Sánchez](https://git

v0.82 Check if characters will be lost when specifying encoding manually.

v0.9 Support converting between multiple Vietnamese charsets: VNI/VPS/VISCII/TCVN3

# Build

1. Confirm the compilation environment: win10+ x64, Visual Studio 2019+, cmake.
Expand Down
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@
- 转换时会检查是否丢失字符,确保转换过程可逆
- 支持命令行(使用 $ ./SmartCharsetConverter --help 查看)
- 多语言支持(点击右下角“锤子”按钮切换语言)
- 支持越南语字符集(VNI/VPS/VISCII/TCVN3)和其他字符集互转(目前还不能自动探测这几个字符集,请使用“不过滤”模式)

运行要求:

- Win11 x64
- Win10 x64
- Win7 x64(理论上可以,没尝试)

Expand Down Expand Up @@ -70,8 +70,11 @@ v0.8 重新编排界面(感谢[Carlos Sánchez](https://github.com/c-sanchez))
支持多语言(内置简体中文和英文)。增加多语言选择。

v0.81 增加西班牙语支持(感谢[Carlos Sánchez](https://github.com/c-sanchez))。

v0.82 现在手动指定字符集会检查是否丢失字符。

v0.9 支持多个越南语字符集的转换:VNI/VPS/VISCII/TCVN3

# 构建方法

1. 确认编译环境:win10+ x64, Visual Studio 2019+, cmake。
Expand Down
2 changes: 1 addition & 1 deletion src/CLIHandler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -302,7 +302,7 @@ int CLIMain(const std::vector<std::wstring> &args) noexcept {
return;
}

Core::ConvertResult ret = core.Convert(inputFilename, addedItem.srcCharset, addedItem.srcLineBreak);
Core::ConvertFileResult ret = core.Convert(inputFilename, addedItem.srcCharset, addedItem.srcLineBreak);
if (ret.errInfo.has_value()) {
wcout << L" 大小: " << FileSizeToTString(addedItem.filesize) << L"\n";
wcout << L" 字符集: " << ToViewCharsetName(addedItem.srcCharset) << L"\n";
Expand Down
54 changes: 33 additions & 21 deletions src/Core/CharsetCode.h
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ enum class CharsetCode {
WINDOWS_1258, // Vietnamese
ISO_8859_1,

VNI, // Vietnamese
VPS, // Vietnamese
VISCII, // Vietnamese
TCVN3, // Vietnamese

CHARSET_CODE_END

// 添加字符集需要同步修改:charsetCodeMap
Expand All @@ -43,32 +48,39 @@ struct MyCharset {
std::string icuName; // the name used by icu
std::unordered_set<std::string>
icuNames; // if icu detected these charset names, map all of them to be the main charset
bool isVietnameseLocalCharset;
};

// 字符集code到名称的映射表
const std::unordered_map<CharsetCode, MyCharset> charsetCodeMap = {
// CharsetCode枚举值, viewName显示名称, icuName, 可能的别名
{CharsetCode::UNKNOWN, MyCharset{TEXT("未知"), "-", {}}},
{CharsetCode::EMPTY, MyCharset{TEXT(""), "-", {}}},
{CharsetCode::NOT_SUPPORTED, MyCharset{TEXT("不支持"), "-", {}}},
{CharsetCode::UTF8, MyCharset{TEXT("UTF-8"), "UTF-8", {"ASCII", "ANSI", "UTF8"}}},
{CharsetCode::UTF8BOM, MyCharset{TEXT("UTF-8 BOM"), "UTF-8", {}}},
{CharsetCode::GB18030, MyCharset{TEXT("GB18030"), "GB18030", {"GB"}}},

{CharsetCode::UTF16LE, MyCharset{TEXT("UTF-16LE"), "UTF-16LE", {}}},
{CharsetCode::UTF16LEBOM, MyCharset{TEXT("UTF-16LE BOM"), "UTF-16LE", {}}},
{CharsetCode::UTF16BE, MyCharset{TEXT("UTF-16BE"), "UTF-16BE", {}}},
{CharsetCode::UTF16BEBOM, MyCharset{TEXT("UTF-16BE BOM"), "UTF-16BE", {}}},
{CharsetCode::UTF32LE, MyCharset{TEXT("UTF-32LE"), "UTF-32LE", {}}},
{CharsetCode::UTF32LEBOM, MyCharset{TEXT("UTF-32LE BOM"), "UTF-32LE", {}}},
{CharsetCode::UTF32BE, MyCharset{TEXT("UTF-32BE"), "UTF-32BE", {}}},
{CharsetCode::UTF32BEBOM, MyCharset{TEXT("UTF-32BE BOM"), "UTF-32BE", {}}},
{CharsetCode::BIG5, MyCharset{TEXT("BIG5"), "Big5", {"Big5"}}},
{CharsetCode::SHIFT_JIS, MyCharset{TEXT("SHIFT-JIS"), "SHIFT-JIS", {"SHIFT_JIS"}}},
{CharsetCode::EUC_JP, MyCharset{TEXT("EUC-JP"), "EUC-JP", {"EUC-JP"}}},
{CharsetCode::WINDOWS_1252, MyCharset{TEXT("WINDOWS-1252"), "WINDOWS-1252", {}}},
{CharsetCode::WINDOWS_1258, MyCharset{TEXT("WINDOWS-1258"), "WINDOWS-1258", {}}},
{CharsetCode::ISO_8859_1, MyCharset{TEXT("ISO-8859-1"), "ISO-8859-1", {}}}};
{CharsetCode::UNKNOWN, MyCharset{TEXT("未知"), "-", {}, false}},
{CharsetCode::EMPTY, MyCharset{TEXT(""), "-", {}, false}},
{CharsetCode::NOT_SUPPORTED, MyCharset{TEXT("不支持"), "-", {}, false}},
{CharsetCode::UTF8, MyCharset{TEXT("UTF-8"), "UTF-8", {"ASCII", "ANSI", "UTF8"}, false}},
{CharsetCode::UTF8BOM, MyCharset{TEXT("UTF-8 BOM"), "UTF-8", {}, false}},
{CharsetCode::GB18030, MyCharset{TEXT("GB18030"), "GB18030", {"GB"}, false}},

{CharsetCode::UTF16LE, MyCharset{TEXT("UTF-16LE"), "UTF-16LE", {}, false}},
{CharsetCode::UTF16LEBOM, MyCharset{TEXT("UTF-16LE BOM"), "UTF-16LE", {}, false}},
{CharsetCode::UTF16BE, MyCharset{TEXT("UTF-16BE"), "UTF-16BE", {}, false}},
{CharsetCode::UTF16BEBOM, MyCharset{TEXT("UTF-16BE BOM"), "UTF-16BE", {}, false}},
{CharsetCode::UTF32LE, MyCharset{TEXT("UTF-32LE"), "UTF-32LE", {}, false}},
{CharsetCode::UTF32LEBOM, MyCharset{TEXT("UTF-32LE BOM"), "UTF-32LE", {}, false}},
{CharsetCode::UTF32BE, MyCharset{TEXT("UTF-32BE"), "UTF-32BE", {}, false}},
{CharsetCode::UTF32BEBOM, MyCharset{TEXT("UTF-32BE BOM"), "UTF-32BE", {}, false}},
{CharsetCode::BIG5, MyCharset{TEXT("BIG5"), "Big5", {"Big5"}, false}},
{CharsetCode::SHIFT_JIS, MyCharset{TEXT("SHIFT-JIS"), "SHIFT-JIS", {"SHIFT_JIS"}, false}},
{CharsetCode::EUC_JP, MyCharset{TEXT("EUC-JP"), "EUC-JP", {"EUC-JP"}, false}},
{CharsetCode::WINDOWS_1252, MyCharset{TEXT("WINDOWS-1252"), "WINDOWS-1252", {}, false}},
{CharsetCode::WINDOWS_1258, MyCharset{TEXT("WINDOWS-1258"), "WINDOWS-1258", {}, false}},
{CharsetCode::ISO_8859_1, MyCharset{TEXT("ISO-8859-1"), "ISO-8859-1", {}, false}},

{CharsetCode::VNI, MyCharset{TEXT("VNI"), "", {}, true}},
{CharsetCode::VPS, MyCharset{TEXT("VPS"), "", {}, true}},
{CharsetCode::VISCII, MyCharset{TEXT("VISCII"), "", {}, true}},
{CharsetCode::TCVN3, MyCharset{TEXT("TCVN3"), "", {}, true}},
};

std::tstring ToViewCharsetName(CharsetCode code) noexcept;

Expand Down
81 changes: 60 additions & 21 deletions src/Core/Core.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,11 @@ std::u16string Decode(std::string_view src, CharsetCode code) {
return {};
}

if (charsetCodeMap.at(code).isVietnameseLocalCharset) {
viet::Init();
return viet::ConvertToUtf16LE(src, CharsetCodeToVietEncoding(code));
}

// 从code转换到icu的字符集名称
auto icuCharsetName = ToICUCharsetName(code);

Expand Down Expand Up @@ -57,7 +62,7 @@ struct FromUFLAGContext {
UConverterFromUCallback subCallback;
const void *subContext;
std::vector<UChar32> unassigned; // 是否出现了不能转换的字符
FromUFLAGContext() : subCallback(nullptr), subContext(nullptr), unassigned(false) {}
FromUFLAGContext() : subCallback(nullptr), subContext(nullptr) {}
};

/**
Expand Down Expand Up @@ -133,6 +138,11 @@ U_CAPI void U_EXPORT2 flagCB_fromU(const void *context, UConverterFromUnicodeArg
}

std::string Encode(std::u16string_view src, CharsetCode targetCode) {
if (charsetCodeMap.at(targetCode).isVietnameseLocalCharset) {
viet::Init();
return viet::ConvertFromUtf16LE(src, CharsetCodeToVietEncoding(targetCode));
}

// 从code转换到icu的字符集名称
auto icuCharsetName = ToICUCharsetName(targetCode);

Expand Down Expand Up @@ -183,12 +193,29 @@ std::string Encode(std::u16string_view src, CharsetCode targetCode) {
// UTF16LE -> UTF8
std::string ret = Encode(temp, CharsetCode::UTF8);

throw runtime_error(GetLanguageService().GetUtf8String(StringId::WILL_LOST_CHARACTERS) + ret);
throw UnassignedCharError(ret);
}

return target;
}

std::string Convert(std::string_view src, ConvertParam inputParam) {
// 根据原编码得到Unicode字符串
std::u16string buf = Decode(src, inputParam.originCode);

// 如果需要转换换行符
if (inputParam.doConvertLineBreaks) {
ChangeLineBreaks(buf, inputParam.targetLineBreak);
}

// 转到目标编码
return Encode(buf, inputParam.targetCode);
}

/**
 * Maps a Vietnamese legacy CharsetCode (VNI/VPS/VISCII/TCVN3) to its viet::Encoding.
 *
 * The mapping is done directly on the enum instead of round-tripping through the
 * display name: that keeps this noexcept function free of string allocations and
 * independent of how the charset names are presented to the user.
 *
 * @param code a charset code; expected to be one of the four Vietnamese legacy charsets
 * @return the matching viet::Encoding. Any other code is a programming error: it
 *         asserts in debug builds and falls back to viet::Encoding::UTF8 otherwise.
 */
viet::Encoding CharsetCodeToVietEncoding(CharsetCode code) noexcept {
    switch (code) {
    case CharsetCode::VNI:
        return viet::Encoding::VNI;
    case CharsetCode::VPS:
        return viet::Encoding::VPS;
    case CharsetCode::VISCII:
        return viet::Encoding::VISCII;
    case CharsetCode::TCVN3:
        return viet::Encoding::TCVN3;
    default:
        break;
    }

    // Not a Vietnamese legacy charset: callers should have checked before calling.
    assert(0);
    return viet::Encoding::UTF8;
}

Core::Core(std::tstring configFileName, CoreInitOption opt) : configFileName(configFileName), opt(opt) {
// 读ini
ReadConfigFromFile();
Expand All @@ -199,14 +226,17 @@ Core::Core(std::tstring configFileName, CoreInitOption opt) : configFileName(con
});

#ifndef NDEBUG
UErrorCode err;
auto allNames = ucnv_openAllNames(&err);
while (1) {
auto name = uenum_next(allNames, nullptr, &err);
if (name == nullptr) {
break;
}
}
// =================================
// ==== will detect memory leak ====
// UErrorCode err;
// UEnumeration *allNames = ucnv_openAllNames(&err);
// while (1) {
// auto name = uenum_next(allNames, nullptr, &err);
// if (name == nullptr) {
// break;
// }
//}
// ================================
#endif
}

Expand Down Expand Up @@ -378,11 +408,11 @@ void Core::Clear() {
listFileNames.clear();
}

Core::ConvertResult Core::Convert(const std::tstring &inputFilename, CharsetCode originCode,
LineBreaks originLineBreak) noexcept {
Core::ConvertFileResult Core::Convert(const std::tstring &inputFilename, CharsetCode originCode,
LineBreaks originLineBreak) noexcept {
CharsetCode targetCode = config.outputCharset;

ConvertResult ret;
ConvertFileResult ret;
try {
ret.outputFileName = inputFilename;
ret.targetLineBreaks = originLineBreak;
Expand Down Expand Up @@ -464,17 +494,26 @@ Core::ConvertResult Core::Convert(const std::tstring &inputFilename, CharsetCode
rawSize -= bomSize;
}

// 根据原编码得到Unicode字符串
auto buf = Decode(std::string_view(rawStart, rawSize), originCode);
ConvertParam param;
param.originCode = originCode;
param.targetCode = targetCode;
param.doConvertLineBreaks =
GetConfig().enableConvertLineBreaks && GetConfig().lineBreak != originLineBreak;
param.targetLineBreak = GetConfig().lineBreak;

// 如果需要转换换行符
if (GetConfig().enableConvertLineBreaks && GetConfig().lineBreak != originLineBreak) {
ChangeLineBreaks(buf, GetConfig().lineBreak);
ret.targetLineBreaks = GetConfig().lineBreak;
// 转到目标编码
std::string outputBuf;
try {
outputBuf = ::Convert(std::string_view(rawStart, rawSize), param);
} catch (const UnassignedCharError &err) {
throw std::runtime_error(GetLanguageService().GetUtf8String(StringId::WILL_LOST_CHARACTERS) +
err.what());
};

if (param.doConvertLineBreaks) {
ret.targetLineBreaks = param.targetLineBreak;
}

// 转到目标编码
auto outputBuf = Encode(buf, targetCode);
ret.outputFileSize = 0;

// 写入文件
Expand Down
38 changes: 34 additions & 4 deletions src/Core/Core.h
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
#include "CharsetCode.h"
#include "LineBreaks.h"
#include "Config.h"
#include "Vietnamese.h"

#include <tstring.h>

Expand All @@ -30,15 +31,44 @@
*/
std::u16string Decode(std::string_view src, CharsetCode code);

/**
 * Unassigned-character error.
 * Thrown when a conversion meets characters that cannot be represented in the
 * requested target encoding.
 * what() returns a string made up of the characters that could not be converted
 * (UTF-8 encoded).
 */
class UnassignedCharError : public std::runtime_error {
public:
    // explicit: a plain std::string must never silently turn into an error object.
    explicit UnassignedCharError(const std::string &unassignedChars) : std::runtime_error(unassignedChars) {}
};

/**
* @brief 把unicode串编码为指定字符集
* @param src u16string(UTF-16LE)
* @return std::string CAUTION: this string is only as a container of char[] with the charset of targetCode.
* NOT mean its charset is ASCII or ANSI or others.
* @exception runtime_error ucnv出错/出现了不能转换的字符
* @exception viet::ConvertError
* @exception UnassignedCharError 出现了不能转换的字符
* @exception std::runtime_error ucnv出错
*/
std::string Encode(std::u16string_view src, CharsetCode targetCode);

/**
 * Options for an in-memory charset conversion (see Convert()).
 *
 * Every member carries a default initializer so that a default-constructed
 * ConvertParam never holds indeterminate values. Callers are still expected to
 * set originCode and targetCode explicitly.
 */
struct ConvertParam {
    CharsetCode originCode{};         // charset of the input bytes
    CharsetCode targetCode{};         // charset to encode the output in
    bool doConvertLineBreaks = false; // whether line breaks should be rewritten
    LineBreaks targetLineBreak{};     // target line break; ignored when doConvertLineBreaks is false
};

/**
 * Convert encoding.
 * Decodes src from inputParam.originCode, optionally changes the line breaks
 * (when inputParam.doConvertLineBreaks is set), then encodes the text to
 * inputParam.targetCode.
 * @param src raw bytes in the origin charset
 * @param inputParam origin/target charsets and line-break options
 * @return std::string used as a byte container holding the text in the target charset
 * @exception viet::ConvertError
 * @exception UnassignedCharError a character that cannot be converted was encountered
 * @exception std::runtime_error ucnv (ICU converter) error
 */
std::string Convert(std::string_view src, ConvertParam inputParam);

/**
 * Returns the viet::Encoding that corresponds to the given CharsetCode.
 * Used by Decode/Encode for charsets flagged as isVietnameseLocalCharset
 * (VNI/VPS/VISCII/TCVN3).
 */
viet::Encoding CharsetCodeToVietEncoding(CharsetCode code) noexcept;

class io_error_ignore : public std::runtime_error {
public:
io_error_ignore() : runtime_error("ignored") {}
Expand Down Expand Up @@ -99,7 +129,7 @@ class Core {

void Clear();

struct ConvertResult {
struct ConvertFileResult {
std::tstring outputFileName;
std::optional<std::string> errInfo;
LineBreaks targetLineBreaks;
Expand All @@ -110,8 +140,8 @@ class Core {
* @brief 转换一个文件。
* @return <输出文件的文件名, 出错信息>
*/
ConvertResult Convert(const std::tstring &inputFilename, CharsetCode originCode,
LineBreaks originLineBreak) noexcept;
ConvertFileResult Convert(const std::tstring &inputFilename, CharsetCode originCode,
LineBreaks originLineBreak) noexcept;

private:
std::tstring configFileName;
Expand Down
9 changes: 6 additions & 3 deletions src/Core/Detect.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,16 @@ std::tuple<std::string, int> DetectByUCharDet(uchardet *det, const char *buf, in

std::tuple<std::string, int> DetectByUCSDet(const char *buf, int bufSize) {
UErrorCode status = U_ZERO_ERROR;
UCharsetDetector *csd = ucsdet_open(&status);
auto csd =
std::unique_ptr<UCharsetDetector, void (*)(UCharsetDetector *)>(ucsdet_open(&status), [](UCharsetDetector *p) {
ucsdet_close(p);
});
DealWithUCNVError(status);

ucsdet_setText(csd, buf, bufSize, &status);
ucsdet_setText(csd.get(), buf, bufSize, &status);
DealWithUCNVError(status);

const UCharsetMatch *ucm = ucsdet_detect(csd, &status);
const UCharsetMatch *ucm = ucsdet_detect(csd.get(), &status);
DealWithUCNVError(status);

int32_t confidence = ucsdet_getConfidence(ucm, &status);
Expand Down
25 changes: 24 additions & 1 deletion src/Core/Vietnamese.h
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ inline std::string_view to_string(Encoding encoding) noexcept {
case Encoding::UTF8:
return "UTF8";
case Encoding::UTF16LE:
return "UTF16LE";
return "UTF-16LE";
case Encoding::VNI:
return "VNI";
case Encoding::VPS:
Expand All @@ -43,6 +43,29 @@ inline std::string_view to_string(Encoding encoding) noexcept {
return "";
}

/**
 * Parses an encoding name into an Encoding value.
 *
 * Recognised names: "UTF8", "UTF-16LE", "VNI", "VPS", "VISCII", "TCVN3".
 *
 * @param sv the encoding name (exact, case-sensitive match)
 * @return the matching Encoding. An unrecognised name is a programming error:
 *         it asserts in debug builds and falls back to Encoding::UTF8 otherwise.
 */
inline Encoding to_encoding(std::string_view sv) noexcept {
    if (sv == "UTF8") {
        return Encoding::UTF8;
    }
    if (sv == "UTF-16LE") {
        // Previously returned Encoding::UTF8, so "UTF-16LE" did not round-trip.
        return Encoding::UTF16LE;
    }
    if (sv == "VNI") {
        return Encoding::VNI;
    }
    if (sv == "VPS") {
        return Encoding::VPS;
    }
    if (sv == "VISCII") {
        return Encoding::VISCII;
    }
    if (sv == "TCVN3") {
        return Encoding::TCVN3;
    }
    assert(0);
    return Encoding::UTF8;
}

class ConvertError : public std::runtime_error {
public:
ConvertError(std::string content, int position, Encoding srcEncoding, Encoding destEncoding) noexcept;
Expand Down
Loading

0 comments on commit 8953f5b

Please sign in to comment.