Skip to content

Commit c041872

Browse files
committed
fix: add UTF-16 encoding detection and conversion to prevent assertion failures
Universal Ctags crashed with an assertion failure in vStringPutImpl() when encountering files with UTF-16 encoding. The assertion `c >= 0 && c <= 0xff` failed because ctags expected all characters to fit within the single-byte range, but UTF-16 files contain multi-byte sequences that violate this assumption.

This fix adds:
- Detection of UTF-16 BOM (both LE and BE) in file reading
- Automatic conversion from UTF-16 to UTF-8 using iconv when UTF-16 is detected
- Force memory stream processing for UTF-16 files to enable conversion
- Test cases for both UTF-16 LE and BE files

Resolves issue #4342

Signed-off-by: Bernát Gábor <[email protected]>
1 parent d48558f commit c041872

File tree

5 files changed

+83
-2
lines changed

5 files changed

+83
-2
lines changed

Units/fuzz-hitting-assertions.r/issue-4342.d/expected.tags

Whitespace-only changes.
40 Bytes
Binary file not shown.
26 Bytes
Binary file not shown.
26 Bytes
Binary file not shown.

main/read.c

Lines changed: 83 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -34,6 +34,7 @@
3434
#ifdef HAVE_ICONV
3535
# include "mbcs.h"
3636
# include "mbcs_p.h"
37+
# include <iconv.h>
3738
#endif
3839

3940
/*
@@ -881,8 +882,32 @@ static MIO *getMioFull (const char *const fileName, const char *const openMode,
881882
if (mtime)
882883
*mtime = st->mtime;
883884
eStatFree (st);
884-
if ((!memStreamRequired)
885-
&& (size > MAX_IN_MEMORY_FILE_SIZE || size == 0))
885+
886+
/* Always use memory stream for UTF-16 files to enable conversion */
887+
bool forceMemStream = false;
888+
#ifdef HAVE_ICONV
889+
if (size >= 2)
890+
{
891+
FILE *peek = fopen(fileName, openMode);
892+
if (peek)
893+
{
894+
unsigned char bom[2];
895+
if (fread(bom, 1, 2, peek) == 2)
896+
{
897+
/* Check for UTF-16 BOM */
898+
if ((bom[0] == 0xFF && bom[1] == 0xFE) ||
899+
(bom[0] == 0xFE && bom[1] == 0xFF))
900+
{
901+
forceMemStream = true;
902+
}
903+
}
904+
fclose(peek);
905+
}
906+
}
907+
#endif
908+
909+
if ((!memStreamRequired) && (!forceMemStream) &&
910+
(size > MAX_IN_MEMORY_FILE_SIZE || size == 0))
886911
return mio_new_file (fileName, openMode);
887912

888913
src = fopen (fileName, openMode);
@@ -900,6 +925,62 @@ static MIO *getMioFull (const char *const fileName, const char *const openMode,
900925
return mio_new_file (fileName, openMode);
901926
}
902927
fclose (src);
928+
929+
#ifdef HAVE_ICONV
930+
/* Check for UTF-16 BOM and convert to UTF-8 if found */
931+
if (size >= 2)
932+
{
933+
unsigned char *converted_data = NULL;
934+
unsigned long converted_size = 0;
935+
const char *encoding = NULL;
936+
937+
/* Check for UTF-16 LE BOM (FF FE) */
938+
if (data[0] == 0xFF && data[1] == 0xFE)
939+
{
940+
encoding = "UTF-16LE";
941+
}
942+
/* Check for UTF-16 BE BOM (FE FF) */
943+
else if (data[0] == 0xFE && data[1] == 0xFF)
944+
{
945+
encoding = "UTF-16BE";
946+
}
947+
948+
if (encoding != NULL)
949+
{
950+
/* Convert UTF-16 to UTF-8 */
951+
iconv_t cd = iconv_open("UTF-8", encoding);
952+
if (cd != (iconv_t)-1)
953+
{
954+
/* Skip BOM in input */
955+
char *inbuf = (char*)(data + 2);
956+
size_t inbytesleft = size - 2;
957+
958+
/* Allocate output buffer: a 2-byte UTF-16 unit yields at most 3 UTF-8 bytes and a 4-byte surrogate pair yields 4, so twice the input size is always enough */
959+
size_t outbufsize = inbytesleft * 2;
960+
converted_data = eMalloc(outbufsize);
961+
char *outbuf = (char*)converted_data;
962+
size_t outbytesleft = outbufsize;
963+
964+
if (iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft) != (size_t)-1)
965+
{
966+
converted_size = outbufsize - outbytesleft;
967+
eFree(data);
968+
data = converted_data;
969+
size = converted_size;
970+
converted_data = NULL; /* Prevent double free */
971+
}
972+
else
973+
{
974+
/* Conversion failed, fall back to original data */
975+
if (converted_data)
976+
eFree(converted_data);
977+
}
978+
iconv_close(cd);
979+
}
980+
}
981+
}
982+
#endif
983+
903984
return mio_new_memory (data, size, eRealloc, eFreeNoNullCheck);
904985
}
905986

0 commit comments

Comments
 (0)