From f0485f8b3e519f345565e7c635da86fd027fdff2 Mon Sep 17 00:00:00 2001
From: Zakhar Bessarab <z.bessarab@victoriametrics.com>
Date: Thu, 29 Feb 2024 19:46:09 +0400
Subject: [PATCH] strings extraction: extract values with custom UTF encoding
 from strings

Previously, UTF escape sequences were converted into UTF runes only inside quoted values, due to Go string semantics. Also, values using escaping that is non-standard for Go were not decoded.

This change adds decoding logic that always converts an escape sequence into the corresponding character.

See also: https://github.com/VictoriaMetrics/VictoriaMetrics/issues/5519
---
 lexer.go       | 68 +++++++++++++++++++++++++++++++++++++++++++++++++-
 parser.go      |  3 +++
 parser_test.go |  6 +++++
 3 files changed, 76 insertions(+), 1 deletion(-)

diff --git a/lexer.go b/lexer.go
index da6a47e..bbee36b 100644
--- a/lexer.go
+++ b/lexer.go
@@ -387,6 +387,37 @@ func unescapeIdent(s string) string {
 	}
 }
 
+func unescapeUTFSymbols(s string) string {
+	n := strings.IndexByte(s, '\\')
+	if n < 0 {
+		return s
+	}
+	dst := make([]byte, 0, len(s))
+	for {
+		dst = append(dst, s[:n]...)
+		s = s[n+1:]
+		if isUTFEscapePrefix(s) {
+			r, size := decodeUTFEscapeSequence(s)
+			if r == utf8.RuneError {
+				// Cannot decode escape sequence. Put it in the output as is
+				dst = append(dst, '\\')
+			} else {
+				dst = utf8.AppendRune(dst, r)
+				s = s[size:]
+			}
+		} else {
+			// Save non-UTF escape sequence as is
+			dst = append(dst, '\\')
+		}
+
+		n = strings.IndexByte(s, '\\')
+		if n < 0 {
+			dst = append(dst, s...)
+			return string(dst)
+		}
+	}
+}
+
 func appendEscapedIdent(dst []byte, s string) []byte {
 	i := 0
 	for i < len(s) {
@@ -728,7 +759,33 @@ func appendEscapeSequence(dst []byte, r rune) []byte {
 	return append(dst, 'u', toHex(byte(r>>12)), toHex(byte((r>>8)&0xf)), toHex(byte(r>>4)), toHex(byte(r&0xf)))
 }
 
-func decodeEscapeSequence(s string) (rune, int) {
+// hasUTFEscapedSymbols reports whether s contains a backslash immediately
+// followed by one of the supported escape prefixes: x, X, u or U.
+func hasUTFEscapedSymbols(s string) bool {
+	for i := 0; i < len(s); i++ {
+		if s[i] == '\\' && i+1 < len(s) {
+			switch s[i+1] {
+			case 'x', 'X', 'u', 'U':
+				return true
+			}
+		}
+	}
+	return false
+}
+
+func isUTFEscapePrefix(s string) bool {
+	if len(s) == 0 {
+		return false
+	}
+
+	switch s[0] {
+	case 'x', 'X', 'u', 'U':
+		return true
+	}
+	return false
+}
+
+func decodeUTFEscapeSequence(s string) (rune, int) {
 	if strings.HasPrefix(s, "x") || strings.HasPrefix(s, "X") {
 		if len(s) >= 3 {
 			h1 := fromHex(s[1])
@@ -752,6 +809,15 @@ func decodeEscapeSequence(s string) (rune, int) {
 		}
 		return utf8.RuneError, 0
 	}
+	// Improperly escaped non-printable char
+	return utf8.RuneError, 0
+}
+
+func decodeEscapeSequence(s string) (rune, int) {
+	if isUTFEscapePrefix(s) {
+		return decodeUTFEscapeSequence(s)
+	}
+
 	r, size := utf8.DecodeRuneInString(s)
 	if unicode.IsPrint(r) {
 		return r, size
diff --git a/parser.go b/parser.go
index ddd39c8..d78454c 100644
--- a/parser.go
+++ b/parser.go
@@ -1111,6 +1111,9 @@ func extractStringValue(token string) (string, error) {
 		token = strings.Replace(token, `"`, `\"`, -1)
 		token = `"` + token + `"`
 	}
+	if hasUTFEscapedSymbols(token) {
+		token = unescapeUTFSymbols(token)
+	}
 	s, err := strconv.Unquote(token)
 	if err != nil {
 		return "", fmt.Errorf(`cannot parse string literal %q: %s`, token, err)
diff --git a/parser_test.go b/parser_test.go
index 99c782d..953ec1c 100644
--- a/parser_test.go
+++ b/parser_test.go
@@ -132,6 +132,12 @@ func TestParseSuccess(t *testing.T) {
 	another(`\温\度{\房\间="水电费"}[5m] offset 10m`, `温度{房间="水电费"}[5m] offset 10m`)
 	same(`sum(fo\|o) by(b\|a,x)`)
 	another(`sum(x) by (b\x7Ca)`, `sum(x) by(b\|a)`)
+	another(`fo\xF3`, `foó`)
+	another(`fo\u00F3`, `foó`)
+	another(`{__name__="fo\xF3"}`, `foó`)
+	another(`{__name__="fo\xF3"}`, `foó`)
+	another(`"\n\tfo\xF3"`, `"\n\tfoó"`)
+	another(`温度{房间="水电费\xF3"}[5m] offset 10m`, `温度{房间="水电费ó"}[5m] offset 10m`)
 
 	// Duplicate filters
 	same(`foo{__name__="bar"}`)