[Bug] Invalid c++ code breaks monarch #4775

OfekShilon · 2024-12-07T17:58:38Z

Reproducible in vscode.dev or in VS Code Desktop?

Not reproducible in vscode.dev or VS Code Desktop

Reproducible in the monaco editor playground?

Not reproducible in the monaco editor playground

Monaco Editor Playground Link

https://microsoft.github.io/monaco-editor/monarch.html
Unfortunately, the Monarch playground cannot generate shareable links, so the full code and reproduction steps are given below.

Monaco Editor Playground Code

Paste the C++ language definition from Monaco's cpp.ts into the 'Language Syntax Definition' pane:

// Create your own language definition here
// You can safely look at other samples without losing modifications.
// Modifications are not saved on browser refresh/close though -- copy often!
return {
  // Set defaultToken to invalid to see what you do not tokenize yet
    defaultToken: "",
  // Per the Monarch documentation, this postfix is appended to the token names
  // returned by the rules of this definition.
  tokenPostfix: ".cpp",
  // Bracket pairs and the token class assigned to each pair. The tokenizer
  // states refer to this table through the "@brackets" action.
  brackets: [
    { token: "delimiter.curly", open: "{", close: "}" },
    { token: "delimiter.parenthesis", open: "(", close: ")" },
    { token: "delimiter.square", open: "[", close: "]" },
    { token: "delimiter.angle", open: "<", close: ">" }
  ],
  // Reserved words. The identifier rule in the tokenizer's root state checks each
  // matched identifier against this list (the "@keywords" case) and emits the
  // "keyword.$0" token on a hit, "identifier" otherwise.
  keywords: [
    "abstract",
    "amp",
    "array",
    "auto",
    "bool",
    "break",
    "case",
    "catch",
    "char",
    "class",
    "const",
    "constexpr",
    "const_cast",
    "continue",
    "cpu",
    "decltype",
    "default",
    "delegate",
    "delete",
    "do",
    "double",
    "dynamic_cast",
    "each",
    "else",
    "enum",
    "event",
    "explicit",
    "export",
    "extern",
    "false",
    "final",
    "finally",
    "float",
    "for",
    "friend",
    "gcnew",
    "generic",
    "goto",
    "if",
    "in",
    "initonly",
    "inline",
    "int",
    "interface",
    "interior_ptr",
    "internal",
    "literal",
    "long",
    "mutable",
    "namespace",
    "new",
    "noexcept",
    "nullptr",
    "__nullptr",
    "operator",
    "override",
    "partial",
    "pascal",
    "pin_ptr",
    "private",
    "property",
    "protected",
    "public",
    "ref",
    "register",
    "reinterpret_cast",
    "restrict",
    "return",
    "safe_cast",
    "sealed",
    "short",
    "signed",
    "sizeof",
    "static",
    "static_assert",
    "static_cast",
    "struct",
    "switch",
    "template",
    "this",
    "thread_local",
    "throw",
    "tile_static",
    "true",
    "try",
    "typedef",
    "typeid",
    "typename",
    "union",
    "unsigned",
    "using",
    "virtual",
    "void",
    "volatile",
    "wchar_t",
    "where",
    "while",
    // reserved words with one leading underscore
    "_asm",
    "_based",
    "_cdecl",
    "_declspec",
    "_fastcall",
    "_if_exists",
    "_if_not_exists",
    "_inline",
    "_multiple_inheritance",
    "_pascal",
    "_single_inheritance",
    "_stdcall",
    "_virtual_inheritance",
    "_w64",
    // reserved words with two leading underscores
    "__abstract",
    "__alignof",
    "__asm",
    "__assume",
    "__based",
    "__box",
    "__builtin_alignof",
    "__cdecl",
    "__clrcall",
    "__declspec",
    "__delegate",
    "__event",
    "__except",
    "__fastcall",
    "__finally",
    "__forceinline",
    "__gc",
    "__hook",
    "__identifier",
    "__if_exists",
    "__if_not_exists",
    "__inline",
    "__int128",
    "__int16",
    "__int32",
    "__int64",
    "__int8",
    "__interface",
    "__leave",
    "__m128",
    "__m128d",
    "__m128i",
    "__m256",
    "__m256d",
    "__m256i",
    "__m512",
    "__m512d",
    "__m512i",
    "__m64",
    "__multiple_inheritance",
    "__newslot",
    "__nogc",
    "__noop",
    "__nounwind",
    "__novtordisp",
    "__pascal",
    "__pin",
    "__pragma",
    "__property",
    "__ptr32",
    "__ptr64",
    "__raise",
    "__restrict",
    "__resume",
    "__sealed",
    "__single_inheritance",
    "__stdcall",
    "__super",
    "__thiscall",
    "__try",
    "__try_cast",
    "__typeof",
    "__unaligned",
    "__unhook",
    "__uuidof",
    "__value",
    "__virtual_inheritance",
    "__w64",
    "__wchar_t"
  ],
  // Known operator lexemes. In the root state, a run of symbol characters (see
  // `symbols`) gets the "delimiter" token only when it is one of these entries
  // (the "@operators" case); any other run gets the empty token.
  operators: [
    "=",
    ">",
    "<",
    "!",
    "~",
    "?",
    ":",
    "==",
    "<=",
    ">=",
    "!=",
    "&&",
    "||",
    "++",
    "--",
    "+",
    "-",
    "*",
    "/",
    "&",
    "|",
    "^",
    "%",
    "<<",
    ">>",
    "+=",
    "-=",
    "*=",
    "/=",
    "&=",
    "|=",
    "^=",
    "%=",
    "<<=",
    ">>="
  ],
  // we include these common regular expressions
  // (the tokenizer rules reference them by name as @symbols, @escapes, ...)
  // symbols: a run of one or more operator characters
  symbols: /[=><!~?:&|+\-*\/\^%]+/,
  // escapes: backslash followed by a single escape character, x + 1-4 hex digits,
  // u + 4 hex digits, or U + 8 hex digits
  escapes: /\\(?:[0abfnrtv\\"']|x[0-9A-Fa-f]{1,4}|u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})/,
  // integersuffix: u/U followed by an l/L/ll/LL part, or an optional l/L/ll/LL
  // part followed by an optional u/U (this second alternative can match empty)
  integersuffix: /([uU](ll|LL|l|L)|(ll|LL|l|L)?[uU]?)/,
  // floatsuffix: optional f, F, l or L
  floatsuffix: /[fFlL]?/,
  // encoding: prefix accepted in front of a raw string literal (u, u8, U or L)
  encoding: /u|u8|U|L/,
  // The main tokenizer for our languages
  tokenizer: {
    // Initial state. Rule order matters here, as several comments below point out.
    root: [
      // C++ 11 Raw String: optional encoding prefix, R", a delimiter, then "(".
      // The delimiter (capture group 1: any run of characters other than space,
      // parentheses, backslash and tab) is taken from the text being tokenized
      // and becomes part of the next state's name, "raw.<delimiter>".
      [/@encoding?R\"(?:([^ ()\\\t]*))\(/, { token: "string.raw.begin", next: "@raw.$1" }],
      // identifiers and keywords
      [
        /[a-zA-Z_]\w*/,
        {
          cases: {
            "@keywords": { token: "keyword.$0" },
            "@default": "identifier"
          }
        }
      ],
      // The preprocessor checks must be before whitespace as they check /^\s*#/ which
      // otherwise fails to match later after other whitespace has been removed.
      // Inclusion
      [/^\s*#\s*include/, { token: "keyword.directive.include", next: "@include" }],
      // Preprocessor directive
      [/^\s*#\s*\w+/, "keyword.directive"],
      // whitespace
      { include: "@whitespace" },
      // [[ attributes ]].
      [/\[\s*\[/, { token: "annotation", next: "@annotation" }],
      // delimiters and operators
      [/[{}()<>\[\]]/, "@brackets"],
      [
        /@symbols/,
        {
          cases: {
            "@operators": "delimiter",
            "@default": ""
          }
        }
      ],
      // numbers
      // digits followed by an exponent marker (no decimal point)
      [/\d*\d+[eE]([\-+]?\d+)?(@floatsuffix)/, "number.float"],
      // optional digits, a decimal point, digits, optional exponent
      [/\d*\.\d+([eE][\-+]?\d+)?(@floatsuffix)/, "number.float"],
      // hex, octal and binary literals; ' is accepted between digits
      [/0[xX][0-9a-fA-F']*[0-9a-fA-F](@integersuffix)/, "number.hex"],
      [/0[0-7']*[0-7](@integersuffix)/, "number.octal"],
      [/0[bB][0-1']*[0-1](@integersuffix)/, "number.binary"],
      // decimal literals of two or more digits (with optional ' separators), then single digits
      [/\d[\d']*\d(@integersuffix)/, "number"],
      [/\d(@integersuffix)/, "number"],
      // delimiter: after number because of .\d floats
      [/[;,.]/, "delimiter"],
      // strings
      // non-terminated string: an opening quote with no closing quote before end of line
      [/"([^"\\]|\\.)*$/, "string.invalid"],
      // opening quote of a string; the rest is handled in the string state
      [/"/, "string", "@string"],
      // characters
      // a single character other than backslash or quote, between single quotes
      [/'[^\\']'/, "string"],
      // a single recognized escape sequence between single quotes
      [/(')(@escapes)(')/, ["string", "string.escape", "string"]],
      // any other single quote
      [/'/, "string.invalid"]
    ],
    // Whitespace and comment openers; included from the root and annotation states.
    whitespace: [
      [/[ \t\r\n]+/, ""],
      // "/**" not immediately followed by "/" opens a doc comment
      [/\/\*\*(?!\/)/, "comment.doc", "@doccomment"],
      // any other "/*" opens a block comment
      [/\/\*/, "comment", "@comment"],
      // a line comment ending in a backslash continues in the linecomment state
      [/\/\/.*\\$/, "comment", "@linecomment"],
      // ordinary line comment, up to end of line
      [/\/\/.*$/, "comment"]
    ],
    // Inside a /* ... */ block comment
    comment: [
      // runs without "/" or "*" are comment text
      [/[^\/*]+/, "comment"],
      // "*/" closes the comment
      [/\*\//, "comment", "@pop"],
      // a lone "/" or "*" is still comment text
      [/[\/*]/, "comment"]
    ],
    //For use with continuous line comments
    linecomment: [
      // a line whose last character is not a backslash ends the comment
      [/.*[^\\]$/, "comment", "@pop"],
      // anything else is comment text and the state is kept
      [/[^]+/, "comment"]
    ],
    //Identical copy of comment above, except for the addition of .doc
    doccomment: [
      [/[^\/*]+/, "comment.doc"],
      [/\*\//, "comment.doc", "@pop"],
      [/[\/*]/, "comment.doc"]
    ],
    // Inside a double-quoted string (entered from the root state on an opening quote)
    string: [
      // runs without backslash or quote are string content
      [/[^\\"]+/, "string"],
      // recognized escape sequences (see `escapes`)
      [/@escapes/, "string.escape"],
      // any other backslash + character pair is flagged as an invalid escape
      [/\\./, "string.escape.invalid"],
      // closing quote
      [/"/, "string", "@pop"]
    ],
    // Body of a C++11 raw string. The root state enters this state under the name
    // "raw.<delimiter>", where <delimiter> is the text found between R" and the
    // opening "(" in the document being tokenized.
    raw: [
      // anything up to the next ")" is string content
      [/[^)]+/, "string.raw"],
      // Closing sequence: ")" + delimiter + '"'. $S2 is replaced with the delimiter
      // part of the state name and the result is compiled as a regular expression.
      // NOTE(review): the delimiter is inserted verbatim, so a delimiter containing
      // a regex metacharacter (e.g. "[", as in the input R"[())") produces an
      // invalid pattern such as /^(?:\)[\")/ and the tokenizer throws. This is the
      // failure described in this report.
      [/\)$S2\"/, { token: "string.raw.end", next: "@pop" }],
      // a ")" that does not start the closing sequence is ordinary content
      [/\)/, "string.raw"]
    ],
    // Inside a [[ ... ]] attribute (entered from the root state on "[[")
    annotation: [
      { include: "@whitespace" },
      // "using" and "alignas" are highlighted as keywords
      [/using|alignas/, "keyword"],
      // any other run of letters, digits or underscores is annotation text
      [/[a-zA-Z0-9_]+/, "annotation"],
      [/[,:]/, "delimiter"],
      [/[()]/, "@brackets"],
      // "]]" (whitespace allowed between the brackets) ends the attribute
      [/\]\s*\]/, { token: "annotation", next: "@pop" }]
    ],
    // Operand of #include (entered from the root state after the directive keyword)
    include: [
      // <header> form: optional whitespace, "<", the header name, ">".
      // The four capture groups map one-to-one to the four tokens below; the
      // closing delimiter returns to the previous state.
      [
        /(\s*)(<)([^<>]*)(>)/,
        [
          "",
          "keyword.directive.include.begin",
          "string.include.identifier",
          { token: "keyword.directive.include.end", next: "@pop" }
        ]
      ],
      // "header" form: same token layout, with double quotes as delimiters
      [
        /(\s*)(")([^"]*)(")/,
        [
          "",
          "keyword.directive.include.begin",
          "string.include.identifier",
          { token: "keyword.directive.include.end", next: "@pop" }
        ]
      ]
    ]
  },
};

Then paste the following into the 'Language Editor' pane on the right:

R"[())"

Reproduction Steps

Place the cursor anywhere on the Language Editor line (R"[())"), press F1 and select 'Inspect Tokens'. An internal exception is thrown; its stack trace is shown under 'Actual (Problematic) Behavior' below.

This issue originates from a bug reported against compiler-explorer (compiler-explorer/compiler-explorer#7178).

Actual (Problematic) Behavior

Typical exception stack (from monarch playground):

errors.ts:26  Uncaught Error: Invalid regular expression: /^(?:\)[\")/: Unterminated character class

SyntaxError: Invalid regular expression: /^(?:\)[\")/: Unterminated character class
    at new RegExp (<anonymous>)
    at Rule.regex (monarchCompile.ts:132:17)
    at Rule.resolveRegex (monarchCompile.ts:417:16)
    at MonarchTokenizer._myTokenize (monarchLexer.ts:646:39)
    at MonarchTokenizer._tokenize (monarchLexer.ts:500:16)
    at MonarchTokenizer.tokenizeEncoded (monarchLexer.ts:492:29)
    at safeTokenize (textModelTokens.ts:403:28)
    at TokenizerWithStateStoreAndTextModel.updateTokensUntilLine (textModelTokens.ts:69:14)
    at DefaultBackgroundTokenizer._tokenizeOneInvalidLine (textModelTokens.ts:515:33)
    at DefaultBackgroundTokenizer._backgroundTokenizeForAtLeast1ms (textModelTokens.ts:492:37)
    at new RegExp (<anonymous>)
    at Rule.regex (monarchCompile.ts:132:17)
    at Rule.resolveRegex (monarchCompile.ts:417:16)
    at MonarchTokenizer._myTokenize (monarchLexer.ts:646:39)
    at MonarchTokenizer._tokenize (monarchLexer.ts:500:16)
    at MonarchTokenizer.tokenizeEncoded (monarchLexer.ts:492:29)
    at safeTokenize (textModelTokens.ts:403:28)
    at TokenizerWithStateStoreAndTextModel.updateTokensUntilLine (textModelTokens.ts:69:14)
    at DefaultBackgroundTokenizer._tokenizeOneInvalidLine (textModelTokens.ts:515:33)
    at DefaultBackgroundTokenizer._backgroundTokenizeForAtLeast1ms (textModelTokens.ts:492:37)
    at errors.ts:26:12

Expected Behavior

Graceful failure, perhaps by not using user-supplied code as part of a regular expression (or by escaping it first).

Additional Context

No response

The text was updated successfully, but these errors were encountered:

OfekShilon mentioned this issue Dec 7, 2024

Monaco invalid regular expression compiler-explorer/compiler-explorer#7178

Open

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

[Bug] Invalid c++ code breaks monarch #4775

[Bug] Invalid c++ code breaks monarch #4775

OfekShilon commented Dec 7, 2024 •

edited

Loading

[Bug] Invalid c++ code breaks monarch #4775

[Bug] Invalid c++ code breaks monarch #4775

Comments

OfekShilon commented Dec 7, 2024 • edited Loading

Reproducible in vscode.dev or in VS Code Desktop?

Reproducible in the monaco editor playground?

Monaco Editor Playground Link

Monaco Editor Playground Code

Reproduction Steps

Actual (Problematic) Behavior

Expected Behavior

Additional Context

OfekShilon commented Dec 7, 2024 •

edited

Loading