From e6e7de4972731d42a3f12206859371d6c74b8e2c Mon Sep 17 00:00:00 2001 From: Shannon Rae <166186361+secretlyshannon@users.noreply.github.com> Date: Mon, 28 Apr 2025 14:36:16 -0700 Subject: [PATCH 1/6] Update compiler_test.go --- compiler/compiler_test.go | 270 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 270 insertions(+) diff --git a/compiler/compiler_test.go b/compiler/compiler_test.go index a97081165..301eef1d3 100644 --- a/compiler/compiler_test.go +++ b/compiler/compiler_test.go @@ -8,6 +8,7 @@ import ( "strings" "testing" + "github.com/inspirer/textmapper/grammar/grammar" "github.com/inspirer/textmapper/parsers/parsertest" "github.com/inspirer/textmapper/parsers/tm" "github.com/inspirer/textmapper/parsers/tm/ast" @@ -36,6 +37,7 @@ var testFiles = []string{ "inject.tmerr", "flexmode.tmerr", "max_la.tmerr", + "disabled_syntax.tmerr", } func TestErrors(t *testing.T) { @@ -191,3 +193,271 @@ func TestDebugInfo(t *testing.T) { } } } + +func TestArgRef(t *testing.T) { + header := ` + language medium(cc); + + namespace = "medium" + + :: lexer + + KW_A: /a/ + KW_B: /b/ + KW_C: /c/ + KW_D: /d/ + ',': /,/ + + :: parser + + %input Z; + + a {std::string}: KW_A; + b {int}: KW_B; + c {int*}: KW_C; + d {double}: KW_D; + ` + + testCases := []struct { + input string + // If not provided, default to the start symbol "Z". + symbol string + want [][]string + wantMulti [][][]string + }{ + // Section: Optional. + { + input: "Z: a? {};", + want: [][]string{{`$1[a]?`}}, + }, + { + input: `Z: c b?;`, + // No arg refs are collected because there are no semantic actions. + want: [][]string{}, + }, + { + input: `Z: (a b)? {};`, + want: [][]string{{`$1[a]?`, `$2[b]?`}}, + }, + // Mid rule. + { + input: `Z: b? {} c;`, + // Only b is collected because c is after the mid rule. + want: [][]string{{`$1[b]?`}}, + }, + // Mid rule and semantic action. + { + input: `Z: b? {} c {};`, + // For the mid rule, only b is collected; for the semantic action, both b and c are collected. 
+ want: [][]string{{`$1[b]?`}, {`$1[b]?`, `$2[c]`}}, + }, + // Duplicate symbol names. + { + input: `Z: (a a)? {};`, + want: [][]string{{`$1[a]?`, `$2[a]?`}}, + }, + // Duplicate symbol names. + { + input: `Z: a? a {};`, + want: [][]string{{`$1[a]?`, `$2[a]`}}, + }, + // Optional terminal. + { + input: `Z: KW_A? {};`, + want: [][]string{{`$1[KW_A]?`}}, + }, + // With state marker. + { + input: `Z: a? .my_state b {};`, + want: [][]string{{`$1[a]?`, `$2[b]`}}, + }, + // Section: List + { + input: `Z: a+ {};`, + want: [][]string{{`$1[a_list]`}}, + }, + { + input: `Z: a* {};`, + want: [][]string{{`$1[a_optlist]`}}, + }, + // List with separator. + { + input: `Z: (a separator ',')+ {};`, + want: [][]string{{`$1[a_list_Comma_separated]`}}, + }, + // List with separator. + { + input: `Z: (a separator ',')* {};`, + want: [][]string{{`$1[a_list_Comma_separatedopt]`}}, + }, + // List of terminals. + { + input: `Z: KW_A+ {};`, + want: [][]string{{`$1[KWA_list]`}}, + }, + // Semantic action inside list. + { + input: `Z: ( a {} )+ {};`, + symbol: "a_list", + want: [][]string{{`$1[a]`}}, + }, + // Section: Alternating group. + { + input: `Z: (a | b) {};`, + want: [][]string{{`$1[a]?`, `$2[b]?`}}, + }, + { + // Commands in alternating groups. + // cmd1 only has access to b, and cmd2 only has access to c. cmd has access to all a, b, and + // c. + input: `Z: a ( b {cmd1} | c {cmd2} ) {cmd3};`, + wantMulti: [][][]string{ + // cmd1 + {{`$2[b]?`}, {`$1[a]`, `$2[b]?`, `$3[c]?`}}, + // cmd2 + {{`$3[c]?`}, {`$1[a]`, `$2[b]?`, `$3[c]?`}}, + }, + }, + // Terminals in alternating group. + { + input: `Z: (KW_A | KW_B) {};`, + want: [][]string{{`$1[KW_A]?`, `$2[KW_B]?`}}, + }, + // Section: Nested syntax extensions. + { + input: `Z: (a? 
| b) {};`, + want: [][]string{{`$1[a]?`, `$2[b]?`}}, + }, + { + input: `Z: (a+ | b*) {};`, + want: [][]string{{`$1[a_list]?`, `$2[b_optlist]?`}}, + }, + { + // Nested choice inside a list + input: `Z: ( a {cmd1} | b {cmd2} )+ {cmd3};`, + want: [][]string{{`$1[Z$1]`}}, + }, + { + // List inside nested choice + input: `Z: (a+ {cmd1} | (b{cmd2})* ) {};`, + wantMulti: [][][]string{ + // (a+ {cmd1}) {}; + {{`$1[a_list]?`}, {`$1[a_list]?`, `$2[b_optlist]?`}}, + // b{cmd2}* {}; + {{`$1[a_list]?`, `$2[b_optlist]?`}}, + }, + }, + { + input: `Z: (a? b)+ {};`, + want: [][]string{{`$1[Z$1]`}}, + }, + { + input: `Z: (a | b)* {};`, + want: [][]string{{`$1[Z$1]`}}, + }, + // Section: Set. + { + input: `Z: set(KW_A | KW_B) {};`, + want: [][]string{{`$1[setof_KW_A_or_KW_B]`}}, + }, + } + + for _, tc := range testCases { + input := header + tc.input + parsed, err := parseToGrammar(input) + + if err != nil { + t.Fatalf("cannot parse %q: %v", input, err) + } + + nts := parsed.Parser.Nonterms + + var sym string + if tc.symbol != "" { + sym = tc.symbol + } else { + sym = "Z" + } + nt := getNt(nts, sym) + if nt == nil { + t.Fatalf("cannot find the start symbol Z") + } + rules := []*syntax.Expr{nt.Value} + if nt.Value.Kind == syntax.Choice { + rules = nt.Value.Sub + } + + for i, rule := range rules { + got := gotArgRefs(rule, parsed) + var want string + if tc.wantMulti != nil { + want = fmt.Sprintf("%+v", tc.wantMulti[i]) + } else { + want = fmt.Sprintf("%+v", tc.want) + } + if got != want { + t.Errorf("got %v, want %v for input %q", got, want, tc.input) + } + } + } +} + +// A convenience function to parse a grammar string and return the corresponding model. 
+func parseToGrammar(content string) (*grammar.Grammar, error) { + ctx := context.Background() + filename := "test.tm" + _, err := ast.Parse(ctx, filename, content, tm.StopOnFirstError) + if err != nil { + return nil, fmt.Errorf("%v: parsing failed with %v", filename, err) + } + + return Compile(ctx, filename, content, Params{DebugTables: true}) +} + +func getNt(nts []*syntax.Nonterm, name string) *syntax.Nonterm { + for _, nt := range nts { + if nt.Name == name { + return nt + } + } + return nil +} + +func serializeArgRef(ref syntax.ArgRef, grammar *grammar.Grammar) string { + ret := fmt.Sprintf("$%v[%v]", ref.Pos, grammar.Syms[ref.Symbol].Name) + if ref.Optional { + ret += "?" + } + return ret +} + +func serializeArgRefs(refs map[int]syntax.ArgRef, grammar *grammar.Grammar) string { + var keys []int + for k := range refs { + keys = append(keys, k) + } + sort.Ints(keys) + + var ret []string + for _, pos := range keys { + ret = append(ret, serializeArgRef(refs[pos], grammar)) + } + return "[" + strings.Join(ret, " ") + "]" +} + +func gotArgRefs(e *syntax.Expr, grammar *grammar.Grammar) string { + var collect func(e *syntax.Expr) + collected := make([]map[int]syntax.ArgRef, 0) + collect = func(e *syntax.Expr) { + if e.CmdArgs != nil && e.CmdArgs.ArgRefs != nil { + collected = append(collected, e.CmdArgs.ArgRefs) + } + } + + e.ForEach(-1, collect) + var ret []string + for _, argRefs := range collected { + ret = append(ret, serializeArgRefs(argRefs, grammar)) + } + return "[" + strings.Join(ret, " ") + "]" +} From 5893d5778275ef4e371d7adadd263f40ecaa5e6d Mon Sep 17 00:00:00 2001 From: Shannon Rae <166186361+secretlyshannon@users.noreply.github.com> Date: Mon, 28 Apr 2025 15:16:07 -0700 Subject: [PATCH 2/6] Upstream changes to enables C++ semantic actions to co-exist with optional, choice, and list syntaxes for ZetaSQL --- compiler/compiler.go | 80 ++++++--- compiler/options.go | 2 + compiler/syntax.go | 253 +++++++++++++++++++++++------ gen/funcs.go | 185 
++++++++++++++------- gen/funcs_test.go | 46 +++--- gen/templates/cc_parser_cc.go.tmpl | 29 ++-- gen/templates/cc_parser_h.go.tmpl | 19 ++- grammar/grammar.go | 85 ++++++++-- syntax/expand.go | 253 +++++++++++++++++++++++++++-- syntax/set_test.go | 2 +- syntax/syntax.go | 68 +++++++- syntax/syntax_test.go | 28 ++-- 12 files changed, 820 insertions(+), 230 deletions(-) diff --git a/compiler/compiler.go b/compiler/compiler.go index c94031d99..605d164dd 100644 --- a/compiler/compiler.go +++ b/compiler/compiler.go @@ -211,14 +211,42 @@ func checkLookaheads(m *syntax.Model, maxSize int) error { return s.Err() } +func checkSyntaxes(m *syntax.Model, opts *grammar.Options) error { + var s status.Status + disabled := make(map[string]bool) + for _, kind := range opts.DisableSyntax { + disabled[kind] = true + } + var visit func(e *syntax.Expr, top bool) + visit = func(e *syntax.Expr, top bool) { + if disabled["NestedChoice"] && e.Kind == syntax.Choice && !top { + s.Errorf(e.Origin, "parenthesized Choice operator is not supported") + } + var kind = e.Kind.GoString() + if disabled[kind] { + s.Errorf(e.Origin, "syntax %v is not supported", kind) + } + for _, sub := range e.Sub { + visit(sub, false) + } + } + for _, nt := range m.Nonterms { + if len(nt.Params) > 0 && disabled["Templates"] { + s.Errorf(nt.Origin, "templates are not supported") + } + visit(nt.Value, true) + } + return s.Err() +} + func (c *compiler) compileParser(file ast.File) { p, ok := file.Parser() if !ok || !c.out.Options.GenParser { // Lexer-only grammar. return } - - loader := newSyntaxLoader(c.resolver, c.out.Options, c.Status) + target, _ := file.Header().Target() + loader := newSyntaxLoader(c.resolver, target.Text(), c.out.Options, c.Status) loader.load(p, file.Header()) if c.Err() != nil { // Parsing errors cause inconsistencies inside c.source. Aborting. 
@@ -228,6 +256,15 @@ func (c *compiler) compileParser(file ast.File) { c.out.Parser.Prec = loader.prec source := loader.out + + if len(c.out.Options.DisableSyntax) > 0 { + err := checkSyntaxes(source, c.out.Options) + if err != nil { + c.AddError(err) + return + } + } + if err := syntax.PropagateLookaheads(source); err != nil { c.AddError(err) return @@ -273,7 +310,7 @@ func (c *compiler) compileParser(file ast.File) { } } - if err := syntax.Expand(source); err != nil { + if err := syntax.Expand(source, loader.expandOpts); err != nil { c.AddError(err) return } @@ -487,15 +524,11 @@ func generateTables(source *syntax.Model, out *grammar.Grammar, opts genOptions, for _, r := range rule.RHS { if r.IsStateMarker() { s.Errorf(origin, "mixing mid-rule actions with state markers is not supported") - continue - } - if int(r) < len(out.Syms) { - vars.Types = append(vars.Types, out.Syms[r].Type) } else { - // No types for extracted commands. - vars.Types = append(vars.Types, "") + vars.SymRefCount++ } } + addTypes(vars, out.Syms) } cmdNT := midrule.extract(nt, command, vars, cmdOrigin) rule.RHS = append(rule.RHS, cmdNT) @@ -549,16 +582,11 @@ func generateTables(source *syntax.Model, out *grammar.Grammar, opts genOptions, if args != nil { act.Vars = &grammar.ActionVars{CmdArgs: *args, Remap: actualPos} for _, r := range rule.RHS { - if r.IsStateMarker() { - continue - } - if int(r) < len(out.Syms) { - act.Vars.Types = append(act.Vars.Types, out.Syms[r].Type) - } else { - // No types for extracted commands. - act.Vars.Types = append(act.Vars.Types, "") + if !r.IsStateMarker() { + act.Vars.SymRefCount++ } } + addTypes(act.Vars, out.Syms) act.Vars.LHSType = out.Syms[rule.LHS].Type } rule.Action = len(parser.Actions) @@ -584,6 +612,20 @@ func generateTables(source *syntax.Model, out *grammar.Grammar, opts genOptions, return err } +// addTypes updates the `Types` field of the given action variables using the type information +// from the given symbols `syms`. 
+func addTypes(vars *grammar.ActionVars, syms []grammar.Symbol) { + vars.Types = make(map[int]string) + for _, ref := range vars.CmdArgs.ArgRefs { + if ref.Symbol < len(syms) { + vars.Types[ref.Pos] = syms[ref.Symbol].Type + } else { + // No types for extracted commands. + vars.Types[ref.Pos] = "" + } + } +} + type commandExtractor struct { baseSyms int takenName map[string]bool @@ -606,7 +648,7 @@ type commandKey struct { func newCommandExtractor(m *syntax.Model, baseSyms int) *commandExtractor { taken := make(map[string]bool) for _, t := range m.Terminals { - taken[t] = true + taken[t.Name] = true } for _, p := range m.Params { taken[p.Name] = true @@ -645,7 +687,7 @@ func (e *commandExtractor) extract(n *syntax.Nonterm, command string, vars *gram // Give a hint to the code generator that this rule's rhs starts // earlier in the stack. - args.Delta = -len(vars.Types) + args.Delta = -vars.SymRefCount // Make a copy. copy := *vars diff --git a/compiler/options.go b/compiler/options.go index 7645458a2..5e0f5f66e 100644 --- a/compiler/options.go +++ b/compiler/options.go @@ -103,6 +103,8 @@ func (p *optionsParser) parseFrom(file ast.File) { opts.NoEmptyRules = p.parseExpr(opt.Value(), opts.NoEmptyRules).(bool) case "maxLookahead": opts.MaxLookahead = p.parseExpr(opt.Value(), opts.MaxLookahead).(int) + case "disableSyntax": + opts.DisableSyntax = p.parseExpr(opt.Value(), opts.DisableSyntax).([]string) case "eventFields": p.validLangs(opt.Key(), "go") opts.EventFields = p.parseExpr(opt.Value(), opts.EventFields).(bool) diff --git a/compiler/syntax.go b/compiler/syntax.go index 061078d8b..ca82bd939 100644 --- a/compiler/syntax.go +++ b/compiler/syntax.go @@ -36,21 +36,31 @@ type syntaxLoader struct { nonterms map[string]int // -> index in source.Nonterms cats map[string]int // -> index in source.Cats paramPerm []int // for parameter permutations - rhsPos int // Counter for positional index of a reference in the current rule. 
- rhsNames map[string]int + ruleStack []*rhsRule + + expandOpts *syntax.ExpandOptions } -func newSyntaxLoader(resolver *resolver, opts *grammar.Options, s *status.Status) *syntaxLoader { +func newSyntaxLoader(resolver *resolver, targetLang string, opts *grammar.Options, s *status.Status) *syntaxLoader { + var expandOpts *syntax.ExpandOptions + switch targetLang { + case "cc": + expandOpts = syntax.CcExpandOptions() + default: + expandOpts = &syntax.ExpandOptions{} + } + return &syntaxLoader{ resolver: resolver, noEmptyRules: opts.NoEmptyRules, optSuffix: opts.OptInstantiationSuffix, Status: s, - namedSets: make(map[string]int), - params: make(map[string]int), - nonterms: make(map[string]int), - cats: make(map[string]int), + namedSets: make(map[string]int), + params: make(map[string]int), + nonterms: make(map[string]int), + cats: make(map[string]int), + expandOpts: expandOpts, } } @@ -497,13 +507,24 @@ func (c *syntaxLoader) instantiateOpt(name string, origin ast.Symref) (int, bool var ref *syntax.Expr target := strings.TrimSuffix(name, c.optSuffix) + var sym int + var symType string if index, ok := c.resolver.syms[target]; ok { - nt.Type = c.resolver.Syms[index].Type - ref = &syntax.Expr{Kind: syntax.Reference, Symbol: index, Origin: origin, Model: c.out} + // Opt-terminal is also supported, e.g. KW_Aopt for KW_A. 
+ sym = index + symType = c.resolver.Syms[sym].Type + if c.expandOpts.OptionalType != nil { + nt.Type = c.expandOpts.OptionalType(symType) + } + ref = &syntax.Expr{Kind: syntax.Reference, Symbol: sym, Origin: origin, Model: c.out, Pos: 1} } else if nonterm, ok := c.nonterms[target]; ok { - nt.Type = c.out.Nonterms[nonterm].Type + sym = c.resolver.NumTokens + nonterm + symType = c.out.Nonterms[nonterm].Type + if c.expandOpts.OptionalType != nil { + nt.Type = c.expandOpts.OptionalType(symType) + } nt.Params = c.out.Nonterms[nonterm].Params - ref = &syntax.Expr{Kind: syntax.Reference, Symbol: c.resolver.NumTokens + nonterm, Origin: origin, Model: c.out} + ref = &syntax.Expr{Kind: syntax.Reference, Symbol: sym, Origin: origin, Model: c.out, Pos: 1} for _, param := range nt.Params { ref.Args = append(ref.Args, syntax.Arg{Param: param, TakeFrom: param}) } @@ -513,6 +534,15 @@ func (c *syntaxLoader) instantiateOpt(name string, origin ast.Symref) (int, bool } nt.Value = &syntax.Expr{Kind: syntax.Optional, Sub: []*syntax.Expr{ref}, Origin: origin} + if nt.Type != "" && c.expandOpts.OptionalCmd != nil { + refs := map[int]syntax.ArgRef{ + 1: syntax.ArgRef{Pos: 1, Kind: "reference", Optional: true, Symbol: sym}, + } + cmdArgs := &syntax.CmdArgs{MaxPos: 2, Names: map[string]int{target: 1}, ArgRefs: refs} + cmd := &syntax.Expr{Kind: syntax.Command, Name: c.expandOpts.OptionalCmd(symType), CmdArgs: cmdArgs, Origin: origin} + nt.Value = &syntax.Expr{Kind: syntax.Sequence, Sub: []*syntax.Expr{nt.Value, cmd}, Origin: origin} + } + c.nonterms[name] = len(c.out.Nonterms) index := c.resolver.NumTokens + len(c.out.Nonterms) c.out.Nonterms = append(c.out.Nonterms, nt) @@ -727,60 +757,79 @@ func (c *syntaxLoader) convertSeparator(sep ast.ListSeparator) *syntax.Expr { } } -func (c *syntaxLoader) allocatePos() int { - ret := c.rhsPos - c.rhsPos++ - return ret +func (c *syntaxLoader) allocatePos(underOpts bool, kind string, sym int) int { + rule := c.currentRule() + pos := rule.nextPos() + 
rule.incPos() + ref := syntax.ArgRef{Pos: pos, Kind: kind, Optional: underOpts, Symbol: sym} + rule.argRefs = append(rule.argRefs, ref) + return pos } func (c *syntaxLoader) pushName(name string, pos int) { - if c.rhsNames == nil { - c.rhsNames = make(map[string]int) + rule := c.currentRule() + // Names need to be unique across the top-level rule. + var topNames map[string]int + names := rule.names + if rule.top == nil { + topNames = rule.names + } else { + topNames = rule.top.names } var index int - if _, ok := c.rhsNames[name+"#0"]; ok { + if _, ok := topNames[name+"#0"]; ok { for { index++ - if _, ok := c.rhsNames[fmt.Sprintf("%v#%v", name, index)]; !ok { + if _, ok := topNames[fmt.Sprintf("%v#%v", name, index)]; !ok { break } } - } else if val, ok := c.rhsNames[name]; ok { - c.rhsNames[name+"#0"] = val - delete(c.rhsNames, name) + } else if val, ok := topNames[name]; ok { + topNames[name+"#0"] = val + names[name+"#0"] = val + delete(topNames, name) + delete(names, name) index = 1 } if index > 0 { name = fmt.Sprintf("%v#%v", name, index) } - c.rhsNames[name] = pos + topNames[name] = pos + names[name] = pos } -func (c *syntaxLoader) convertPart(p ast.RhsPart, nonterm *syntax.Nonterm) *syntax.Expr { +func (c *syntaxLoader) convertPart(p ast.RhsPart, nonterm *syntax.Nonterm, underOpts bool) *syntax.Expr { + rhs := c.currentRule() switch p := p.(type) { case *ast.Command: - args := &syntax.CmdArgs{MaxPos: c.rhsPos} - if len(c.rhsNames) > 0 { + args := &syntax.CmdArgs{MaxPos: rhs.nextPos()} + if len(rhs.names) > 0 { // Only names and references preceding the command are available to its code. // Note: the list below can include entities from a different alternative but // they'll be automatically filtered later on. 
args.Names = make(map[string]int) - for k, v := range c.rhsNames { + for k, v := range rhs.names { args.Names[k] = v } } + if len(rhs.argRefs) > 0 { + args.ArgRefs = make(map[int]syntax.ArgRef) + for _, argRef := range rhs.argRefs { + args.ArgRefs[argRef.Pos] = argRef + } + } text := p.Text() return &syntax.Expr{Kind: syntax.Command, Name: text, CmdArgs: args, Origin: p} case *ast.RhsAssignment: - inner := c.convertPart(p.Inner(), nonterm) + inner := c.convertPart(p.Inner(), nonterm, underOpts) name := p.Id().Text() subs := []*syntax.Expr{inner} return &syntax.Expr{Kind: syntax.Assign, Name: name, Sub: subs, Origin: p} case *ast.RhsPlusAssignment: - subs := []*syntax.Expr{c.convertPart(p.Inner(), nonterm)} + subs := []*syntax.Expr{c.convertPart(p.Inner(), nonterm, underOpts)} return &syntax.Expr{Kind: syntax.Append, Name: p.Id().Text(), Sub: subs, Origin: p} case *ast.RhsAlias: - ret := c.convertPart(p.Inner(), nonterm) + ret := c.convertPart(p.Inner(), nonterm, underOpts) name := p.Name().Text() if ret.Pos > 0 { @@ -808,39 +857,50 @@ func (c *syntaxLoader) convertPart(p ast.RhsPart, nonterm *syntax.Nonterm) *synt } return &syntax.Expr{Kind: syntax.Lookahead, Sub: subs, Origin: p} case *ast.RhsNested: - return c.convertRules(p.Rule0(), nonterm, report{} /*defaultReport*/, false /*topLevel*/, p) + return c.convertRules(p.Rule0(), nonterm, report{} /*defaultReport*/, false /*topLevel*/, underOpts, p) case *ast.RhsOptional: - subs := []*syntax.Expr{c.convertPart(p.Inner(), nonterm)} + subs := []*syntax.Expr{c.convertPart(p.Inner(), nonterm, true /*underOpts*/)} return &syntax.Expr{Kind: syntax.Optional, Sub: subs, Origin: p} case *ast.RhsPlusList: - seq := c.convertSequence(p.RuleParts(), nonterm, false /*topLevel*/, p) + c.pushRule(true /*topLevel*/) + seq := c.convertSequence(p.RuleParts(), nonterm, false /*topLevel*/, underOpts, p) + c.popRule() subs := []*syntax.Expr{seq} if sep := c.convertSeparator(p.ListSeparator()); sep.Kind != syntax.Empty { subs = 
[]*syntax.Expr{seq, sep} } - return &syntax.Expr{Kind: syntax.List, Sub: subs, ListFlags: syntax.OneOrMore, Pos: c.allocatePos(), Origin: p} + return &syntax.Expr{Kind: syntax.List, Sub: subs, ListFlags: syntax.OneOrMore, Pos: c.allocatePos(underOpts, "plusList", -1 /*sym*/), Origin: p} case *ast.RhsStarList: - seq := c.convertSequence(p.RuleParts(), nonterm, false /*topLevel*/, p) + c.pushRule(true /*topLevel*/) + seq := c.convertSequence(p.RuleParts(), nonterm, false /*topLevel*/, underOpts, p) + c.popRule() subs := []*syntax.Expr{seq} if sep := c.convertSeparator(p.ListSeparator()); sep.Kind != syntax.Empty { subs = []*syntax.Expr{seq, sep} } - return &syntax.Expr{Kind: syntax.List, Sub: subs, Pos: c.allocatePos(), Origin: p} + return &syntax.Expr{Kind: syntax.List, Sub: subs, Pos: c.allocatePos(underOpts, "starList", -1 /*sym*/), Origin: p} case *ast.RhsPlusQuantifier: - subs := []*syntax.Expr{c.convertPart(p.Inner(), nonterm)} - return &syntax.Expr{Kind: syntax.List, Sub: subs, ListFlags: syntax.OneOrMore, Pos: c.allocatePos(), Origin: p} + c.pushRule(true /*topLevel*/) + subs := []*syntax.Expr{c.convertPart(p.Inner(), nonterm, underOpts)} + c.popRule() + return &syntax.Expr{Kind: syntax.List, Sub: subs, ListFlags: syntax.OneOrMore, Pos: c.allocatePos(underOpts, "plusQuantifier", -1 /*sym*/), Origin: p} case *ast.RhsStarQuantifier: - subs := []*syntax.Expr{c.convertPart(p.Inner(), nonterm)} - return &syntax.Expr{Kind: syntax.List, Sub: subs, Pos: c.allocatePos(), Origin: p} + c.pushRule(true /*topLevel*/) + subs := []*syntax.Expr{c.convertPart(p.Inner(), nonterm, underOpts)} + c.popRule() + return &syntax.Expr{Kind: syntax.List, Sub: subs, Pos: c.allocatePos(underOpts, "starQuantifier", -1 /*sym*/), Origin: p} case *ast.RhsSet: + c.pushRule(true /*topLevel*/) set := c.convertSet(p.Expr()) + c.popRule() index := len(c.out.Sets) c.out.Sets = append(c.out.Sets, set) - return &syntax.Expr{Kind: syntax.Set, Pos: c.allocatePos(), SetIndex: index, Origin: p, Model: 
c.out} + return &syntax.Expr{Kind: syntax.Set, Pos: c.allocatePos(underOpts, "set", -1 /*sym*/), SetIndex: index, Origin: p, Model: c.out} case *ast.RhsSymbol: sym, args := c.resolveRef(p.Reference(), nonterm) - c.pushName(p.Reference().Name().Text(), c.rhsPos) - return &syntax.Expr{Kind: syntax.Reference, Symbol: sym, Args: args, Pos: c.allocatePos(), Origin: p, Model: c.out} + pos := c.allocatePos(underOpts, "reference", sym) + c.pushName(p.Reference().Name().Text(), pos) + return &syntax.Expr{Kind: syntax.Reference, Symbol: sym, Args: args, Pos: pos, Origin: p, Model: c.out} case *ast.StateMarker: return &syntax.Expr{Kind: syntax.StateMarker, Name: p.Name().Text(), Origin: p} case *ast.SyntaxProblem: @@ -851,7 +911,7 @@ func (c *syntaxLoader) convertPart(p ast.RhsPart, nonterm *syntax.Nonterm) *synt return &syntax.Expr{Kind: syntax.Empty, Origin: p.TmNode()} } -func (c *syntaxLoader) convertSequence(parts []ast.RhsPart, nonterm *syntax.Nonterm, topLevel bool, origin status.SourceNode) *syntax.Expr { +func (c *syntaxLoader) convertSequence(parts []ast.RhsPart, nonterm *syntax.Nonterm, topLevel, underOpts bool, origin status.SourceNode) *syntax.Expr { var subs []*syntax.Expr var empty *ast.RhsEmpty var nonEmpty bool @@ -874,7 +934,7 @@ func (c *syntaxLoader) convertSequence(parts []ast.RhsPart, nonterm *syntax.Nont nonEmpty = true } - out := c.convertPart(p, nonterm) + out := c.convertPart(p, nonterm, underOpts) if out.Kind != syntax.Empty { subs = append(subs, out) } @@ -932,8 +992,14 @@ func (c *syntaxLoader) isSelector(name string) bool { return ok } -func (c *syntaxLoader) convertRules(rules []ast.Rule0, nonterm *syntax.Nonterm, defaultReport report, topLevel bool, origin status.SourceNode) *syntax.Expr { +func (c *syntaxLoader) convertRules(rules []ast.Rule0, nonterm *syntax.Nonterm, defaultReport report, topLevel, underOpts bool, origin status.SourceNode) *syntax.Expr { var subs []*syntax.Expr + + if !topLevel && len(rules) > 1 { + // This is a nested 
choice, e.g. the "(a | b)" in "start: (a | b) c". + underOpts = true + } + for _, rule0 := range rules { rule, ok := rule0.(*ast.Rule) if !ok { @@ -941,11 +1007,7 @@ func (c *syntaxLoader) convertRules(rules []ast.Rule0, nonterm *syntax.Nonterm, continue } - if topLevel { - // Counting of RHS symbols does not restart for inline alternatives. - c.rhsPos = 1 - c.rhsNames = nil - } + c.pushRule(topLevel) var prec *ast.RhsPrec for _, p := range rule.RhsPart() { switch p := p.(type) { @@ -958,7 +1020,7 @@ func (c *syntaxLoader) convertRules(rules []ast.Rule0, nonterm *syntax.Nonterm, } } - expr := c.convertSequence(rule.RhsPart(), nonterm, topLevel, rule) + expr := c.convertSequence(rule.RhsPart(), nonterm, topLevel, underOpts, rule) clause, _ := rule.ReportClause() expr = c.convertReportClause(clause).withDefault(defaultReport).apply(expr) if prec != nil && topLevel { @@ -977,6 +1039,7 @@ func (c *syntaxLoader) convertRules(rules []ast.Rule0, nonterm *syntax.Nonterm, } subs = append(subs, expr) + c.popRule() } switch len(subs) { case 0: @@ -994,7 +1057,7 @@ func (c *syntaxLoader) convertRules(rules []ast.Rule0, nonterm *syntax.Nonterm, func (c *syntaxLoader) load(p ast.ParserSection, header status.SourceNode) { c.out = new(syntax.Model) for _, sym := range c.resolver.Syms { - c.out.Terminals = append(c.out.Terminals, sym.ID) + c.out.Terminals = append(c.out.Terminals, syntax.Terminal{Name: sym.ID, Type: sym.Type}) } c.collectParams(p) nonterms := c.collectNonterms(p) @@ -1023,7 +1086,7 @@ func (c *syntaxLoader) load(p ast.ParserSection, header status.SourceNode) { c.Errorf(alias, "nonterminal aliases are not yet supported") } defaultReport := c.convertReportClause(clause) - expr := c.convertRules(nt.def.Rule0(), c.out.Nonterms[nt.nonterm], defaultReport, true /*topLevel*/, nt.def) + expr := c.convertRules(nt.def.Rule0(), c.out.Nonterms[nt.nonterm], defaultReport, true /*topLevel*/, false /*underOpts*/, nt.def) c.out.Nonterms[nt.nonterm].Value = 
or(c.out.Nonterms[nt.nonterm].Value, expr) } } @@ -1046,3 +1109,85 @@ func or(a, b *syntax.Expr) *syntax.Expr { } return &syntax.Expr{Kind: syntax.Choice, Sub: []*syntax.Expr{a, b}, Origin: b.Origin} } + +type rhsRule struct { + top *rhsRule // The top-level rule this rule is nested under. Nil if this is a top-level rule. + pos int // The next position to be allocated. Populated only for top-level rules. + names map[string]int // name -> position. Contains the names visible to the command of this rule. + argRefs []syntax.ArgRef // The argument references visible to the command of this rule. +} + +// nextPos returns the next position to be allocated w.r.t. the top-level rule. +func (r *rhsRule) nextPos() int { + if r.top == nil { + return r.pos + } + return r.top.pos +} + +func (r *rhsRule) incPos() { + if r.top == nil { + r.pos++ + } else { + r.top.pos++ + } +} + +func (r *rhsRule) isTopLevel() bool { + return r.top == nil +} + +func (c *syntaxLoader) pushRule(topLevel bool) { + var rule *rhsRule + if topLevel { + rule = &rhsRule{ + pos: 1, + names: make(map[string]int), + } + } else { + p := c.ruleStack[len(c.ruleStack)-1] + var top *rhsRule + if p.top == nil { + top = p + } else { + top = p.top + } + rule = &rhsRule{ + top: top, + names: make(map[string]int), + } + } + c.ruleStack = append(c.ruleStack, rule) +} + +func (c *syntaxLoader) currentRule() *rhsRule { + return c.ruleStack[len(c.ruleStack)-1] +} + +func (c *syntaxLoader) popRule() { + rule := c.ruleStack[len(c.ruleStack)-1] + c.ruleStack = c.ruleStack[:len(c.ruleStack)-1] + + if rule.top == nil { + return + } + + // This is a nested rule. Add the names and arg refs to the parent rule so that they are + // accessible to the command of the parent rule. + // + // For example, if we have: + // + // start: a ( b {cmd1} | c {cmd2} ) {cmd3} + // + // both "b" and "c" should be accessible by cmd3 as well. 
+ p := c.ruleStack[len(c.ruleStack)-1] + if p.top != nil { + // The `names` field of a top-level rule is already populated by pushName(). + for name, pos := range rule.names { + p.names[name] = pos + } + } + for _, ref := range rule.argRefs { + p.argRefs = append(p.argRefs, ref) + } +} diff --git a/gen/funcs.go b/gen/funcs.go index 1d3f3d919..bdf64790f 100644 --- a/gen/funcs.go +++ b/gen/funcs.go @@ -231,6 +231,15 @@ func sub(a, b int) int { return a - b } +func indexToPos(i int, remap map[int]int) int { + for pos, idx := range remap { + if idx == i { + return pos + } + } + return 0 +} + func goParserAction(s string, args *grammar.ActionVars, origin status.SourceNode) (string, error) { var decls strings.Builder var sb strings.Builder @@ -254,18 +263,19 @@ func goParserAction(s string, args *grammar.ActionVars, origin status.SourceNode } var index int + var pos int switch id { case "left()", "leftRaw()": index = -2 case "first()": - if len(args.Types) == 0 { + if args.SymRefCount == 0 { index = -1 } case "last()": - if len(args.Types) == 0 { + if args.SymRefCount == 0 { index = -1 } else { - index = len(args.Types) - 1 + index = args.SymRefCount - 1 } default: if strings.HasPrefix(id, "self[") && strings.HasSuffix(id, "]") { @@ -275,11 +285,20 @@ func goParserAction(s string, args *grammar.ActionVars, origin status.SourceNode } } - var ok bool - index, ok = args.Resolve(id) + ref, ok := args.Resolve(id) if !ok { return "", status.Errorf(origin, "invalid reference %q", id) } + index = ref.Index + pos = ref.Pos + } + + // We are trying to locate the first or last symbol from RHS. 
+ if pos == 0 && index >= 0 { + pos = indexToPos(index, args.Remap) + if pos == 0 { + return "", status.Errorf(origin, "internal error: cannot find the position for index %v", index) + } } if index == -1 { @@ -294,7 +313,7 @@ func goParserAction(s string, args *grammar.ActionVars, origin status.SourceNode if index == -2 { v = "lhs" } else { - v = fmt.Sprintf("stack[len(stack)-%v]", len(args.Types)-index) + v = fmt.Sprintf("stack[len(stack)-%v]", args.SymRefCount-index) } switch { case prop == "sym": @@ -302,10 +321,10 @@ func goParserAction(s string, args *grammar.ActionVars, origin status.SourceNode case prop == "value": v += ".value" switch { - case index >= 0 && args.Types[index] != "": + case index >= 0 && args.Types[pos] != "": varName := fmt.Sprintf("nn%v", index) if !seen[index] { - fmt.Fprintf(&decls, "%v, _ := %v.(%v)\n", varName, v, args.Types[index]) + fmt.Fprintf(&decls, "%v, _ := %v.(%v)\n", varName, v, args.Types[pos]) seen[index] = true } v = varName @@ -326,6 +345,16 @@ func goParserAction(s string, args *grammar.ActionVars, origin status.SourceNode return decls.String() + sb.String(), nil } +func ccWrapInOptional(argType, input string) string { + return fmt.Sprintf("std::optional<%v>(%v)", argType, input) +} + +// ccTypeFromUnion returns the type of the union field, without the last ID, e.g. the "int" in +// "int x". 
+func ccTypeFromUnion(unionField string) string { + return strings.TrimSpace(unionField[:len(unionField)-len(lastID(unionField))]) +} + func ccParserAction(s string, args *grammar.ActionVars, origin status.SourceNode, variantStackEntry bool) (ret string, err error) { defer func(s string) { if r := recover(); r != nil { @@ -349,74 +378,95 @@ func ccParserAction(s string, args *grammar.ActionVars, origin status.SourceNode } // Handle the rest of this '$' or '@' - var target, prop string + + // $$ --> lhs.value if s[0] == '$' { - // $$ --> lhs.value - target = "lhs" + var replacement string if ch == '@' { - prop = "sym.location" + replacement = "lhs.sym.location" } else { t := args.LHSType if t == "" { return "", status.Errorf(origin, "$$ cannot be used inside a nonterminal semantic action without a type") } if variantStackEntry { - prop = "std::get<" + t + ">(" + target + ".value)" - target = "" + replacement = "std::get<" + t + ">(lhs.value)" } else { - prop = "value." + lastID(t) + replacement = "lhs.value." + lastID(t) } } s = s[1:] - } else { - var d int - r, w := utf8.DecodeRuneInString(s) - for unicode.IsDigit(r) || unicode.IsLetter(r) || r == '_' { - d += w - r, w = utf8.DecodeRuneInString(s[d:]) + sb.WriteString(replacement) + continue + } + + // RHS symbol references, e.g. $1, @a. + var d int + r, w := utf8.DecodeRuneInString(s) + for unicode.IsDigit(r) || unicode.IsLetter(r) || r == '_' { + d += w + r, w = utf8.DecodeRuneInString(s[d:]) + } + if d == 0 { + return "", status.Errorf(origin, "%c should be followed by a number or identifier", ch) + } + val := s[:d] + s = s[d:] + + // cc uses 1-based indexing. 
+ ref, ok := args.ResolveOneBased(val) + if !ok { + return "", status.Errorf(origin, "invalid reference %c%q", ch, val) + } + + index := ref.Index + pos := ref.Pos + + argType := args.Types[pos] + + // The symbol reference is valid in the original rule but is not present in the expanded + // rule, so it references an optional symbol either expanded from a Choice or an Optional. + if index == -1 { + // Use std::optional() as the semantic value for the non-present symbol. + if ch == '@' { + sb.WriteString(ccWrapInOptional("decltype(lhs.sym.location)", "")) + continue } - if d == 0 { - return "", status.Errorf(origin, "%c should be followed by a number or identifier", ch) + + if argType == "" { + return "", status.Errorf(origin, "symbol %c%q does not have an associated type", ch, val) } - val := s[:d] - s = s[d:] - var index int - if pos, err := strconv.Atoi(val); err == nil { - if pos < 1 || pos >= args.CmdArgs.MaxPos { - // Index out of range. - return "", status.Errorf(origin, "out of bounds reference %c%v [max = %v]", ch, val, args.CmdArgs.MaxPos) - } - index = pos - 1 - } else { - // Resolve by name - var ok bool - index, ok = args.Resolve(val) - if !ok { - return "", status.Errorf(origin, "invalid reference %c%q", ch, val) - } + if !variantStackEntry { + argType = ccTypeFromUnion(argType) } + sb.WriteString(ccWrapInOptional(argType, "")) + continue + } - target = fmt.Sprintf("rhs[%v]", index+args.Delta) - if ch == '@' { - prop = "sym.location" + // The referenced symbol is present in the expanded rule. 
+ var replacement string + target := fmt.Sprintf("rhs[%v]", index+args.Delta) + if ch == '@' { + replacement = target + ".sym.location" + argType = "decltype(lhs.sym.location)" + } else { + if argType == "" { + return "", status.Errorf(origin, "%c%q does not have an associated type", ch, val) + } + if variantStackEntry { + replacement = "std::get<" + argType + ">(" + target + ".value)" } else { - t := args.Types[index] - if t == "" { - return "", status.Errorf(origin, "%c%q does not have an associated type", ch, val) - } - if variantStackEntry { - prop = "std::get<" + t + ">(" + target + ".value)" - target = "" - } else { - prop = "value." + lastID(t) - } + replacement = target + ".value." + lastID(argType) + argType = ccTypeFromUnion(argType) } } - if len(target) > 0 { - sb.WriteString(target) - sb.WriteByte('.') + + if argRef := args.ArgRefs[pos]; argRef.Optional { + // This symbol reference is optional in the original rule, so we wrap it inside a + // std::optional to unify the semantic actions for the expanded rules. 
+ replacement = ccWrapInOptional(argType, replacement) } - sb.WriteString(prop) + sb.WriteString(replacement) } return sb.String(), nil } @@ -442,18 +492,19 @@ func bisonParserAction(s string, args *grammar.ActionVars, origin status.SourceN } var index int + var pos int switch id { case "left()", "leftRaw()": index = -2 case "first()": - if len(args.Types) == 0 { + if args.SymRefCount == 0 { index = -1 } case "last()": - if len(args.Types) == 0 { + if args.SymRefCount == 0 { index = -1 } else { - index = len(args.Types) - 1 + index = args.SymRefCount - 1 } default: if strings.HasPrefix(id, "self[") && strings.HasSuffix(id, "]") { @@ -463,11 +514,19 @@ func bisonParserAction(s string, args *grammar.ActionVars, origin status.SourceN } } - var ok bool - index, ok = args.Resolve(id) + ref, ok := args.Resolve(id) if !ok { return "", status.Errorf(origin, "invalid reference %q", id) } + index = ref.Index + pos = ref.Pos + } + + if pos == 0 && index >= 0 { + pos = indexToPos(index, args.Remap) + if pos == 0 { + return "", status.Errorf(origin, "internal error: cannot find the position for index %v", index) + } } if index == -1 { @@ -484,9 +543,9 @@ func bisonParserAction(s string, args *grammar.ActionVars, origin status.SourceN case index < 0 && args.LHSType != "" && id != "leftRaw()": needsParen = true fmt.Fprintf(&sb, "(/*%v*/", args.LHSType) - case index >= 0 && args.Types[index] != "": + case index >= 0 && args.Types[pos] != "": needsParen = true - fmt.Fprintf(&sb, "(/*%v*/", args.Types[index]) + fmt.Fprintf(&sb, "(/*%v*/", args.Types[pos]) } sb.WriteByte('$') } else { diff --git a/gen/funcs_test.go b/gen/funcs_test.go index 3d1e916a4..6e7104d7f 100644 --- a/gen/funcs_test.go +++ b/gen/funcs_test.go @@ -165,7 +165,8 @@ func TestParserAction(t *testing.T) { {"$a + ${last()}", vars("a:0", "b", "c:1", "d"), "stack[len(stack)-2].value + stack[len(stack)-1].value"}, {"${first()} + ${left()}", vars("a:0", "b", "c:1", "d"), "stack[len(stack)-2].value + lhs.value"}, - 
{"${first()} + ${left()}", vars("a:1:bar", "b", "c", "d"), "nn0, _ := stack[len(stack)-1].value.(bar)\nnn0 + lhs.value"}, + {"${first()} + ${left()}", vars("a:0:bar", "b", "c", "d"), "nn0, _ := stack[len(stack)-1].value.(bar)\nnn0 + lhs.value"}, + {"${first()} + ${left()} + $a", vars("a:0:bar", "b", "c", "d"), "nn0, _ := stack[len(stack)-1].value.(bar)\nnn0 + lhs.value + nn0"}, {"${left().sym}", vars("a:0", "b", "c:1", "d:2"), "(&lhs.sym)"}, {"${left().offset}", vars("a:0", "b", "c:1", "d:2"), "lhs.sym.offset"}, @@ -194,10 +195,11 @@ func TestCcParserAction(t *testing.T) { want string useVariant bool }{ - {"abc", varsOneBased(), "abc", false}, - {"$$ = $1", varsOneBased("%node", "a:0:expr"), "lhs.value.node = rhs[0].value.expr", false}, - {"$$ = @$ @1", varsOneBased("%node", "a:0:expr"), "lhs.value.node = lhs.sym.location rhs[0].sym.location", false}, - {"$$ = $1", varsOneBased("%node", "a:0:expr"), "std::get(lhs.value) = std::get(rhs[0].value)", true}, + {"abc", vars(), "abc", false}, + // The 1-based index for "a" is 2. + {"$$ = $2", vars("%node", "a:0:expr"), "lhs.value.node = rhs[0].value.expr", false}, + {"$$ = @$ @2", vars("%node", "a:0:expr"), "lhs.value.node = lhs.sym.location rhs[0].sym.location", false}, + {"$$ = $2", vars("%node", "a:0:expr"), "std::get(lhs.value) = std::get(rhs[0].value)", true}, } for _, tc := range tests { @@ -212,45 +214,43 @@ func TestCcParserAction(t *testing.T) { } } -func varsOneBased(list ...string) *grammar.ActionVars { - return varsWithOffset(false, list...) -} - func vars(list ...string) *grammar.ActionVars { - return varsWithOffset(true, list...) 
-} - -func varsWithOffset(zeroBased bool, list ...string) *grammar.ActionVars { ret := &grammar.ActionVars{ CmdArgs: syntax.CmdArgs{ - MaxPos: 1 + len(list), - Names: make(map[string]int), + MaxPos: 1 + len(list), + Names: make(map[string]int), + ArgRefs: make(map[int]syntax.ArgRef), }, Remap: make(map[int]int), + Types: make(map[int]string), } for i, descr := range list { + pos := i + 1 + ret.CmdArgs.ArgRefs[pos] = syntax.ArgRef{Pos: pos} + ret.Types[pos] = "" + if strings.HasPrefix(descr, "%") { ret.LHSType = descr[1:] continue } name, num, mapped := strings.Cut(descr, ":") if name != "" { - ret.Names[name] = i + ret.Names[name] = pos + ret.ArgRefs[pos] = syntax.ArgRef{ + Pos: pos, + } } if !mapped { continue } + ret.SymRefCount++ num, tp, _ := strings.Cut(num, ":") - target, err := strconv.Atoi(num) + index, err := strconv.Atoi(num) if err != nil { log.Fatalf("cannot parse %q as a number in %q", num, descr) } - ret.Types = append(ret.Types, tp) - index := i - if !zeroBased { - index++ - } - ret.Remap[index] = target + ret.Types[pos] = tp + ret.Remap[pos] = index } return ret } diff --git a/gen/templates/cc_parser_cc.go.tmpl b/gen/templates/cc_parser_cc.go.tmpl index 3099a3bc5..880bbb8a5 100644 --- a/gen/templates/cc_parser_cc.go.tmpl +++ b/gen/templates/cc_parser_cc.go.tmpl @@ -721,8 +721,8 @@ absl::Status Parser::action{{$index}}([[maybe_unused]] stackEntry& lhs, {{ end -}} {{ end -}} -absl::Status Parser::applyRule(int32_t rule, stackEntry& lhs, - [[maybe_unused]] const stackEntry* rhs, +absl::Status Parser::applyRule(int32_t rule, int32_t ruleLen, stackEntry& lhs, + [[maybe_unused]] stackEntry* rhs, Lexer& lexer) { {{ if or .Parser.HasActions .Parser.Tables.Lookaheads -}} switch (rule) { @@ -768,6 +768,13 @@ absl::Status Parser::applyRule(int32_t rule, stackEntry& lhs, return absl::OkStatus(); {{ end -}} default: +{{ if .Parser.HasAssocValues -}} + if (ruleLen > 0) { + // If no semantic action is provided, and the rhs is not empty, we use the + // value of 
the first symbol on the RHS as the value of the lhs. + lhs.value = std::move(rhs[0].value); + } +{{ end -}} break; } {{ end -}} @@ -826,6 +833,12 @@ absl::Status Parser::Parse(int{{$stateType}}_t start, int{{$stateType}}_t end, end_state_ = end; {{- end}} fetchNext(lexer, stack); + // The location in this stackEntry will be used for any leading non-terminal + // symbols satisfied by %empty, so it needs to be initialized. We initialize + // it to the start location of the first token. + stack.back().sym.location = + Lexer::Location(lexer.LastTokenLocation(){{template "locStart"}}, + lexer.LastTokenLocation(){{template "locStart"}}); while (state != end) { int32_t action = tmAction[state]; @@ -860,22 +873,16 @@ absl::Status Parser::Parse(int{{$stateType}}_t start, int{{$stateType}}_t end, int32_t ln = tmRuleLen[rule]; stackEntry entry; entry.sym.symbol = tmRuleSymbol[rule]; - const stackEntry* rhs = &stack[0] + stack.size() - ln; + stackEntry* rhs = &stack[0] + stack.size() - ln; if (ln == 0) { entry.sym.location = Lexer::Location(stack.back().sym.location{{template "locEnd"}}, stack.back().sym.location{{template "locEnd"}}); -{{ if .Parser.HasAssocValues -}} - entry.value = stack.back().value; -{{ end -}} } else { entry.sym.location = {{template "CreateLocationFromRHS" . -}}(ln, [&](int32_t i) { return rhs[i].sym.location; }); -{{ if .Parser.HasAssocValues -}} - entry.value = rhs[0].value; -{{ end -}} } - absl::Status ret = applyRule(rule, entry, rhs, lexer{{if .NeedsSession}}, &s{{end}}); + absl::Status ret = applyRule(rule, ln, entry, rhs, lexer{{if .NeedsSession}}, &s{{end}}); if (!ret.ok()) { return ret; } @@ -1006,4 +1013,4 @@ absl::Status Parser::Parse(int{{$stateType}}_t start, int{{$stateType}}_t end, {{ else -}} {{ template "customReportNext" .
-}} {{ end -}} -{{ end -}} +{{ end -}} \ No newline at end of file diff --git a/gen/templates/cc_parser_h.go.tmpl b/gen/templates/cc_parser_h.go.tmpl index dbc3dafdc..a7e279bbb 100644 --- a/gen/templates/cc_parser_h.go.tmpl +++ b/gen/templates/cc_parser_h.go.tmpl @@ -32,11 +32,22 @@ struct symbol { {{end -}} {{ block "stackEntry" . -}} +{{ if .Options.VariantStackEntry -}} +{{ range .Parser.UnionFields -}} +static_assert(std::is_default_constructible_v<{{.}}>, + "Symbol associated value type {{.}} is not default constructible."); +{{ end -}} +{{ end -}} + {{$stateType := bits_per_element .Parser.Tables.FromTo -}} struct stackEntry { symbol sym; int{{$stateType}}_t state = 0; -{{ if .Parser.HasAssocValues -}} +{{ if .UnionDefinition -}} + union +{{ .UnionDefinition -}} + value; +{{ else if .Parser.HasAssocValues -}} {{ if .Options.VariantStackEntry -}} std::variant< {{ range .Parser.UnionFields -}} @@ -162,8 +173,8 @@ class Parser final { {{ end -}} {{ end -}} - absl::Status applyRule(int32_t rule, stackEntry& lhs, - [[maybe_unused]] const stackEntry* rhs, + absl::Status applyRule(int32_t rule, int32_t ruleLen, stackEntry& lhs, + [[maybe_unused]] stackEntry* rhs, Lexer& lexer); absl::Status Parse( int{{$stateType}}_t start, int{{$stateType}}_t end, Lexer& lexer); @@ -199,4 +210,4 @@ class Parser final { } // namespace {{.Options.Namespace}} -#endif // {{.Options.IncludeGuardPrefix}}PARSER_H_ +#endif // {{.Options.IncludeGuardPrefix}}PARSER_H_ \ No newline at end of file diff --git a/grammar/grammar.go b/grammar/grammar.go index 9a0ae2092..94ce34c9d 100644 --- a/grammar/grammar.go +++ b/grammar/grammar.go @@ -53,6 +53,7 @@ type Grammar struct { Parser *Parser CustomTemplates string + UnionDefinition string } // Range marks the portion of a rule that needs to be reported. @@ -78,34 +79,82 @@ type SemanticAction struct { type ActionVars struct { syntax.CmdArgs - // Types of the references of the rule. 
- Types []string + // position -> type of the references of the original rule. + // + // Note: types are indexed by position rather than index to support getting types of references + // that are not present in the current expansion of the rule. + Types map[int]string LHSType string // Not every symbol reference is present in the desugared rule. Remap map[int]int + + // Number of RHS symbols in the expanded rule. + SymRefCount int +} + +// Reference is a symbol reference in a semantic action. +type Reference struct { + // Position of the reference in the original rule. 1-based. Used to identify the symbol in + // semantic actions code blocks. + Pos int + + // Index of the symbol in the expanded rule. 0 based. Used to identify the symbol in the TM + // compiler. + // + // -1 means that the reference is present in the original rule but not in this expanded rule. + Index int +} + +// Resolve resolves the symbol reference `val` to its position and RHS index. `val` can either be a +// 0-based index (e.g. "0" in "$0") or a named symbol (e.g. "a" in "$a"). +// +// Returns false if `val` is not a valid symbol in the original rule, e.g. using "$a" in "start: b". +// +// Returns Index -1 if `val` is a valid symbol in the original rule but does not show up in the +// expanded rule. For example, a: b? expands into two rules: +// +// a: b +// | %empty +// +// For the %empty rule, Resolve("b") returns a Reference with Index -1. +func (a *ActionVars) Resolve(val string) (Reference, bool) { + return a.resolve(val /*zeroBased=*/, true) } -// Resolve resolves "val" to an RHS index for the current rule. -func (a *ActionVars) Resolve(val string) (int, bool) { - pos, ok := a.CmdArgs.Names[val] - if !ok { - var err error - pos, err = strconv.Atoi(val) - if err != nil { - return 0, false +// ResolveOneBased is similar to Resolve, except that `val` is 1-based if it is a number.
+func (a *ActionVars) ResolveOneBased(val string) (Reference, bool) { + return a.resolve(val /*zeroBased=*/, false) +} + +func (a *ActionVars) resolve(val string, zeroBased bool) (Reference, bool) { + // `pos` is always 1-based. + pos, err := strconv.Atoi(val) + if err == nil { + // The input "val" is a number reference, e.g. $1. + if zeroBased { + // The input reference starts from 0, e.g. $0 references the first symbol. Change it to + // 1-based. + pos++ } - pos++ // "val" is 0-based, while positions are 1-based. if pos < 1 || pos >= a.CmdArgs.MaxPos { // Index out of range. - return 0, false + return Reference{}, false + } + } else { + // The input "val" is a named symbol reference, e.g. $a. + var exists bool + pos, exists = a.CmdArgs.Names[val] + if !exists { + // No such symbol exists in the original rule. + return Reference{}, false } } - ret, ok := a.Remap[pos] - if !ok { - ret = -1 + idx, exists := a.Remap[pos] + if !exists { + idx = -1 } - return ret, true + return Reference{Index: idx, Pos: pos}, true } // String is used as a digest of a semantic action environment (and also as a debug string). @@ -201,6 +250,8 @@ type Options struct { MaxLookahead int // If set, all lookaheads expressions will be validated to fit this limit. OptInstantiationSuffix string // Suffix that triggers auto-instantiation optional nonterminals (e.g. "opt" or "_opt"). + DisableSyntax []string // Lists grammar syntaxes that should be disabled. + // AST generation. Go-specific for now. TokenStream bool EventBased bool @@ -224,4 +275,4 @@ type Options struct { DirIncludePrefix string // for generated headers ParseParams []string // parser fields initialized in the constructor VariantStackEntry bool // whether to generate a std::variant stackEntry rather than a union. Default false.
-} +} \ No newline at end of file diff --git a/syntax/expand.go b/syntax/expand.go index d26f3587e..d9e07d89f 100644 --- a/syntax/expand.go +++ b/syntax/expand.go @@ -10,6 +10,111 @@ import ( "github.com/inspirer/textmapper/util/ident" ) +// updateArgRefs updates the ArgRefs of `e` to include the new nonterminals in `newNts`. +// +// When `e.ArgRefs` was created, we did not have the non-terminals that TextMapper creates for +// Lists yet. We fill in the missing non-terminals once they are created by calling this function. +func updateArgRefs(m *Model, newNts map[int]int, e *Expr) { + if cmdArgs := e.CmdArgs; cmdArgs != nil { + for pos, sym := range newNts { + copied, exists := cmdArgs.ArgRefs[pos] + if !exists { + // The ArgRefs of mid rules do not include the terminals after it. + continue + } + copied.Symbol = sym + cmdArgs.ArgRefs[pos] = copied + } + return + } + for _, sub := range e.Sub { + updateArgRefs(m, newNts, sub) + } +} + +// ExpandOptions contains the options for the Expand function. +type ExpandOptions struct { + // OptionalType returns the type of an optional symbol s? or s_opt, where `t` is the type of the + // symbol s. + OptionalType func(t string) string + + // OptionalCmd returns the command to calculate the semantic value of an optional symbol s_opt, + // where `t` is the type of the symbol s. + OptionalCmd func(t string) string + + // ListType returns the type of the list symbol s* or s+, where `t` is the type of the element + // symbol s. + ListType func(t string) string + + // NewList returns the command to create a new list of the given type. + NewList func(elemType string, elemPos int, listFlags ListFlags) string + + // Append returns the command to append an element to a list. + Append func(elemPos, listPos int, listFlags ListFlags) string + + // DefaultValue returns the default value of the given type `t`. + DefaultValue func(t string) string +} + +// CcExpandOptions returns the ExpandOptions for generating C++ semantic actions.
+func CcExpandOptions() *ExpandOptions { + return &ExpandOptions{ + OptionalType: func(t string) string { + if t == "" { + return "" + } + return "std::optional<" + t + ">" + }, + OptionalCmd: func(t string) string { + // For cc the semantic action does not need the input type `t`. + // + // TODO: This involves copying the rhs value when constructing the std::optional. Example + // generated code: + // + // ```cc + // lhs.value = std::optional(std::get(rhs[0].value)); + // ``` + // + // If this turns out to be a performance bottleneck, we should find a way to use move when + // constructing the std::optional. + return fmt.Sprintf(`{ $$ = $1; }`) + }, + ListType: func(t string) string { + if t == "" { + return "" + } + return "std::vector<" + t + ">" + }, + NewList: func(elemType string, elemPos int, listFlags ListFlags) string { + if listFlags&OneOrMore != 0 { + return fmt.Sprintf(` + auto& elem = $%v; + auto& mutable_elem = const_cast::type>::type&>(elem); + $$ = std::vector<%v>{std::move(mutable_elem)}; + `, elemPos, elemType) + } + return fmt.Sprintf(`$$ = std::vector<%v>{};`, elemType) + }, + Append: func(elemPos, listPos int, listFlags ListFlags) string { + return fmt.Sprintf(`{ + auto& list = $%v; + auto& elem = $%v; + auto& mutable_list = const_cast::type>::type&>(list); + auto& mutable_elem = const_cast::type>::type&>(elem); + auto new_list = std::move(mutable_list); + new_list.push_back(std::move(mutable_elem)); + $$ = std::move(new_list); + }`, listPos, elemPos) + }, + DefaultValue: func(t string) string { + if t == "" { + return "" + } + return t + "{}" + }, + } +} + // Expand rewrites the grammar substituting extended notation clauses with equivalent // context-free production forms. Every nonterminal becomes a choice of sequences (production // rules), where each sequence can contain only StateMarker, Command, or Reference expressions. @@ -24,12 +129,13 @@ import ( // Note: for now it leaves Assign, Append, and Arrow expressions untouched. 
The first two can // contain references only. Arrow can contain a sub-sequence if it reports more than one // symbol reference. -func Expand(m *Model) error { +func Expand(m *Model, opts *ExpandOptions) error { e := &expander{ Model: m, m: make(map[string]int), perm: make([]int, len(m.Nonterms)), reuse: make([]int, 0, 16), + opts: opts, } max := len(m.Nonterms) for i, nt := range m.Nonterms { @@ -68,23 +174,33 @@ func Expand(m *Model) error { for self, nt := range m.Nonterms { switch nt.Value.Kind { case Optional: - // Note: this case facilitates 0..* lists extraction. + // Note: this case facilitates 0..* lists extraction. All other optionals are handled by + // expandRule. + if nt.Value.Sub[0].Kind != Reference { + return status.Errorf(nt.Value.Origin, "internal error: expecting an optional reference, but got %+v", nt.Value.Sub[0]) + } + symbolType := getSymbolType(nt.Value.Sub[0], m) + subs := []*Expr{nt.Value.Sub[0], &Expr{Kind: Empty, Origin: nt.Value.Origin}} + // For the %empty rule, use an empty list as the semantic value. + if e.opts.DefaultValue != nil && symbolType != "" { + defaultVal := e.opts.DefaultValue(symbolType) + subs[1] = &Expr{Kind: Command, Name: "$$ = " + defaultVal + ";", Origin: nt.Value.Origin, CmdArgs: &CmdArgs{MaxPos: 1}} + } nt.Value = &Expr{ - Kind: Choice, - Sub: []*Expr{ - nt.Value.Sub[0], - {Kind: Empty, Origin: nt.Value.Origin}, - }, + Kind: Choice, + Sub: subs, Origin: nt.Value.Origin, } case List: // Note: at this point all lists either have at least one element or have no separators. 
- rr := nt.Value.ListFlags&RightRecursive != 0 - nonEmpty := nt.Value.ListFlags&OneOrMore != 0 + listFlags := nt.Value.ListFlags + rr := listFlags&RightRecursive != 0 + nonEmpty := listFlags&OneOrMore != 0 elem := nt.Value.Sub[0] origin := nt.Value.Origin rec := &Expr{Kind: Sequence, Origin: origin} - rec.Sub = append(rec.Sub, &Expr{Kind: Reference, Symbol: len(m.Terminals) + self, Model: m, Origin: origin}) + listRef := &Expr{Kind: Reference, Symbol: len(m.Terminals) + self, Model: m, Origin: origin} + rec.Sub = append(rec.Sub, listRef) if len(nt.Value.Sub) > 1 { if rr { rec = concat(origin, nt.Value.Sub[1], rec) @@ -96,7 +212,65 @@ func Expand(m *Model) error { Kind: Choice, Origin: origin, } - if elem.Kind == Choice { + // Automatic value propagation works for lists of references only (with and without + // separators). In every other sense this branch repeats the next one. + if elem.Kind == Reference { + // Add the recursion rule, e.g. `a_list: a_list a`. + var recursion []*Expr + if rr { + recursion = append(recursion, elem, rec) + } else { + recursion = append(recursion, rec, elem) + } + elemType := getSymbolType(elem, m) + if opts.Append != nil && elemType != "" { + // Assign a new Pos for the list reference itself so that its semantic value can be + // referenced. + // + // The position of the list reference only needs to be different from the element Pos, + // instead of having to match the order between the listRef and the elem. For example, + // consider the following rule: + // + // start: a+ {...} + // + // `elem.Pos` is 1. Assuming we generate left-recursion rules for a_list: + // + // a_list: a_list a + // + // listRef.Pos is 2 (elem.Pos + 1), even though the listRef "a_list" actually appears + // before the "a". 
This is ok because Pos is only used to identify the symbols (and thus + // only needs to be unique), and the only semantic action that uses `listRef.Pos` is + // generated by `opts.Append`, which accepts both elemPos and listPos as arguments. + listPos := elem.Pos + 1 + listRef.Pos = listPos + argRefs := map[int]ArgRef{ + elem.Pos: ArgRef{Pos: elem.Pos, Symbol: elem.Symbol}, + listPos: ArgRef{Pos: listPos, Symbol: listRef.Symbol}, + } + code := opts.Append(elem.Pos, listPos, listFlags) + cmdArgs := &CmdArgs{MaxPos: listPos + 1, ArgRefs: argRefs} + recursion = append(recursion, &Expr{Kind: Command, Name: code, Origin: origin, CmdArgs: cmdArgs}) + } + nt.Value.Sub = append(nt.Value.Sub, concat(origin, recursion...)) + + // Add the base rule, e.g. `a_list: a`. + var base []*Expr + switch { + case nonEmpty: + base = append(base, elem) + if opts.NewList != nil && elemType != "" { + argRefs := map[int]ArgRef{ + elem.Pos: ArgRef{Pos: elem.Pos, Symbol: elem.Symbol}, + } + base = append(base, &Expr{Kind: Command, Name: opts.NewList(elemType, elem.Pos, listFlags), Origin: origin, CmdArgs: &CmdArgs{MaxPos: elem.Pos + 1, ArgRefs: argRefs}}) + } + case opts.NewList != nil && elemType != "": + base = append(base, &Expr{Kind: Command, Name: opts.NewList(elemType, elem.Pos, listFlags), Origin: origin, CmdArgs: &CmdArgs{MaxPos: elem.Pos + 1}}) + default: + base = append(base, &Expr{Kind: Empty, Origin: origin}) + } + nt.Value.Sub = append(nt.Value.Sub, concat(origin, base...)) + } else if elem.Kind == Choice { if rr { nt.Value.Sub = append(nt.Value.Sub, multiConcat(origin, elem.Sub, []*Expr{rec})...) } else { @@ -135,6 +309,10 @@ type expander struct { start int // nonterminal, for sorting base int reuse []int + + createdNts map[int]int // The non-terminals created the current rule. Position -> Symbol + + opts *ExpandOptions // Target-language-specific options during expansion. 
} func (e *expander) sortTail() { @@ -163,7 +341,7 @@ func (e *expander) sortTail() { e.reuse = local // return for reuse } -func (e *expander) extractNonterm(expr *Expr) *Expr { +func (e *expander) extractNonterm(expr *Expr, nonTermType string) *Expr { name := ProvisionalName(expr, e.Model) if existing, ok := e.m[name]; ok && expr.Equal(e.Nonterms[existing].Value) { sym := len(e.Terminals) + existing @@ -191,6 +369,7 @@ func (e *expander) extractNonterm(expr *Expr) *Expr { Name: name, Value: expr, Origin: expr.Origin, + Type: nonTermType, } e.Nonterms = append(e.Nonterms, nt) e.extra++ @@ -198,7 +377,14 @@ func (e *expander) extractNonterm(expr *Expr) *Expr { return &Expr{Kind: Reference, Symbol: sym, Model: e.Model, Origin: expr.Origin} } -func (e *expander) expandRule(rule *Expr) []*Expr { +func (e *expander) expandRule(rule *Expr) (expanded []*Expr) { + e.createdNts = make(map[int]int) + defer func() { + for _, rule := range expanded { + updateArgRefs(e.Model, e.createdNts, rule) + } + }() + if rule.Kind == Prec { ret := e.expandExpr(rule.Sub[0]) for i, val := range ret { @@ -250,8 +436,11 @@ func (e *expander) expandExpr(expr *Expr) []*Expr { } return ret case Set, Lookahead: - ret := e.extractNonterm(expr) + ret := e.extractNonterm(expr, "" /*nonTermType*/) ret.Pos = expr.Pos + if expr.Kind == Set { + e.createdNts[ret.Pos] = ret.Symbol + } return []*Expr{ret} case List: out := &Expr{Kind: List, Origin: expr.Origin, ListFlags: expr.ListFlags} @@ -268,11 +457,34 @@ func (e *expander) expandExpr(expr *Expr) []*Expr { out.Sub = append(out.Sub, sep[0]) out.ListFlags |= OneOrMore } - ret := e.extractNonterm(out) + var listType string + // Calculate the list type for list of references. More complex structures, e.g. + // (a b)*, (a? b)+, (a?)* do not propagate the type automatically. 
+ if expr.Sub[0].Kind == Reference { + elemType := getSymbolType(expr.Sub[0], e.Model) + if e.opts.ListType != nil { + listType = e.opts.ListType(elemType) + } + } + ret := e.extractNonterm(out, listType) if expr.ListFlags&OneOrMore == 0 && out.ListFlags&OneOrMore != 0 { - ret = e.extractNonterm(&Expr{Kind: Optional, Sub: []*Expr{ret}, Origin: expr.Origin}) + // List structs like "(a separator ',')*"" generates the following two non-terminals: + // + // (1) a_separator_comma_listopt: a_separator_comma_list | %empty + // (2) a_separator_comma_list: a_separator_comma_list ',' a | a + // + // We assign `listType` to "a_separator_comma_listopt" instead of using + // `e.opts.OptionalType(listType)` so that empty lists share the same type, e.g. + // + // list_string {std::string} = (a separator ',')*[a_list] { + // // $a_list will be of type std::vector. For the %empty case the list will + // // be empty instead of std::optional>. + // $$ = absl::StrJoin($a_list, ", "); + // } + ret = e.extractNonterm(&Expr{Kind: Optional, Sub: []*Expr{ret}, Origin: expr.Origin}, listType) } ret.Pos = expr.Pos + e.createdNts[ret.Pos] = ret.Symbol return []*Expr{ret} } return []*Expr{expr} @@ -335,7 +547,7 @@ func ProvisionalName(expr *Expr, m *Model) string { switch expr.Kind { case Reference: if expr.Symbol < len(m.Terminals) { - return ident.Produce(m.Terminals[expr.Symbol], ident.CamelCase) + return ident.Produce(m.Terminals[expr.Symbol].Name, ident.CamelCase) } return m.Nonterms[expr.Symbol-len(m.Terminals)].Name case Optional: @@ -435,3 +647,10 @@ func appendSetName(ts *TokenSet, m *Model, out *strings.Builder) { log.Fatalf("cannot compute name for TokenSet Kind=%v", ts.Kind) } } + +func getSymbolType(expr *Expr, m *Model) string { + if expr.Symbol < len(m.Terminals) { + return m.Terminals[expr.Symbol].Type + } + return m.Nonterms[expr.Symbol-len(m.Terminals)].Type +} diff --git a/syntax/set_test.go b/syntax/set_test.go index 40b06410f..35b8d4b4e 100644 --- a/syntax/set_test.go +++ 
b/syntax/set_test.go @@ -99,7 +99,7 @@ func TestSets(t *testing.T) { t.Errorf("cannot parse %q: %v", tc.input, err) continue } - err = syntax.Expand(model) + err = syntax.Expand(model, &syntax.ExpandOptions{}) if err != nil { t.Errorf("cannot expand %q: %v", tc.input, err) continue diff --git a/syntax/syntax.go b/syntax/syntax.go index 1e80e7073..f5e0fd031 100644 --- a/syntax/syntax.go +++ b/syntax/syntax.go @@ -11,9 +11,31 @@ import ( "github.com/inspirer/textmapper/status" ) +// Terminal is a terminal symbol used in a grammar. +type Terminal struct { + Name string + Type string +} + +func (t *Terminal) String() string { + return t.Name + "(type = " + t.Type + ")" +} + +// ArgRef represents a reference to a symbol in semantic actions. +type ArgRef struct { + Pos int // The positional index of the symbol in the original rule. 1-based. Used for resolving the number references in semantic actions. + Optional bool // Whether the symbol reference is under an Optional or a Nested Choice. + Kind string // The kind of the symbol, e.g. reference, starQuantifier, etc. Used for debugging. + Symbol int // The symbol index in the grammar. +} + +func (p *ArgRef) String() string { + return fmt.Sprintf("%+v", *p) +} + // Model is a model of a language's syntax built on top of a set of terminals. type Model struct { - Terminals []string + Terminals []Terminal Params []Param Nonterms []*Nonterm // all params and nonterms must have distinct names Inputs []Input @@ -24,7 +46,7 @@ type Model struct { // Ref returns the string version of a symbol reference for debugging. 
func (m *Model) Ref(sym int, args []Arg) string { if sym < len(m.Terminals) { - return m.Terminals[sym] + return m.Terminals[sym].Name } nt := m.Nonterms[sym-len(m.Terminals)] if len(args) == 0 { @@ -82,6 +104,17 @@ func (m *Model) Rearrange(perm []int) { expr.Symbol = terms + perm[nt] } }) + m.ForEach(Command, func(_ *Nonterm, expr *Expr) { + if expr.CmdArgs == nil || expr.CmdArgs.ArgRefs == nil { + return + } + for pos, argRef := range expr.CmdArgs.ArgRefs { + if nt := argRef.Symbol - terms; nt >= 0 { + argRef.Symbol = terms + perm[nt] + expr.CmdArgs.ArgRefs[pos] = argRef + } + } + }) for _, set := range m.Sets { set.ForEach(func(ts *TokenSet) { if nt := ts.Symbol - terms; nt >= 0 { @@ -150,7 +183,7 @@ type Expr struct { Sub []*Expr Symbol int Args []Arg - Pos int // Positional index of a reference, set, or list in the original rule. + Pos int // Positional index of a reference, set, or list in the original rule. 1-based. Predicate *Predicate ListFlags ListFlags ArrowFlags []string @@ -160,6 +193,26 @@ type Expr struct { Model *Model // Kept for some kinds for debugging. TODO error-prone, get rid of } +// ForEach visits all the Exprs of a given kind under `expr`. If `kind` is -1, all Expr kinds are +// visited. +func (e *Expr) ForEach(kind ExprKind, consumer func(e *Expr)) { + seen := make(map[*Expr]bool) + var visit func(e *Expr) + visit = func(e *Expr) { + if seen[e] { + return + } + seen[e] = true + if e.Kind == kind || kind == -1 { + consumer(e) + } + for _, sub := range e.Sub { + visit(sub) + } + } + visit(e) +} + // Equal returns true for equivalent grammar clauses. 
func (e *Expr) Equal(oth *Expr) bool { if e.Kind != oth.Kind { @@ -222,7 +275,7 @@ func (e *Expr) String() string { case Prec: var sym string if e.Model != nil { - sym = e.Model.Terminals[e.Symbol] + sym = e.Model.Terminals[e.Symbol].Name } else { sym = strconv.Itoa(e.Symbol) } @@ -379,9 +432,10 @@ func (k ExprKind) GoString() string { // CmdArgs defines which RHS symbols are available inside a semantic action. type CmdArgs struct { - Names map[string]int - MaxPos int // exclusive, 1-based - Delta int // Added to the final position to adjust for extracted middle rule actions. + Names map[string]int // alias -> position + MaxPos int // exclusive, 1-based + Delta int // Added to the final position to adjust for extracted middle rule actions. + ArgRefs map[int]ArgRef // position -> ArgRef } // TokenSet is a grammar expression that resolves to a set of tokens. diff --git a/syntax/syntax_test.go b/syntax/syntax_test.go index ce3893ea5..51fde6856 100644 --- a/syntax/syntax_test.go +++ b/syntax/syntax_test.go @@ -130,14 +130,14 @@ var parserTests = []struct { want *syntax.Model }{ {`A: a; B:;`, &syntax.Model{ - Terminals: []string{"EOI", "a"}, + Terminals: []syntax.Terminal{{Name: "EOI"}, {Name: "a"}}, Nonterms: []*syntax.Nonterm{ {Name: "A", Value: &syntax.Expr{Kind: syntax.Reference, Symbol: 1}}, {Name: "B", Value: &syntax.Expr{Kind: syntax.Empty}}, }, }}, {`A: b=a;`, &syntax.Model{ - Terminals: []string{"EOI", "a"}, + Terminals: []syntax.Terminal{{Name: "EOI"}, {Name: "a"}}, Nonterms: []*syntax.Nonterm{ {Name: "A", Value: &syntax.Expr{Kind: syntax.Assign, Name: "b", Sub: []*syntax.Expr{ {Kind: syntax.Reference, Symbol: 1}, @@ -145,7 +145,7 @@ var parserTests = []struct { }, }}, {`A: a a -> foo;`, &syntax.Model{ - Terminals: []string{"EOI", "a"}, + Terminals: []syntax.Terminal{{Name: "EOI"}, {Name: "a"}}, Nonterms: []*syntax.Nonterm{ {Name: "A", Value: &syntax.Expr{Kind: syntax.Arrow, Name: "foo", Sub: []*syntax.Expr{ {Kind: syntax.Sequence, Sub: []*syntax.Expr{ @@ 
-156,7 +156,7 @@ var parserTests = []struct { }, }}, {`A: b c+; B: (A separator b)*?;`, &syntax.Model{ - Terminals: []string{"EOI", "b", "c"}, + Terminals: []syntax.Terminal{{Name: "EOI"}, {Name: "b"}, {Name: "c"}}, Nonterms: []*syntax.Nonterm{ { Name: "A", @@ -179,7 +179,7 @@ var parserTests = []struct { }, }}, {`%flag T; %lookahead flag V = true; A {foo}: a B; B:[T!=123];`, &syntax.Model{ - Terminals: []string{"EOI", "a"}, + Terminals: []syntax.Terminal{{Name: "EOI"}, {Name: "a"}}, Params: []syntax.Param{ {Name: "T"}, {Name: "V", DefaultValue: "true", Lookahead: true}, @@ -202,7 +202,7 @@ var parserTests = []struct { }, }}, {`%flag A; %flag B; input: [A==false && B || !A] a | b;`, &syntax.Model{ - Terminals: []string{"EOI", "a", "b"}, + Terminals: []syntax.Terminal{{Name: "EOI"}, {Name: "a"}, {Name: "b"}}, Params: []syntax.Param{ {Name: "A"}, {Name: "B"}, @@ -228,7 +228,7 @@ var parserTests = []struct { }, }}, {`A: set(a & B | c | ~first B & precede B & last P & follow P & ~Q); B: z; P:; Q:;`, &syntax.Model{ - Terminals: []string{"EOI", "a", "c", "z"}, + Terminals: []syntax.Terminal{{Name: "EOI"}, {Name: "a"}, {Name: "c"}, {Name: "z"}}, Nonterms: []*syntax.Nonterm{ {Name: "A", Value: &syntax.Expr{Kind: syntax.Set, SetIndex: 0}}, {Name: "B", Value: &syntax.Expr{Kind: syntax.Reference, Symbol: 3}}, @@ -258,7 +258,7 @@ var parserTests = []struct { }}, }}, {`A: (?= A) a;`, &syntax.Model{ - Terminals: []string{"EOI", "a"}, + Terminals: []syntax.Terminal{{Name: "EOI"}, {Name: "a"}}, Nonterms: []*syntax.Nonterm{ {Name: "A", Value: &syntax.Expr{Kind: syntax.Sequence, Sub: []*syntax.Expr{ {Kind: syntax.Lookahead, Sub: []*syntax.Expr{ @@ -269,7 +269,7 @@ var parserTests = []struct { }, }}, {`A: (?= P & !Q) a b; P: a; Q: b;`, &syntax.Model{ - Terminals: []string{"EOI", "a", "b"}, + Terminals: []syntax.Terminal{{Name: "EOI"}, {Name: "a"}, {Name: "b"}}, Nonterms: []*syntax.Nonterm{ {Name: "A", Value: &syntax.Expr{Kind: syntax.Sequence, Sub: []*syntax.Expr{ {Kind: 
syntax.Lookahead, Sub: []*syntax.Expr{ @@ -286,7 +286,7 @@ var parserTests = []struct { }, }}, {`%interface Q, P; A: a;`, &syntax.Model{ - Terminals: []string{"EOI", "a"}, + Terminals: []syntax.Terminal{{Name: "EOI"}, {Name: "a"}}, Nonterms: []*syntax.Nonterm{ {Name: "A", Value: &syntax.Expr{Kind: syntax.Reference, Symbol: 1}}, }, @@ -354,7 +354,7 @@ func initSymbols(input string, out *syntax.Model) error { var l tm.Lexer l.Init(input) seen := make(map[string]bool) - out.Terminals = []string{"EOI"} + out.Terminals = []syntax.Terminal{{Name: "EOI"}} out.Nonterms = nil var prev token.Type for tok := l.Next(); tok != token.EOI; tok = l.Next() { @@ -373,7 +373,7 @@ func initSymbols(input string, out *syntax.Model) error { if isTerm(l.Text()) { if !seen[l.Text()] { - out.Terminals = append(out.Terminals, l.Text()) + out.Terminals = append(out.Terminals, syntax.Terminal{Name: l.Text()}) } seen[l.Text()] = true } else { @@ -568,7 +568,7 @@ func (p *parser) parseTermRef() int { p.errorf("terminal reference is expected (found %q)", name) } for i, val := range p.out.Terminals { - if val == name { + if val.Name == name { return i } } @@ -892,4 +892,4 @@ func (p *parser) parseSetPrimary() *syntax.TokenSet { ret = &syntax.TokenSet{Kind: syntax.Complement, Sub: []*syntax.TokenSet{ret}, Origin: tilde} } return ret -} +} \ No newline at end of file From b699846f2b3130795cf5aedf745bfbd1f4e76ce2 Mon Sep 17 00:00:00 2001 From: Shannon Rae <166186361+secretlyshannon@users.noreply.github.com> Date: Mon, 28 Apr 2025 15:23:45 -0700 Subject: [PATCH 3/6] Update templates_test.go --- syntax/templates_test.go | 42 ++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/syntax/templates_test.go b/syntax/templates_test.go index 9d5339c68..f78110e94 100644 --- a/syntax/templates_test.go +++ b/syntax/templates_test.go @@ -9,6 +9,10 @@ import ( "github.com/inspirer/textmapper/util/dump" ) +func expand(m *syntax.Model) error { + return 
syntax.Expand(m, &syntax.ExpandOptions{}) +} + var modelTests = []struct { fnName string fn func(m *syntax.Model) error @@ -98,79 +102,79 @@ var modelTests = []struct { }, // Syntax sugar expansion. - {"Expand", syntax.Expand, + {"Expand", expand, `Z: a?;`, `Z: a | ;`, }, - {"Expand", syntax.Expand, + {"Expand", expand, `Z: a? | b?;`, `Z: a | | b ;`, }, - {"Expand", syntax.Expand, + {"Expand", expand, `Z: (a | b)?;`, `Z: a | b | ;`, }, - {"Expand", syntax.Expand, + {"Expand", expand, `Z: (a b?)?;`, `Z: a b | a | ;`, }, - {"Expand", syntax.Expand, + {"Expand", expand, `Z: (a b|b) (c|d);`, `Z: a b c | a b d | b c | b d ;`, }, - {"Expand", syntax.Expand, + {"Expand", expand, `Z: a? %prec b ;`, `Z: a %prec b | %prec b ;`, }, - {"Expand", syntax.Expand, + {"Expand", expand, `Z: a? -> A ;`, `Z: a -> A | -> A ;`, }, - {"Expand", syntax.Expand, + {"Expand", expand, `Z: a=a? ;`, `Z: a=a | ;`, }, - {"Expand", syntax.Expand, + {"Expand", expand, `Z: a? {Foo} -> A ;`, `Z: a {Foo} -> A | {Foo} -> A ;`, }, - {"Expand", syntax.Expand, + {"Expand", expand, `Z: a+ | q ;`, `A_list: A_list a | a; Z: A_list | q ;`, }, - {"Expand", syntax.Expand, + {"Expand", expand, `Z: a* | q ;`, `A_optlist: A_optlist a | ; Z: A_optlist | q ;`, }, - {"Expand", syntax.Expand, + {"Expand", expand, `Z: b | (a separator b)+ ;`, `A_list_B_separated: A_list_B_separated b a | a; Z: b | A_list_B_separated ;`, }, - {"Expand", syntax.Expand, + {"Expand", expand, `Z: b | (a separator b)* ;`, `A_list_B_separated: A_list_B_separated b a | a; A_list_B_separatedopt: A_list_B_separated | ; Z: b | A_list_B_separatedopt ;`, }, - {"Expand", syntax.Expand, + {"Expand", expand, `Z: set(a | ~b);`, `Z: set(a | ~b);`, // top level sets are not expanded }, - {"Expand", syntax.Expand, + {"Expand", expand, `Z: a b set(a | ~b) | c ;`, `Z: a b setof_a_or_not_b | c ; setof_a_or_not_b: set(a | ~b) ;`, }, - {"Expand", syntax.Expand, + {"Expand", expand, `Z: (?= A); A:a|b;`, `Z: (?= A); A:a|b;`, // top level lookaheads are not 
expanded }, - {"Expand", syntax.Expand, + {"Expand", expand, `Z: a (?= A & !B) b | c; A: a|b; B: a|b;`, `Z: a lookahead_A_notB b | c; lookahead_A_notB: (?= A & !B); A: a|b; B: a|b;`, }, - {"Expand", syntax.Expand, + {"Expand", expand, `Z: A+ | C+ | B+; A: a|x; B: b|y; C: c|z;`, // sorting test `A_list: A_list A | A; B_list: B_list B | B; C_list: C_list C | C; Z: A_list | C_list | B_list; A: a|x; B: b|y; C: c|z;`, }, - {"Expand", syntax.Expand, + {"Expand", expand, `%input X; X: B+ | Y+ | A+; A: a|x; B: b|y; Y: c|z;`, // sorting test #2 `%input X; A_list: A_list A | A; B_list: B_list B | B; X: B_list | Y_list | A_list; Y_list: Y_list Y | Y; A: a|x; B: b|y; Y: c|z;`, }, From fa221081c6364c5f69bc86bd8ffc089ebb3a7fc9 Mon Sep 17 00:00:00 2001 From: Shannon Rae <166186361+secretlyshannon@users.noreply.github.com> Date: Mon, 28 Apr 2025 15:26:04 -0700 Subject: [PATCH 4/6] Update imports in compiler_test.go --- compiler/compiler_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler/compiler_test.go b/compiler/compiler_test.go index 301eef1d3..5a3903738 100644 --- a/compiler/compiler_test.go +++ b/compiler/compiler_test.go @@ -8,7 +8,7 @@ import ( "strings" "testing" - "github.com/inspirer/textmapper/grammar/grammar" + "github.com/inspirer/textmapper/grammar" "github.com/inspirer/textmapper/parsers/parsertest" "github.com/inspirer/textmapper/parsers/tm" "github.com/inspirer/textmapper/parsers/tm/ast" From 0ae2c10be188680dbea3d64d76ad14e8bc8200b6 Mon Sep 17 00:00:00 2001 From: Shannon Rae <166186361+secretlyshannon@users.noreply.github.com> Date: Mon, 28 Apr 2025 15:28:15 -0700 Subject: [PATCH 5/6] Add import of "sort" to compiler_test.go. 
--- compiler/compiler_test.go | 1 + 1 file changed, 1 insertion(+) diff --git a/compiler/compiler_test.go b/compiler/compiler_test.go index 5a3903738..185a4a82a 100644 --- a/compiler/compiler_test.go +++ b/compiler/compiler_test.go @@ -5,6 +5,7 @@ import ( "fmt" "os" "path/filepath" + "sort" "strings" "testing" From d1c4fba9cd5ed62226cfafdce96338643a0aec6b Mon Sep 17 00:00:00 2001 From: Shannon Rae <166186361+secretlyshannon@users.noreply.github.com> Date: Mon, 28 Apr 2025 15:31:30 -0700 Subject: [PATCH 6/6] Add disabled_syntax.tmerr --- compiler/testdata/disabled_syntax.tmerr | 53 +++++++++++++++++++++++++ 1 file changed, 53 insertions(+) create mode 100644 compiler/testdata/disabled_syntax.tmerr diff --git a/compiler/testdata/disabled_syntax.tmerr b/compiler/testdata/disabled_syntax.tmerr new file mode 100644 index 000000000..b1df7bdaa --- /dev/null +++ b/compiler/testdata/disabled_syntax.tmerr @@ -0,0 +1,53 @@ +language parser(go); + +disableSyntax = ["Lookahead", "Arrow", "Templates", "NestedChoice"] + +:: lexer + +a: /a/ +b: /b/ +c: /c/ +d: /d/ + +:: parser + +input: A1 B1 C1 D1 E1 F1; + +A1: «(?= laA)» a; +# err: syntax Lookahead is not supported + +laA: a b c d; + +B1: «(?= laB)» b; +# err: syntax Lookahead is not supported + +laB: a b d; + +C1: «(?= laC)» c; +# err: syntax Lookahead is not supported + +laC: laA | laB ; + +# Note: reusing laC again. + +D1: «(?= laC)» d; +# err: syntax Lookahead is not supported + +E1: «(?= laE)» d; +# err: syntax Lookahead is not supported + +laE: a+ b; + +F1: «(?= laF)» d; +# err: syntax Lookahead is not supported + +laF «-> Thing»: laE b; +# err: syntax Arrow is not supported + +%flag T; + +«g»: F1; +# err: templates are not supported + +h : F1 | (F1 F1) | «(F1 | F1 F1)»; +# err: parenthesized Choice operator is not supported