
Commit 49ba5e3

Export an array of all tokens from ct_token_map
This helps with writing structured input adapters for fuzzing. When fuzzing the parser specifically (as opposed to fuzzing the lexer and parser at the same time), we want to supply the parser with an array of valid lexemes. This export lets us build such an array without manually listing every token in the fuzzing entry point. Note that I didn't implement this functionality for generated lexers because there is already a way to get all tokens via `mod_l::lexerdef().iter_rules()`.
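A minimal sketch of such a fuzzing entry point, assuming a libfuzzer-sys (cargo-fuzz) harness, that `ct_token_map` was called with the module name `token_map`, that `StorageT` is `u32`, and that `parse_toks` is a hypothetical wrapper around the parser under test:

```rust
// Hypothetical fuzz target. The module name `token_map`, the storage
// type `u32`, and the `parse_toks` entry point are all assumptions.
use libfuzzer_sys::fuzz_target;

fuzz_target!(|data: &[u8]| {
    if token_map::TOK_IDS.is_empty() {
        return; // avoid a modulo-by-zero on a degenerate grammar
    }
    // Map each fuzzer-supplied byte onto a valid token ID so the parser
    // is exercised with well-formed lexemes rather than raw bytes.
    let toks: Vec<u32> = data
        .iter()
        .map(|b| token_map::TOK_IDS[usize::from(*b) % token_map::TOK_IDS.len()])
        .collect();
    let _ = parse_toks(&toks);
});
```

Because `TOK_IDS` is generated at build time, a harness like this keeps working unchanged when tokens are added to or removed from the grammar.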
1 parent d307a07 commit 49ba5e3

File tree

1 file changed (+15, −10)


lrlex/src/lib/ctbuilder.rs

Lines changed: 15 additions & 10 deletions
```diff
@@ -1215,31 +1215,36 @@ pub fn ct_token_map<StorageT: Display + ToTokens>(
     let timestamp = env!("VERGEN_BUILD_TIMESTAMP");
     let mod_ident = format_ident!("{}", mod_name);
     write!(outs, "// lrlex build time: {}\n\n", quote!(#timestamp),).ok();
+    let storaget = str::parse::<TokenStream>(type_name::<StorageT>()).unwrap();
     // Sort the tokens so that they're always in the same order.
     // This will prevent unneeded rebuilds.
     let mut token_map_sorted = Vec::from_iter(token_map.borrow().iter());
     token_map_sorted.sort_by_key(|(k, _)| *k);
-    let tokens = &token_map_sorted
-        .into_iter()
+    let (token_array, tokens): (TokenStream, TokenStream) = token_map_sorted
+        .iter()
         .map(|(k, id)| {
             let name = match rename_map {
                 Some(rmap) => *rmap.get(k.as_str()).unwrap_or(&k.as_str()),
-                _ => k,
+                _ => &k,
             };
             let tok_ident = format_ident!("T_{}", name.to_ascii_uppercase());
-            let storaget = str::parse::<TokenStream>(type_name::<StorageT>()).unwrap();
-            // Code gen for the constant token values.
-            quote! {
-                pub const #tok_ident: #storaget = #id;
-            }
+            (
+                quote! {
+                    #tok_ident,
+                },
+                quote! {
+                    pub const #tok_ident: #storaget = #id;
+                },
+            )
         })
-        .collect::<Vec<_>>();
+        .unzip();
     // Since the formatter doesn't preserve comments and we don't want to lose build time,
     // just format the module contents.
     let unformatted = quote! {
         mod #mod_ident {
             #![allow(dead_code)]
-            #(#tokens)*
+            #tokens
+            pub const TOK_IDS: &[#storaget] = &[#token_array];
         }
     }
     .to_string();
```
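For illustration, given a grammar with tokens `INT` and `PLUS` and `StorageT = u32` (the token names and storage type are assumptions), the module emitted by the code above would expand to roughly:

```rust
mod token_map {
    #![allow(dead_code)]
    pub const T_INT: u32 = 0;
    pub const T_PLUS: u32 = 1;
    // New in this commit: every token ID collected into one array.
    pub const TOK_IDS: &[u32] = &[T_INT, T_PLUS];
}
```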
