@@ -14,6 +14,7 @@
 from typing_extensions import override
 
 from fairseq2.data import (
+    CollateOptionsOverride,
     Collater,
     DataPipeline,
     DataPipelineBuilder,
@@ -77,28 +78,24 @@ class ParallelTextDataset(ABC):
     def create_reader(
         self,
         split: str,
-        tokenizer: TextTokenizer,
+        source_tokenizer: TextTokenizer,
+        target_tokenizer: TextTokenizer,
         gang: Gang,
         min_seq_len: int,
         max_seq_len: int,
         options: ParallelTextReadOptions | None = None,
     ) -> DataReader[Seq2SeqBatch]:
         """Create a dataset reader.
 
-        :param split:
-            The split to read.
-        :param tokenizer:
-            The tokenizer to encode text.
-        :param gang:
-            The gang over which to shard the dataset.
-        :param min_seq_len:
-            The minimum sequence length of each example. Examples shorter than
-            this value will be dropped.
-        :param max_seq_len:
-            The maximum sequence length of each example. Examples longer than
-            this value will be dropped.
-        :param options:
-            The read options.
+        :param split: The split to read.
+        :param source_tokenizer: The tokenizer to encode source text.
+        :param target_tokenizer: The tokenizer to encode target text.
+        :param gang: The gang over which to shard the dataset.
+        :param min_seq_len: The minimum sequence length of each example.
+            Examples shorter than this value will be dropped.
+        :param max_seq_len: The maximum sequence length of each example.
+            Examples longer than this value will be dropped.
+        :param options: The read options.
         """
 
     @abstractmethod
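For context, a minimal call-site sketch of the new dual-tokenizer signature. Everything here is illustrative: `dataset`, `src_tokenizer`, `tgt_tokenizer`, and `gang` are assumed to be constructed elsewhere and are not part of this change.

```python
# Hypothetical call site; `dataset`, `src_tokenizer`, `tgt_tokenizer`, and
# `gang` are assumed to be constructed elsewhere.
reader = dataset.create_reader(
    split="train",
    source_tokenizer=src_tokenizer,
    target_tokenizer=tgt_tokenizer,
    gang=gang,
    min_seq_len=1,
    max_seq_len=512,
)

# A `DataReader` yields a list of `Seq2SeqBatch` objects per step.
for batches in reader:
    for batch in batches:
        ...
```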
@@ -171,7 +168,7 @@ def from_path(cls, path: Path, name: str) -> GenericParallelTextDataset:
         manifest_file = path.joinpath(split).joinpath("MANIFEST")
 
         try:
-            with manifest_file.open() as fp:
+            with manifest_file.open(encoding="utf-8") as fp:
                 content = list(fp)
         except OSError as ex:
             raise DatasetLoadError(
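The `encoding="utf-8"` fix matters because `Path.open()` without an explicit encoding falls back to `locale.getpreferredencoding(False)`, so a UTF-8 MANIFEST could fail to decode on machines with a non-UTF-8 locale. A standalone sketch (the path is illustrative):

```python
from pathlib import Path

manifest_file = Path("train/MANIFEST")  # illustrative path

# Pin the encoding so reads behave identically across platforms instead of
# depending on the machine's locale default.
with manifest_file.open(encoding="utf-8") as fp:
    content = list(fp)
```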
@@ -252,7 +249,8 @@ def value_error() -> ValueError:
     def create_reader(
         self,
         split: str,
-        tokenizer: TextTokenizer,
+        source_tokenizer: TextTokenizer,
+        target_tokenizer: TextTokenizer,
         gang: Gang,
         min_seq_len: int,
         max_seq_len: int,
@@ -289,11 +287,11 @@ def create_reader(
         if direction.origin:
             source_mode = f"{source_mode}_{direction.origin}"
 
-        source_encoder = tokenizer.create_encoder(
+        source_encoder = source_tokenizer.create_encoder(
            task="translation", lang=direction.source_lang, mode=source_mode
         )
 
-        target_encoder = tokenizer.create_encoder(
+        target_encoder = target_tokenizer.create_encoder(
             task="translation", lang=direction.target_lang, mode="target"
         )
 
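As a sketch of what this change enables, the two sides can now come from entirely different tokenizers. The language codes, sentences, and mode strings below are placeholder assumptions; the tokenizers are presumed to support the `"translation"` task (e.g. NLLB-style tokenizers).

```python
# Illustrative only: `source_tokenizer` and `target_tokenizer` are assumed
# to be loaded elsewhere; language codes are placeholders.
source_encoder = source_tokenizer.create_encoder(
    task="translation", lang="eng_Latn", mode="source"
)
target_encoder = target_tokenizer.create_encoder(
    task="translation", lang="fra_Latn", mode="target"
)

source_indices = source_encoder("Hello, world!")
target_indices = target_encoder("Bonjour le monde !")
```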
@@ -384,7 +382,18 @@ def skip(example: dict[str, Any]) -> bool:
         seed += 1
 
         # Collate bucketed examples into a batch.
-        collater = Collater(pad_value=tokenizer.vocab_info.pad_idx)
+        collater = Collater(
+            overrides=[
+                CollateOptionsOverride(
+                    selector="source_indices",
+                    pad_value=source_tokenizer.vocab_info.pad_idx,
+                ),
+                CollateOptionsOverride(
+                    selector="target_indices",
+                    pad_value=target_tokenizer.vocab_info.pad_idx,
+                ),
+            ]
+        )
 
         builder.map(collater, num_parallel_calls=npc)
 
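A self-contained sketch of what the per-selector overrides buy. Previously a single `pad_value` was shared by both sides; with separate tokenizers the two pad indices can differ, so each field gets its own. The pad values and toy tensors below are made up for illustration.

```python
import torch

from fairseq2.data import CollateOptionsOverride, Collater

# Toy pad indices standing in for the two tokenizers' vocab_info.pad_idx.
collater = Collater(
    overrides=[
        CollateOptionsOverride(selector="source_indices", pad_value=0),
        CollateOptionsOverride(selector="target_indices", pad_value=3),
    ]
)

batch = collater(
    [
        {"source_indices": torch.tensor([4, 5, 6]), "target_indices": torch.tensor([7])},
        {"source_indices": torch.tensor([8]), "target_indices": torch.tensor([9, 10])},
    ]
)

# Each ragged field is padded with its own pad value, e.g.:
#   batch["source_indices"]["seqs"] -> [[4, 5, 6], [8, 0, 0]]
#   batch["target_indices"]["seqs"] -> [[7, 3], [9, 10]]
```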