From c332b368e70a2af1417ab9fc5cfb2e8cd0a4d4dd Mon Sep 17 00:00:00 2001 From: Dennis Wei Date: Fri, 12 Sep 2025 21:41:15 -0700 Subject: [PATCH 1/5] allow for more complex mappings between input units and parts of chat template Signed-off-by: Dennis Wei --- icx360/utils/model_wrappers/vllm.py | 87 ++++++++++++++++++++++++----- icx360/utils/scalarizers/prob.py | 8 ++- 2 files changed, 78 insertions(+), 17 deletions(-) diff --git a/icx360/utils/model_wrappers/vllm.py b/icx360/utils/model_wrappers/vllm.py index 73ef0cc..0e5db7f 100644 --- a/icx360/utils/model_wrappers/vllm.py +++ b/icx360/utils/model_wrappers/vllm.py @@ -36,7 +36,7 @@ def __init__(self, model, model_name, tokenizer=None): self._model_name = model_name self._tokenizer = tokenizer - def convert_input(self, inputs, chat_template=False, system_prompt=None, **kwargs): + def convert_input(self, inputs, chat_template=False, system_prompt=None, unit_ranges=None, **kwargs): """ Convert input(s) into a list of strings. @@ -47,6 +47,8 @@ def convert_input(self, inputs, chat_template=False, system_prompt=None, **kwarg Whether to apply chat template. system_prompt (str or None): System prompt to include in chat template. + unit_ranges (dict or None): + Mapping from chat template parts to ranges of input units. Returns: inputs (List[str]): @@ -55,30 +57,83 @@ def convert_input(self, inputs, chat_template=False, system_prompt=None, **kwarg if isinstance(inputs, str): # Single input text, convert to list inputs = [inputs] - elif isinstance(inputs, list): - if isinstance(inputs[0], list): - # Join segmented texts - inputs = ["".join(inp) for inp in inputs] - else: + elif not isinstance(inputs, list): raise TypeError("Inputs must be a string or list for VLLMModel") if chat_template: if self._tokenizer is None: raise TypeError("HuggingFace tokenizer must be provided to apply chat template") - # Construct chat messages - if system_prompt is not None: - messages = [[{"role": "system", "content": system_prompt}, - {"role": "user", "content": inp}] for inp in inputs] + if isinstance(inputs, list) and isinstance(inputs[0], list) and unit_ranges is not None: + # Inputs are segmented into units and a mapping from chat template parts to units is given + inputs = self._construct_chat_template_from_mapping(inputs, unit_ranges) else: - messages = [[{"role": "user", "content": inp}] for inp in inputs] + if isinstance(inputs, list) and isinstance(inputs[0], list): + # Inputs are segmented into units but no mapping given, just join units + inputs = ["".join(inp) for inp in inputs] + + # Construct chat messages, placing each input into a single user message + if system_prompt is not None: + messages = [[{"role": "system", "content": system_prompt}, + {"role": "user", "content": inp}] for inp in inputs] + else: + messages = [[{"role": "user", "content": inp}] for inp in inputs] - # Apply chat template - inputs = self._tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) + # Apply chat template + inputs = self._tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) return inputs - def generate(self, inputs, chat_template=False, system_prompt=None, text_only=True, **kwargs): + def _construct_chat_template_from_mapping(self, inputs, unit_ranges): + """ + Construct chat template given mapping from parts of the chat template to input units. + + Args: + inputs (List[List[str]]): + A list of input texts segmented into units. + unit_ranges (dict): + Mapping from chat template parts to ranges of input units. + + Returns: + inputs_formatted (List[str]): + List of inputs formatted according to chat template. + """ + inputs_formatted = [] + # Iterate over inputs + for inp in inputs: + + # Construct conversation turn by turn + conversation = [] + for turn_ranges in unit_ranges["conversation"]: + turn = {} + for key, rng in turn_ranges.items(): + # There should be only one range per turn + turn["role"] = key + turn["content"] = "".join(inp[rng[0] : rng[1]]) + conversation.append(turn) + + if "documents" in unit_ranges: + # Construct documents + documents = [] + for doc_id, doc_ranges in enumerate(unit_ranges["documents"]): + document = {"doc_id": doc_id + 1} + for key, rng in doc_ranges.items(): + # Document text and possibly a title + document[key] = "".join(inp[rng[0] : rng[1]]) + documents.append(document) + else: + documents = None + + # Construct chat template from conversation and documents + input_formatted = self._tokenizer.apply_chat_template(conversation, + documents=documents, + add_generation_prompt=True, + tokenize=False) + inputs_formatted.append(input_formatted) + + return inputs_formatted + + def generate(self, inputs, chat_template=False, system_prompt=None, text_only=True, unit_ranges=None, **kwargs): """ Generate response from model. @@ -91,6 +146,8 @@ def generate(self, inputs, chat_template=False, system_prompt=None, text_only=Tr System prompt to include in chat template. text_only (bool): Return only generated text (default) or an object containing additional outputs. + unit_ranges (dict or None): + Mapping from chat template parts to ranges of input units. **kwargs (dict): Additional keyword arguments for VLLM model. @@ -101,7 +158,7 @@ def generate(self, inputs, chat_template=False, system_prompt=None, text_only=Tr output_text: List of generated texts. """ # Convert input into list of strings if needed - inputs = self.convert_input(inputs, chat_template, system_prompt) + inputs = self.convert_input(inputs, chat_template, system_prompt, unit_ranges) # Generate output output_text = [] diff --git a/icx360/utils/scalarizers/prob.py b/icx360/utils/scalarizers/prob.py index 18d9138..be3cabe 100644 --- a/icx360/utils/scalarizers/prob.py +++ b/icx360/utils/scalarizers/prob.py @@ -40,7 +40,9 @@ def __init__(self, model): if not isinstance(model, HFModel) and not isinstance(model, VLLMModel): raise TypeError("Model must be a HFModel (HuggingFace) or VLLMModel for ProbScalarizedModel") - def scalarize_output(self, inputs=None, outputs=None, ref_input=None, ref_output=None, chat_template=False, system_prompt=None, tokenizer_kwargs={}, transformation="log_prob_mean", **kwargs): + def scalarize_output(self, inputs=None, outputs=None, ref_input=None, ref_output=None, + chat_template=False, system_prompt=None, unit_ranges=None, tokenizer_kwargs={}, + transformation="log_prob_mean", **kwargs): """ Compute probability of generating reference output (or each unit thereof) conditioned on inputs. @@ -58,6 +60,8 @@ def scalarize_output(self, inputs=None, outputs=None, ref_input=None, ref_output Whether to apply chat template. system_prompt (str or None): System prompt to include in chat template. + unit_ranges (dict or None): + Mapping from chat template parts to ranges of input units. tokenizer_kwargs (dict): Additional keyword arguments for tokenizer. transformation (str, optional): @@ -77,7 +81,7 @@ def scalarize_output(self, inputs=None, outputs=None, ref_input=None, ref_output if inputs is None: raise ValueError("inputs must be provided for ProbScalarizedModel.scalarize_output()") else: - inputs = self.model.convert_input(inputs, chat_template, system_prompt, **tokenizer_kwargs) + inputs = self.model.convert_input(inputs, chat_template, system_prompt, unit_ranges, **tokenizer_kwargs) # Check for reference output if ref_output is None: raise ValueError("ref_output must be provided for ProbScalarizedModel.scalarize_output()") From 0e81f1dd67d8723af5c82f9d15fd1e24e49c5cc2 Mon Sep 17 00:00:00 2001 From: Dennis Wei Date: Mon, 29 Sep 2025 12:38:37 -0700 Subject: [PATCH 2/5] join segmented units in VLLMModel.convert_input() if chat_template=False - This case was mistakenly dropped by a previous change Signed-off-by: Dennis Wei --- icx360/utils/model_wrappers/vllm.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/icx360/utils/model_wrappers/vllm.py b/icx360/utils/model_wrappers/vllm.py index 0e5db7f..a0050da 100644 --- a/icx360/utils/model_wrappers/vllm.py +++ b/icx360/utils/model_wrappers/vllm.py @@ -81,6 +81,10 @@ def convert_input(self, inputs, chat_template=False, system_prompt=None, unit_ra # Apply chat template inputs = self._tokenizer.apply_chat_template(messages, add_generation_prompt=True, tokenize=False) + else: + if isinstance(inputs, list) and isinstance(inputs[0], list): + # Join segmented units + inputs = ["".join(inp) for inp in inputs] return inputs From c5ea1980e1fdc930dbb6ac74d2f5e920f957ef0e Mon Sep 17 00:00:00 2001 From: Dennis Wei Date: Thu, 18 Dec 2025 18:09:49 -0800 Subject: [PATCH 3/5] mappings between input units and parts of chat template for HFModel Signed-off-by: Dennis Wei --- icx360/utils/model_wrappers/huggingface.py | 99 +++++++++++++++++----- 1 file changed, 77 insertions(+), 22 deletions(-) diff --git a/icx360/utils/model_wrappers/huggingface.py b/icx360/utils/model_wrappers/huggingface.py index ac7fcd3..c20dce6 100644 --- a/icx360/utils/model_wrappers/huggingface.py +++ b/icx360/utils/model_wrappers/huggingface.py @@ -37,7 +37,7 @@ def __init__(self, model, tokenizer): self._tokenizer = tokenizer self._device = model.device - def convert_input(self, inputs, chat_template=False, system_prompt=None, **kwargs): + def convert_input(self, inputs, chat_template=False, system_prompt=None, unit_ranges=None, **kwargs): """ Encode input text as token IDs for HuggingFace model. @@ -48,6 +48,8 @@ def convert_input(self, inputs, chat_template=False, system_prompt=None, **kwarg Whether to apply chat template. system_prompt (str or None): System prompt to include in chat template. + unit_ranges (dict or None): + Mapping from chat template parts to ranges of input units. **kwargs (dict): Additional keyword arguments for tokenizer. @@ -60,37 +62,88 @@ def convert_input(self, inputs, chat_template=False, system_prompt=None, **kwarg # Batch of strings, enable padding and truncation kwargs["padding"] = True kwargs["truncation"] = True - if isinstance(inputs[0], list): - if chat_template: - # Join segmented strings - inputs = ["".join(inp) for inp in inputs] - else: - # Indicate to tokenizer that strings are segmented - kwargs["is_split_into_words"] = True + if isinstance(inputs[0], list) and not chat_template: + # Indicate to tokenizer that strings are segmented + kwargs["is_split_into_words"] = True if chat_template: - # Construct chat messages - if isinstance(inputs, list): - if system_prompt is not None: - messages = [[{"role": "system", "content": system_prompt}, - {"role": "user", "content": inp}] for inp in inputs] - else: - messages = [[{"role": "user", "content": inp}] for inp in inputs] + if isinstance(inputs, list) and isinstance(inputs[0], list) and unit_ranges is not None: + # Inputs are segmented into units and a mapping from chat template parts to units is given + messages, documents = self._construct_chat_template_from_mapping(inputs, unit_ranges) else: - if system_prompt is not None: - messages = [{"role": "system", "content": system_prompt}, - {"role": "user", "content": inputs}] + if isinstance(inputs, list) and isinstance(inputs[0], list): + # Inputs are segmented into units but no mapping given, just join units + inputs = ["".join(inp) for inp in inputs] + + # Construct chat messages + if isinstance(inputs, list): + if system_prompt is not None: + messages = [[{"role": "system", "content": system_prompt}, + {"role": "user", "content": inp}] for inp in inputs] + else: + messages = [[{"role": "user", "content": inp}] for inp in inputs] else: - messages = [{"role": "user", "content": inputs}] + if system_prompt is not None: + messages = [{"role": "system", "content": system_prompt}, + {"role": "user", "content": inputs}] + else: + messages = [{"role": "user", "content": inputs}] + documents = None + # Encode chat - input_encoding = self._tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_dict=True, **kwargs).to(self._device) + input_encoding = self._tokenizer.apply_chat_template( + messages, documents, add_generation_prompt=True, return_dict=True, **kwargs).to(self._device) else: # Encode text input_encoding = self._tokenizer(inputs, **kwargs).to(self._device) return input_encoding - def generate(self, inputs, chat_template=False, system_prompt=None, tokenizer_kwargs={}, text_only=True, **kwargs): + def _construct_chat_template_from_mapping(self, inputs, unit_ranges): + """ + Construct chat template given mapping from parts of the chat template to input units. + + Args: + inputs (List[List[str]]): + A list of input texts segmented into units. + unit_ranges (dict): + Mapping from chat template parts to ranges of input units. + + Returns: + conversation (List[Dict]): + List of chat messages. + documents (List[Dict] or None): + List of documents. + """ + inputs_formatted = [] + # Iterate over inputs + for inp in inputs: + + # Construct conversation turn by turn + conversation = [] + for turn_ranges in unit_ranges["conversation"]: + turn = {} + for key, rng in turn_ranges.items(): + # There should be only one range per turn + turn["role"] = key + turn["content"] = "".join(inp[rng[0] : rng[1]]) + conversation.append(turn) + + if "documents" in unit_ranges: + # Construct documents + documents = [] + for doc_id, doc_ranges in enumerate(unit_ranges["documents"]): + document = {"doc_id": doc_id + 1} + for key, rng in doc_ranges.items(): + # Document text and possibly a title + document[key] = "".join(inp[rng[0] : rng[1]]) + documents.append(document) + else: + documents = None + + return conversation, documents + + def generate(self, inputs, chat_template=False, system_prompt=None, unit_ranges=None, tokenizer_kwargs={}, text_only=True, **kwargs): """ Generate response from model. @@ -101,6 +154,8 @@ def generate(self, inputs, chat_template=False, system_prompt=None, tokenizer_kw Whether to apply chat template. system_prompt (str or None): System prompt to include in chat template. + unit_ranges (dict or None): + Mapping from chat template parts to ranges of input units. tokenizer_kwargs (dict): Additional keyword arguments for tokenizer. text_only (bool): @@ -117,7 +172,7 @@ def generate(self, inputs, chat_template=False, system_prompt=None, tokenizer_kw output_token_count: Maximum number of generated tokens. """ # Encode input text as token IDs - inputs = self.convert_input(inputs, chat_template, system_prompt, **tokenizer_kwargs) + inputs = self.convert_input(inputs, chat_template, system_prompt, unit_ranges, **tokenizer_kwargs) num_inputs, input_length = inputs["input_ids"].shape if num_inputs == 1 or not torch.cuda.is_available(): From f12c94677dc842d40c1a4ccb27b88b853006f2a9 Mon Sep 17 00:00:00 2001 From: Dennis Wei Date: Thu, 18 Dec 2025 22:50:33 -0800 Subject: [PATCH 4/5] fix HFModel._construct_chat_template_from_mapping - return list of chat-formatted strings, then tokenize and encode them Signed-off-by: Dennis Wei --- icx360/utils/model_wrappers/huggingface.py | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/icx360/utils/model_wrappers/huggingface.py b/icx360/utils/model_wrappers/huggingface.py index c20dce6..462c352 100644 --- a/icx360/utils/model_wrappers/huggingface.py +++ b/icx360/utils/model_wrappers/huggingface.py @@ -69,7 +69,9 @@ def convert_input(self, inputs, chat_template=False, system_prompt=None, unit_ra if chat_template: if isinstance(inputs, list) and isinstance(inputs[0], list) and unit_ranges is not None: # Inputs are segmented into units and a mapping from chat template parts to units is given - messages, documents = self._construct_chat_template_from_mapping(inputs, unit_ranges) + inputs_formatted = self._construct_chat_template_from_mapping(inputs, unit_ranges) + # Encode chat + input_encoding = self._tokenizer(inputs_formatted, **kwargs).to(self._device) else: if isinstance(inputs, list) and isinstance(inputs[0], list): # Inputs are segmented into units but no mapping given, just join units @@ -88,11 +90,10 @@ def convert_input(self, inputs, chat_template=False, system_prompt=None, unit_ra {"role": "user", "content": inputs}] else: messages = [{"role": "user", "content": inputs}] - documents = None # Encode chat input_encoding = self._tokenizer.apply_chat_template( - messages, documents, add_generation_prompt=True, return_dict=True, **kwargs).to(self._device) + messages, add_generation_prompt=True, return_dict=True, **kwargs).to(self._device) else: # Encode text input_encoding = self._tokenizer(inputs, **kwargs).to(self._device) @@ -110,10 +111,8 @@ def _construct_chat_template_from_mapping(self, inputs, unit_ranges): Mapping from chat template parts to ranges of input units. Returns: - conversation (List[Dict]): - List of chat messages. - documents (List[Dict] or None): - List of documents. + inputs_formatted (List[str]): + List of inputs formatted according to chat template. """ inputs_formatted = [] # Iterate over inputs @@ -141,7 +140,14 @@ def _construct_chat_template_from_mapping(self, inputs, unit_ranges): else: documents = None - return conversation, documents + # Construct chat template from conversation and documents + input_formatted = self._tokenizer.apply_chat_template(conversation, + documents=documents, + add_generation_prompt=True, + tokenize=False) + inputs_formatted.append(input_formatted) + + return inputs_formatted def generate(self, inputs, chat_template=False, system_prompt=None, unit_ranges=None, tokenizer_kwargs={}, text_only=True, **kwargs): """ From 2d3ce91a651fae78cf484b8ff685eeb4923caf22 Mon Sep 17 00:00:00 2001 From: Dennis Wei Date: Thu, 18 Dec 2025 23:39:37 -0800 Subject: [PATCH 5/5] fix indentation Signed-off-by: Dennis Wei --- icx360/utils/model_wrappers/huggingface.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/icx360/utils/model_wrappers/huggingface.py b/icx360/utils/model_wrappers/huggingface.py index 462c352..87ffd8b 100644 --- a/icx360/utils/model_wrappers/huggingface.py +++ b/icx360/utils/model_wrappers/huggingface.py @@ -91,9 +91,9 @@ def convert_input(self, inputs, chat_template=False, system_prompt=None, unit_ra else: messages = [{"role": "user", "content": inputs}] - # Encode chat - input_encoding = self._tokenizer.apply_chat_template( - messages, add_generation_prompt=True, return_dict=True, **kwargs).to(self._device) + # Encode chat + input_encoding = self._tokenizer.apply_chat_template( + messages, add_generation_prompt=True, return_dict=True, **kwargs).to(self._device) else: # Encode text input_encoding = self._tokenizer(inputs, **kwargs).to(self._device)