From e29fac3b9a01da16fa41fbdaf8e17e88fda0df5b Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Tue, 16 Sep 2025 15:44:27 -0700 Subject: [PATCH 01/17] Add README for LF LLM demo --- llm/README.md | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 llm/README.md diff --git a/llm/README.md b/llm/README.md new file mode 100644 index 0000000..3b7b658 --- /dev/null +++ b/llm/README.md @@ -0,0 +1,2 @@ +# LLM Demo + From 053b8d713906ee55d9535aab92ae4fdc34285329 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Tue, 16 Sep 2025 16:33:49 -0700 Subject: [PATCH 02/17] Adding work in progress code files for an llm example. Files: llm.py, which calls the llama-2-7b-chat model for simple question and answer, agent_llm.lf, which takes in the user input calls llm agent 1 and llm agent 2. --- llm/src/agent_llm.lf | 44 +++++++++++++++++++++ llm/src/llm.py | 94 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+) create mode 100644 llm/src/agent_llm.lf create mode 100644 llm/src/llm.py diff --git a/llm/src/agent_llm.lf b/llm/src/agent_llm.lf new file mode 100644 index 0000000..e5c7f90 --- /dev/null +++ b/llm/src/agent_llm.lf @@ -0,0 +1,44 @@ +target Python{ + files: llm_textgeneration.py +}; + +preamble{= + from llm import agent1, agent2 + +=} + +reactor llm_a{ + + output user_in + reaction (startup)-> user_in{= + txt = input("Hey there!") + user_in.set(txt) + =} +} + +reactor llm_b{ + input llm_a_in + output llm_b_out + reaction (llm_a_in)-> llm_b_out{= + llm_b_out.set(llm_a_in.value) + =} +} + +main reactor{ + state response + user_response = new llm_a() + llm_response = new llm_b() + // call llm a to respond to user + reaction (user_response.user_in)->llm_response.llm_a_in{= + + response = agent1(user_response.user_in.value) + llm_response.llm_a_in.set(response) + =} + + //llm b to respond to what llm a generated + reaction (llm_response.llm_b_out){= + # llm_response.llm_a_in = response + agent2(llm_response.llm_b_out.value) + =} + +} \ No newline at end of file diff --git a/llm/src/llm.py b/llm/src/llm.py new file mode 100644 index 0000000..63b6234 --- /dev/null +++ b/llm/src/llm.py @@ -0,0 +1,94 @@ +### Import Libraries +import transformers +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +from torch import cuda, bfloat16 + +### Add Your hugging face token here +hf_auth = "Add here" + +### Model to be chosen to act as an agent +model_id = "meta-llama/Llama-2-7b-chat-hf" + +### To check if there is GPU +has_cuda = torch.cuda.is_available() + +### To convert the model into 4bit quantization +bnb_config = None +if has_cuda: + try: + import bitsandbytes as bnb + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=torch.bfloat16, + ) + except Exception: + bnb_config = None + +### calling pre-trained tokenizer +tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_auth, use_fast=True) + + +### calling pre-trained model +model = AutoModelForCausalLM.from_pretrained( + model_id, + token=hf_auth, + device_map="auto" if has_cuda else None, + torch_dtype=torch.bfloat16 if has_cuda else torch.float32, + quantization_config=bnb_config, + low_cpu_mem_usage=True, +) + +model.eval() + +### agent 1 +def agent1(a): + user_query = a + + prompt = f"You are a helpful assistant.\n\n{user_query}\n" + + inputs = tokenizer(prompt, return_tensors="pt") + + if has_cuda: + inputs = {k: v.to("cuda") for k, v in inputs.items()} + + with torch.no_grad(): + 
outputs = model.generate( + **inputs, + max_new_tokens=100, + do_sample=True, + temperature=0.3, + ) + + gen_tokens = outputs[0] + prompt_len = inputs["input_ids"].shape[1] + response = tokenizer.decode(gen_tokens[prompt_len:], skip_special_tokens=True) + + print("LLM A response:", response) + return response + +### agent 2 +def agent2(b): + user_query = b + + prompt = f"Just summarize what the agent1 said: \n\n{user_query}\n\n" + + inputs = tokenizer(prompt, return_tensors="pt") + + if has_cuda: + inputs = {k: v.to("cuda") for k, v in inputs.items()} + + with torch.no_grad(): + outputs = model.generate( + **inputs, + max_new_tokens=100, + do_sample=True, + temperature=0.3, + ) + + gen_tokens = outputs[0] + prompt_len = inputs["input_ids"].shape[1] + response = tokenizer.decode(gen_tokens[prompt_len:], skip_special_tokens=True) + print("LLM B response:", response) \ No newline at end of file From 473c81f2978a7465e780c9e914c4ca869655d173 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Tue, 16 Sep 2025 16:37:32 -0700 Subject: [PATCH 03/17] changed the file name of the file to be included in agent_llm.lf --- llm/src/agent_llm.lf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/src/agent_llm.lf b/llm/src/agent_llm.lf index e5c7f90..35176fd 100644 --- a/llm/src/agent_llm.lf +++ b/llm/src/agent_llm.lf @@ -1,5 +1,5 @@ target Python{ - files: llm_textgeneration.py + files: llm.py }; preamble{= From 46522a14c9020089c117f67f39c5635c2e720bf0 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Fri, 19 Sep 2025 11:00:19 -0700 Subject: [PATCH 04/17] Added a quiz game. It is a game between two LLM models answering user questions and the model to respond the fastest wins --- llm/src/agent_llm.lf | 8 +- llm/src/llm.py | 114 +++++++++++----------- llm/src/llm_quiz_game.lf | 197 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 257 insertions(+), 62 deletions(-) create mode 100644 llm/src/llm_quiz_game.lf diff --git a/llm/src/agent_llm.lf b/llm/src/agent_llm.lf index 35176fd..5b5ab8e 100644 --- a/llm/src/agent_llm.lf +++ b/llm/src/agent_llm.lf @@ -7,7 +7,7 @@ preamble{= =} -reactor llm_a{ +reactor LLM_a{ output user_in reaction (startup)-> user_in{= @@ -16,7 +16,7 @@ reactor llm_a{ =} } -reactor llm_b{ +reactor LLM_b{ input llm_a_in output llm_b_out reaction (llm_a_in)-> llm_b_out{= @@ -26,8 +26,8 @@ reactor llm_b{ main reactor{ state response - user_response = new llm_a() - llm_response = new llm_b() + user_response = new LLM_a() + llm_response = new LLM_b() // call llm a to respond to user reaction (user_response.user_in)->llm_response.llm_a_in{= diff --git a/llm/src/llm.py b/llm/src/llm.py index 63b6234..93322f1 100644 --- a/llm/src/llm.py +++ b/llm/src/llm.py @@ -5,16 +5,19 @@ from torch import cuda, bfloat16 ### Add Your hugging face token here -hf_auth = "Add here" +hf_auth = "Add your token here" ### Model to be chosen to act as an agent model_id = "meta-llama/Llama-2-7b-chat-hf" +model_id_2 = "meta-llama/Llama-2-70b-chat-hf" -### To check if there is GPU +### To check if there is GPU and convert it into float 16 has_cuda = torch.cuda.is_available() +dtype = torch.bfloat16 if has_cuda else torch.float32 ### To convert the model into 4bit quantization bnb_config = None +### if there is cuda then the model is converted to 4bit quantization if has_cuda: try: import bitsandbytes as bnb @@ -22,73 +25,68 @@ load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True, - bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_compute_dtype=dtype, ) except Exception: 
bnb_config = None ### calling pre-trained tokenizer -tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_auth, use_fast=True) - - -### calling pre-trained model -model = AutoModelForCausalLM.from_pretrained( - model_id, - token=hf_auth, +tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_auth, use_fast=True) +tokenizer_2 = AutoTokenizer.from_pretrained(model_id_2, token=hf_auth, use_fast=True) +for tok in (tokenizer, tokenizer_2): + if tok.pad_token_id is None: + tok.pad_token = tok.eos_token + +### since both the models have same device map and using 4bit quantization for both +common = dict( device_map="auto" if has_cuda else None, - torch_dtype=torch.bfloat16 if has_cuda else torch.float32, - quantization_config=bnb_config, + dtype=dtype, low_cpu_mem_usage=True, ) +if bnb_config is not None: + common["quantization_config"] = bnb_config -model.eval() - -### agent 1 -def agent1(a): - user_query = a - - prompt = f"You are a helpful assistant.\n\n{user_query}\n" - +### calling pre-trained model +model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_auth, **common) +model_2 = AutoModelForCausalLM.from_pretrained(model_id_2, token=hf_auth, **common) +model.eval(); model_2.eval() + + + +### arguments for both the models +GEN_A = dict(max_new_tokens=24, do_sample=False, temperature=0.1, + eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id) +GEN_B = dict(max_new_tokens=24, do_sample=False, temperature=0.1, + eos_token_id=tokenizer_2.eos_token_id, pad_token_id=tokenizer_2.pad_token_id) + +###to resturn only one line answers +def postprocess(text: str) -> str: + t = text.strip() + for sep in ["\n", ". ", " "]: + idx = t.find(sep) + if idx > 0: + t = t[:idx] + break + return t.strip().strip(":").strip() + +###Calling agent1 from .lf code +def agent1(q: str) -> str: + prompt = f"You are a concise Q&A assistant.\n\n{q}\n" inputs = tokenizer(prompt, return_tensors="pt") - - if has_cuda: - inputs = {k: v.to("cuda") for k, v in inputs.items()} - + if has_cuda: inputs = {k: v.to("cuda") for k, v in inputs.items()} with torch.no_grad(): - outputs = model.generate( - **inputs, - max_new_tokens=100, - do_sample=True, - temperature=0.3, - ) - - gen_tokens = outputs[0] + out = model.generate(**inputs, **GEN_A) prompt_len = inputs["input_ids"].shape[1] - response = tokenizer.decode(gen_tokens[prompt_len:], skip_special_tokens=True) - - print("LLM A response:", response) - return response - -### agent 2 -def agent2(b): - user_query = b - - prompt = f"Just summarize what the agent1 said: \n\n{user_query}\n\n" - - inputs = tokenizer(prompt, return_tensors="pt") - - if has_cuda: - inputs = {k: v.to("cuda") for k, v in inputs.items()} - + result = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True) + return postprocess(result) + +###Calling agent2 from .lf code +def agent2(q: str) -> str: + prompt = f"You are a concise Q&A assistant.\n\n{q}\n" + inputs = tokenizer_2(prompt, return_tensors="pt") + if has_cuda: inputs = {k: v.to("cuda") for k, v in inputs.items()} with torch.no_grad(): - outputs = model.generate( - **inputs, - max_new_tokens=100, - do_sample=True, - temperature=0.3, - ) - - gen_tokens = outputs[0] + out = model_2.generate(**inputs, **GEN_B) prompt_len = inputs["input_ids"].shape[1] - response = tokenizer.decode(gen_tokens[prompt_len:], skip_special_tokens=True) - print("LLM B response:", response) \ No newline at end of file + result = tokenizer_2.decode(out[0][prompt_len:], skip_special_tokens=True) + return postprocess(result) \ No 
newline at end of file diff --git a/llm/src/llm_quiz_game.lf b/llm/src/llm_quiz_game.lf new file mode 100644 index 0000000..85e89a4 --- /dev/null +++ b/llm/src/llm_quiz_game.lf @@ -0,0 +1,197 @@ +### llm.py file needs to be in the same directory +target Python { keepalive: true, files: ["llm.py"] } + +preamble {= + import threading + import time + from llm import agent1, agent2 # your Python functions + + def keyboard_prompt(reactor, action): + while True: + time.sleep(5) + action.schedule(None) +=} + +### Reactor for handling user keyboard input +reactor KeyboardInput { + state th + state terminate = False + state eof = False + state buffer = "" + + physical action line + output prompt + output quit + + reaction(startup) -> line {= + def reader(): + while not self.terminate: + + s = input("Enter the quiz question\n") + if s == "": + self.eof = True + line.schedule(0) + break + elif s.lower().strip() == "quit": + self.eof = True + line.schedule(0) + break + else: + self.buffer = s + line.schedule(1) + self.th = threading.Thread(target=reader, daemon=True) + self.th.start() + =} + + reaction(line) -> prompt, quit {= + if self.eof: + quit.set() + environment().sync_shutdown() + else: + prompt.set(self.buffer) + =} + + reaction(shutdown) {= + self.terminate = True + if self.th and self.th.is_alive(): + self.th.join() + =} +} + +### Reactor for calling agent 1 +reactor LlmA { + state th + state running = False + state out_buffer = "" + + input user_in + physical action done + output answer + + + reaction(user_in) -> done {= + if self.running: + return + self.running = True + query = user_in.value + def agentA(): + try: + self.out_buffer = agent1(query) + finally: + done.schedule(1) + self.th = threading.Thread(target=agentA, daemon=True) + self.th.start() + =} + + reaction(done) -> answer {= + self.running = False + answer.set(self.out_buffer) + =} +} + + +### Reactor for calling agent 2 +reactor LlmB { + state th + state running = False + state out_buffer = "" + input user_in + output answer + + physical action done + + reaction(user_in)->done{= + if self.running: + return + self.running = True + query = user_in.value + def agentB(): + try: + self.out_buffer = agent2(query) + finally: + done.schedule(1) + self.th = threading.Thread(target=agentB, daemon=True) + self.th.start() + =} + + reaction(done)->answer{= + self.running = False + answer.set(self.out_buffer) + =} + +} + +###Judge reactor to determine which agent responds first +reactor Judge{ + input query + input llma + input llmb + output ask + + state waiting = False + state logical_base_time = 0 + state physical_base_time = 0 + state winner = "" + + logical action timeout(60 sec) + + reaction(query) -> timeout, ask {= + self.waiting = True + self.winner = "" + self.logical_base_time = lf.time.logical_elapsed() + self.physical_base_time = lf.time.physical_elapsed() + timeout.schedule(0) + print(f"\n\n\nQuery: {query.value}\n") + print("waiting...\n") + ask.set(query.value) + =} + + reaction(llma) {= + if not self.waiting: + return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f" Winner: LLM-A | logical {logical_ms} ms | physical {physical_ms} ms") + print(f"{llma.value}") + =} + + reaction(llmb) {= + if not self.waiting: + return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = 
lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f"Winner: LLM-B | logical {logical_ms} ms | physical {physical_ms} ms") + print(f"{llmb.value}") + =} + + reaction(timeout) {= + if not self.waiting: + return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f"TIMEOUT (60 s) | logical {logical_ms} ms | physical {physical_ms} ms") + =} +} + + +main reactor { + llma_response = new LlmA() + llmb_response = new LlmB() + keyboard = new KeyboardInput() + j = new Judge() + + keyboard.prompt -> j.query + j.ask -> llma_response.user_in + j.ask -> llmb_response.user_in + llma_response.answer -> j.llma + llmb_response.answer -> j.llmb +} \ No newline at end of file From 9d9ee262ac2adcc677d09d4ecac9e27669c2e864 Mon Sep 17 00:00:00 2001 From: Deeksha Prahlad <112724341+Deeksha-20-99@users.noreply.github.com> Date: Fri, 19 Sep 2025 11:26:33 -0700 Subject: [PATCH 05/17] Updated the README.md for instructions to run the quiz game --- llm/README.md | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/llm/README.md b/llm/README.md index 3b7b658..c7b4000 100644 --- a/llm/README.md +++ b/llm/README.md @@ -1,2 +1,95 @@ # LLM Demo +# Overview +This is a quiz-style game between two LLM agents. For each user question typed at the keyboard, both agents answer in parallel. The Judge announces whichever answer arrives first (or a timeout if neither responds within 60 sec), and prints per-question elapsed logical and physical times. + +# Pre-requisites + +You need Python installed, as llm.py is written in Python. + +## Library Dependencies +To run this project, the following dependencies are required. The model used in this repository has been quantized using 4-bit precision (bnb_4bit) and relies on bitsandbytes for efficient matrix operations and memory optimization. So specific versions of bitsandbytes, torch, and torchvision are mandatory for compatibility. +While newer versions of other dependencies may work, the specific versions listed below have been tested and are recommended for optimal performance. + +It is highly recommended to create a Python virtual environment or a Conda environment to manage dependencies. The available options for environment setup are listed below. + +``` +pip install accelerate +pip install transformers +pip install tokenizers +pip install bitsandbytes>=0.43.0 +pip install torch +pip install torchvision +``` + +## System Requirements + +To ensure optimal performance, the following hardware and software requirements are utilized. \ +**Note:** To replicate this model, you can use any equivalent hardware that meets the computational requirements. 
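+
+To confirm that PyTorch can actually see a suitable GPU before loading the models, a minimal check such as the following can be used (a sketch only; the memory actually required depends on which models you load):
+
+```python
+import torch
+
+# Report whether a CUDA device is visible and how much memory it has.
+print("CUDA available:", torch.cuda.is_available())
+if torch.cuda.is_available():
+    print("Device:", torch.cuda.get_device_name(0))
+    free, total = torch.cuda.mem_get_info()
+    print(f"Free / total GPU memory: {free / 1e9:.1f} / {total / 1e9:.1f} GB")
+```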
+ +### Hardware Requirements +- **GPU**: NVIDIA RTX A6000 + +### Software Requirements +- **Python** (Ensure Python is installed) +- **CUDA Version**: 12.8 +- **NVIDIA-SMI**: For monitoring GPU performance and memory utilization + +### Model Dependencies +- **Pre-trained Models**: [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) +**Note:** Please access and use the pre-trained models, authentication keys must be obtained from the [Hugging Face repository](https://huggingface.co/settings/tokens). Ensure you have a valid API token and configure authentication. + +Make sure the environment is properly configured to use CUDA for optimal GPU acceleration. + +# Files and directories in this repository + - **`llm.py`** - Contains the logic to load and call LLM models from the Hugging Face pretrained hub. + - **`llm_quiz_game.lf`** - Lingua Franca program that defines the quiz game reactors (Keyboard input, LLM agents, and Judge). + +# Execution Workflow + +### Step 1: +Run the **`llm_quiz_game.lf`**. + +**Note:** +- Ensure that you specify the correct file paths + +Run the following commands: + +``` +lfc src/llm_quiz_game.lf +``` + +### Step 2: Run the binary file and input the quiz question +Run the following commands: + +``` +./bin/llm_quiz_game +``` + +The system will ask for entering the quiz question which is to be obtained from the keyboard input. + +Example output printed on the terminal: + +
+
+```
+--------------------------------------------------
+---- System clock resolution: 1 nsec
+---- Start execution on Fri Sep 19 10:46:31 2025 ---- plus 772215861 nanoseconds
+Enter the quiz question
+What is the capital of South Korea?
+Query: What is the capital of South Korea?
+
+waiting...
+
+Winner: LLM-B | logical 1184 ms | physical 1184 ms
+Answer: Seoul.
+--------------------------------------------------
+```
+
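+To stop the game, type `quit` (or press Enter on an empty line) at the question prompt; the `KeyboardInput` reactor then requests a synchronized shutdown:
+
+```
+Enter the quiz question
+quit
+```
+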
+ +### Step 3: Monitoring GPU Performance (Optional) +In another terminal, monitor GPU performance and memory utilization while running the scripts, please use NVIDIA-SMI: +``` +nvidia-smi +``` +# Contributors From fe1f6054081268e9272d552c75342aaf9a8de9e1 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Fri, 19 Sep 2025 12:09:07 -0700 Subject: [PATCH 06/17] Removing the older version of the file agent_llm.lf --- llm/src/agent_llm.lf | 44 -------------------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 llm/src/agent_llm.lf diff --git a/llm/src/agent_llm.lf b/llm/src/agent_llm.lf deleted file mode 100644 index 5b5ab8e..0000000 --- a/llm/src/agent_llm.lf +++ /dev/null @@ -1,44 +0,0 @@ -target Python{ - files: llm.py -}; - -preamble{= - from llm import agent1, agent2 - -=} - -reactor LLM_a{ - - output user_in - reaction (startup)-> user_in{= - txt = input("Hey there!") - user_in.set(txt) - =} -} - -reactor LLM_b{ - input llm_a_in - output llm_b_out - reaction (llm_a_in)-> llm_b_out{= - llm_b_out.set(llm_a_in.value) - =} -} - -main reactor{ - state response - user_response = new LLM_a() - llm_response = new LLM_b() - // call llm a to respond to user - reaction (user_response.user_in)->llm_response.llm_a_in{= - - response = agent1(user_response.user_in.value) - llm_response.llm_a_in.set(response) - =} - - //llm b to respond to what llm a generated - reaction (llm_response.llm_b_out){= - # llm_response.llm_a_in = response - agent2(llm_response.llm_b_out.value) - =} - -} \ No newline at end of file From b0206643f229046eb82e3b59249a0ff493e3efef Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Mon, 22 Sep 2025 12:16:07 -0700 Subject: [PATCH 07/17] Modified comments to the program --- llm/src/llm_quiz_game.lf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/src/llm_quiz_game.lf b/llm/src/llm_quiz_game.lf index 85e89a4..cd9584a 100644 --- a/llm/src/llm_quiz_game.lf +++ b/llm/src/llm_quiz_game.lf @@ -4,7 +4,7 @@ target Python { keepalive: true, files: ["llm.py"] } preamble {= import threading import time - from llm import agent1, agent2 # your Python functions + from llm import agent1, agent2 def keyboard_prompt(reactor, action): while True: From cc0a08a5e504e562e4faeccbbfeef829627d25dd Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Tue, 23 Sep 2025 16:28:33 -0700 Subject: [PATCH 08/17] created the files for quiz game between two llm models using main reactor and also added a federated execution --- llm/src/llm_a.py | 77 +++++++ llm/src/llm_b.py | 78 +++++++ llm/src/llm_base_class.lf | 176 ++++++++++++++ llm/src/llm_base_class_federate.lf | 354 ++++++++++++++++++++++++++++ llm/src/llm_game_federated.lf | 40 ++++ llm/src/llm_quiz_game.lf | 359 +++++++++++++++-------------- 6 files changed, 907 insertions(+), 177 deletions(-) create mode 100644 llm/src/llm_a.py create mode 100644 llm/src/llm_b.py create mode 100644 llm/src/llm_base_class.lf create mode 100644 llm/src/llm_base_class_federate.lf create mode 100644 llm/src/llm_game_federated.lf diff --git a/llm/src/llm_a.py b/llm/src/llm_a.py new file mode 100644 index 0000000..df5faf3 --- /dev/null +++ b/llm/src/llm_a.py @@ -0,0 +1,77 @@ +# llm_a.py — Agent 1 (7B) + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + +# <<< put your token here >>> +hf_auth = "add token here " + +# Model to be chosen to act as an agent +model_id = "meta-llama/Llama-2-7b-chat-hf" + +# Require GPU (you said it must work only on GPU) +has_cuda = torch.cuda.is_available() 
+if not has_cuda: + raise RuntimeError("CUDA GPU required for this configuration.") +dtype = torch.bfloat16 if has_cuda else torch.float32 + +# 4-bit quantization +bnb_config = None +if has_cuda: + try: + import bitsandbytes as bnb # noqa: F401 + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=dtype, + ) + except Exception: + bnb_config = None + +# Tokenizer +tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_auth, use_fast=True) +if tokenizer.pad_token_id is None: + tokenizer.pad_token = tokenizer.eos_token + +# Shared kwargs +common = dict( + device_map="auto" if has_cuda else None, + dtype=dtype, + low_cpu_mem_usage=True, +) +if bnb_config is not None: + common["quantization_config"] = bnb_config + +# Model +model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_auth, **common) +model.eval() + +# Generation args +GEN_A = dict( + max_new_tokens=24, do_sample=False, temperature=0.1, + eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id +) + +# One-line postprocess +def postprocess(text: str) -> str: + t = text.strip() + for sep in ["\n", ". ", " "]: + idx = t.find(sep) + if idx > 0: + t = t[:idx] + break + return t.strip().strip(":").strip() + +# Agent 1 entrypoint +def agent1(q: str) -> str: + prompt = f"You are a concise Q&A assistant.\n\n{q}\n" + inputs = tokenizer(prompt, return_tensors="pt") + if has_cuda: + inputs = {k: v.to("cuda") for k, v in inputs.items()} + with torch.no_grad(): + out = model.generate(**inputs, **GEN_A) + prompt_len = inputs["input_ids"].shape[1] + result = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True) + print(result) + return postprocess(result) \ No newline at end of file diff --git a/llm/src/llm_b.py b/llm/src/llm_b.py new file mode 100644 index 0000000..513d6c2 --- /dev/null +++ b/llm/src/llm_b.py @@ -0,0 +1,78 @@ + +# llm_b.py — Agent 2 (70B) + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + +# <<< put your token here >>> +hf_auth = "add token here" + +# Model to be chosen to act as an agent +model_id_2 = "meta-llama/Llama-2-70b-chat-hf" + +# Require GPU (GPU-only) +has_cuda = torch.cuda.is_available() +if not has_cuda: + raise RuntimeError("CUDA GPU required for this configuration.") +dtype = torch.bfloat16 if has_cuda else torch.float32 + +# 4-bit quantization +bnb_config = None +if has_cuda: + try: + import bitsandbytes as bnb # noqa: F401 + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=dtype, + ) + except Exception: + bnb_config = None + +# Tokenizer +tokenizer_2 = AutoTokenizer.from_pretrained(model_id_2, token=hf_auth, use_fast=True) +if tokenizer_2.pad_token_id is None: + tokenizer_2.pad_token = tokenizer_2.eos_token + +# Shared kwargs +common = dict( + device_map="auto" if has_cuda else None, + dtype=dtype, + low_cpu_mem_usage=True, +) +if bnb_config is not None: + common["quantization_config"] = bnb_config + +# Model +model_2 = AutoModelForCausalLM.from_pretrained(model_id_2, token=hf_auth, **common) +model_2.eval() + +# Generation args +GEN_B = dict( + max_new_tokens=24, do_sample=False, temperature=0.1, + eos_token_id=tokenizer_2.eos_token_id, pad_token_id=tokenizer_2.pad_token_id +) + +# One-line postprocess +def postprocess(text: str) -> str: + t = text.strip() + for sep in ["\n", ". 
", " "]: + idx = t.find(sep) + if idx > 0: + t = t[:idx] + break + return t.strip().strip(":").strip() + +# Agent 2 entrypoint +def agent2(q: str) -> str: + prompt = f"You are a concise Q&A assistant.\n\n{q}\n" + inputs = tokenizer_2(prompt, return_tensors="pt") + if has_cuda: + inputs = {k: v.to("cuda") for k, v in inputs.items()} + with torch.no_grad(): + out = model_2.generate(**inputs, **GEN_B) + prompt_len = inputs["input_ids"].shape[1] + result = tokenizer_2.decode(out[0][prompt_len:], skip_special_tokens=True) + print(result) + return postprocess(result) \ No newline at end of file diff --git a/llm/src/llm_base_class.lf b/llm/src/llm_base_class.lf new file mode 100644 index 0000000..d1eae4e --- /dev/null +++ b/llm/src/llm_base_class.lf @@ -0,0 +1,176 @@ +target Python + +### Reactor for handling user keyboard input +reactor KeyboardInput { + state th + state terminate = False + state eof = False + state buffer = "" + + physical action line + output prompt + output quit + + reaction(startup) -> line {= + def reader(): + while not self.terminate: + + s = input("Enter the quiz question\n") + if s == "": + self.eof = True + line.schedule(0) + break + elif s.lower().strip() == "quit": + self.eof = True + line.schedule(0) + break + else: + self.buffer = s + line.schedule(1) + self.th = threading.Thread(target=reader, daemon=True) + self.th.start() + =} + + reaction(line) -> prompt, quit {= + if self.eof: + quit.set() + environment().sync_shutdown() + else: + prompt.set(self.buffer) + =} + + reaction(shutdown) {= + self.terminate = True + if self.th and self.th.is_alive(): + self.th.join() + =} +} + + + +### Reactor for calling agent 1 +reactor LlmA { + state th + state running = False + state out_buffer = "" + + input user_in + physical action done + output answer + + + reaction(user_in) -> done {= + if self.running: + return + self.running = True + query = user_in.value + def agentA(): + try: + self.out_buffer = agent1(query) + finally: + done.schedule(1) + self.th = threading.Thread(target=agentA, daemon=True) + self.th.start() + =} + + reaction(done) -> answer {= + self.running = False + answer.set(self.out_buffer) + =} +} + + + +// ### Reactor for calling agent 2 +reactor LlmB { + state th + state running = False + state out_buffer = "" + input user_in + output answer + + physical action done + + reaction(user_in)->done{= + if self.running: + return + self.running = True + query = user_in.value + def agentB(): + try: + self.out_buffer = agent2(query) + finally: + done.schedule(1) + self.th = threading.Thread(target=agentB, daemon=True) + self.th.start() + =} + + reaction(done)->answer{= + self.running = False + answer.set(self.out_buffer) + =} + +} + + + +// ###Judge reactor to determine which agent responds first +reactor Judge{ + input query + input llma + input llmb + output ask + + state waiting = False + state logical_base_time = 0 + state physical_base_time = 0 + state winner = "" + + logical action timeout(60 sec) + + reaction(query) -> timeout, ask {= + self.waiting = True + self.winner = "" + self.logical_base_time = lf.time.logical_elapsed() + self.physical_base_time = lf.time.physical_elapsed() + timeout.schedule(0) + print(f"\n\n\nQuery: {query.value}\n") + print("waiting...\n") + ask.set(query.value) + =} + + reaction(llma) {= + if not self.waiting: + return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - 
self.physical_base_time) / 1000000) + print(f" Winner: LLM-A | logical {logical_ms} ms | physical {physical_ms} ms") + print(f"{llma.value}") + =} + + reaction(llmb) {= + if not self.waiting: + return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f"Winner: LLM-B | logical {logical_ms} ms | physical {physical_ms} ms") + print(f"{llmb.value}") + =} + + reaction(timeout) {= + if not self.waiting: + return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f"TIMEOUT (60 s) | logical {logical_ms} ms | physical {physical_ms} ms") + =} +} diff --git a/llm/src/llm_base_class_federate.lf b/llm/src/llm_base_class_federate.lf new file mode 100644 index 0000000..c5638d8 --- /dev/null +++ b/llm/src/llm_base_class_federate.lf @@ -0,0 +1,354 @@ +target Python + +### Reactor for handling user keyboard input + +// reactor KeyboardInput { +// state th +// state terminate = False +// state eof = False +// state buffer = "" + +// physical action line +// output prompt +// output quit + +// reaction(startup) -> line {= +// import sys +// import threading +// import time + +// def reader(): +// while not self.terminate: +// s = input("Enter the quiz question\n") +// if s == "": +// self.eof = True +// try: line.schedule(0) +// except Exception as e: print("[keyboard] schedule EOF failed:", e, flush=True) +// break +// elif s.lower().strip() == "quit": +// self.eof = True +// try: line.schedule(0) +// except Exception as e: print("[keyboard] schedule quit failed:", e, flush=True) +// break +// else: +// self.buffer = s +// try: line.schedule(1) # small logical hop +// except Exception as e: +// print("[keyboard] schedule failed:", e, flush=True) +// break +// self.th = threading.Thread(target=reader, daemon=True) +// self.th.start() +// =} + +// reaction(line) -> prompt, quit {= +// if self.eof: +// quit.set() +// environment().sync_shutdown() +// else: +// prompt.set(self.buffer) +// =} + +// reaction(shutdown) {= +// self.terminate = True +// if self.th and self.th.is_alive(): +// self.th.join() +// =} +// } + +### Reactor for calling agent 1 +reactor LlmA { + state th + state running = False + state out_buffer = "" + state ready = False + + input user_in + physical action done + physical action notify_ready + output answer + output ready_out + + reaction(startup) {= + import os, sys, importlib.util, threading + def _load(): + try: + here = os.path.dirname(__file__) + if here not in sys.path: sys.path.insert(0, here) + from llm_a import agent1 + notify_ready.schedule(0) + except Exception as e: + print("[LlmA] Preload failed:", e, flush=True) + threading.Thread(target=_load, daemon=True).start() + =} + + reaction(notify_ready) -> ready_out {= + self.ready = True + ready_out.set(True) + =} + + reaction(user_in) -> done {= + import threading + if not self.ready: return + if self.running: return + self.running = True + q = user_in.value + from llm_a import agent1 + def agentA(): + try: + self.out_buffer = agent1(q) + finally: + try: done.schedule(5) + except Exception as e: print("[LlmA] schedule failed:", e, flush=True) + self.th = threading.Thread(target=agentA, daemon=True) + self.th.start() + =} + 
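+
+    ### `done` is a physical action scheduled from the worker thread, so the reaction
+    ### below runs at a fresh tag and publishes the buffered result on `answer`.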
+ reaction(done) -> answer {= + self.running = False + answer.set(self.out_buffer) + =} +} + +// ### Reactor for calling agent 2 +reactor LlmB { + state th + state running = False + state out_buffer = "" + state ready = False + + input user_in + physical action done + physical action notify_ready + output answer + output ready_out + + reaction(startup) {= + import os, sys, importlib.util, threading + def _load(): + try: + here = os.path.dirname(__file__) + if here not in sys.path: sys.path.insert(0, here) + from llm_b import agent2 + notify_ready.schedule(0) + except Exception as e: + print("[LlmB] Preload failed:", e, flush=True) + threading.Thread(target=_load, daemon=True).start() + =} + + reaction(notify_ready) -> ready_out {= + self.ready = True + ready_out.set(True) + =} + + reaction(user_in) -> done {= + import threading + if not self.ready: return + if self.running: return + self.running = True + q = user_in.value + from llm_b import agent2 + def agentB(): + try: + self.out_buffer = agent2(q) + finally: + try: done.schedule(5) + except Exception as e: print("[LlmB] schedule failed:", e, flush=True) + self.th = threading.Thread(target=agentB, daemon=True) + self.th.start() + =} + + reaction(done) -> answer {= + self.running = False + answer.set(self.out_buffer) + =} +} +// ###Judge reactor to determine which agent responds first +// reactor Judge{ +// input query +// input llma +// input llmb +// output ask + +// state waiting = False +// state logical_base_time = 0 +// state physical_base_time = 0 +// state winner = "" + +// logical action timeout(60 sec) + +// reaction(query) -> timeout, ask {= +// self.waiting = True +// self.winner = "" +// self.logical_base_time = lf.time.logical_elapsed() +// self.physical_base_time = lf.time.physical_elapsed() +// timeout.schedule(0) +// print(f"\n\n\nQuery: {query.value}\n") +// print("waiting...\n") +// ask.set(query.value) +// =} + +// reaction(llma) {= +// if not self.waiting: +// return +// self.waiting = False +// logical_now = lf.time.logical_elapsed() +// physical_now = lf.time.physical_elapsed() +// logical_ms = int((logical_now - self.logical_base_time) / 1000000) +// physical_ms = int((physical_now - self.physical_base_time) / 1000000) +// print(f" Winner: LLM-A | logical {logical_ms} ms | physical {physical_ms} ms") +// print(f"{llma.value}") +// =} + +// reaction(llmb) {= +// if not self.waiting: +// return +// self.waiting = False +// logical_now = lf.time.logical_elapsed() +// physical_now = lf.time.physical_elapsed() +// logical_ms = int((logical_now - self.logical_base_time) / 1000000) +// physical_ms = int((physical_now - self.physical_base_time) / 1000000) +// print(f"Winner: LLM-B | logical {logical_ms} ms | physical {physical_ms} ms") +// print(f"{llmb.value}") +// =} + +// reaction(timeout) {= +// if not self.waiting: +// return +// self.waiting = False +// logical_now = lf.time.logical_elapsed() +// physical_now = lf.time.physical_elapsed() +// logical_ms = int((logical_now - self.logical_base_time) / 1000000) +// physical_ms = int((physical_now - self.physical_base_time) / 1000000) +// print(f"TIMEOUT (60 s) | logical {logical_ms} ms | physical {physical_ms} ms") +// =} +// } + +reactor Judge { + state th + state reader_started = False + state terminate = False + state eof = False + state buffer = "" + state waiting = False + state logical_base_time = 0 + state physical_base_time = 0 + input ready_a + input ready_b + state a_ready = False + state b_ready = False + physical action line + physical action tick + logical 
action timeout(60 sec) + output ask + input llma + input llmb + output quit + + reaction(startup) {= + print("[Judge] Waiting for models to load...", flush=True) + =} + + reaction(ready_a) {= + self.a_ready = True + if self.a_ready and self.b_ready and not self.reader_started: + import sys, threading + def reader(): + while not self.terminate: + s = input("Enter the quiz question (or 'quit')\n") + if s == "" or s.lower().strip() == "quit": + self.eof = True + try: line.schedule(0) + except Exception as e: print("[Judge] schedule EOF failed:", e, flush=True) + break + else: + self.buffer = s + try: line.schedule(1) + except Exception as e: + print("[Judge] schedule line failed:", e, flush=True) + break + self.reader_started = True + print("[Judge] Models ready. You can ask questions now.", flush=True) + self.th = threading.Thread(target=reader, daemon=True) + self.th.start() + =} + + reaction(ready_b) {= + self.b_ready = True + if self.a_ready and self.b_ready and not self.reader_started: + import sys, threading + def reader(): + while not self.terminate: + s = input("Enter the quiz question (or 'quit')\n") + if s == "" or s.lower().strip() == "quit": + self.eof = True + try: line.schedule(0) + except Exception as e: print("[Judge] schedule EOF failed:", e, flush=True) + break + else: + self.buffer = s + try: line.schedule(1) + except Exception as e: + print("[Judge] schedule line failed:", e, flush=True) + break + self.reader_started = True + print("[Judge] Models ready. You can ask questions now.", flush=True) + self.th = threading.Thread(target=reader, daemon=True) + self.th.start() + =} + + reaction(line) -> tick, ask, timeout, quit {= + if self.eof: + quit.set() + environment().sync_shutdown() + else: + self.waiting = True + self.logical_base_time = lf.time.logical_elapsed() + self.physical_base_time = lf.time.physical_elapsed() + timeout.schedule(0) + print(f"\n\n\nQuery: {self.buffer}\n", flush=True) + print("waiting...\n", flush=True) + tick.schedule(5) + =} + + reaction(tick) -> ask {= + ask.set(self.buffer) + =} + + reaction(llma) {= + if not self.waiting: return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f" Winner: LLM-A | logical {logical_ms} ms | physical {physical_ms} ms", flush=True) + print(f"{llma.value}", flush=True) + =} + + reaction(llmb) {= + if not self.waiting: return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f"Winner: LLM-B | logical {logical_ms} ms | physical {physical_ms} ms", flush=True) + print(f"{llmb.value}", flush=True) + =} + + reaction(timeout) {= + if not self.waiting: return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f"TIMEOUT (60 s) | logical {logical_ms} ms | physical {physical_ms} ms", flush=True) + =} + + reaction(shutdown) {= + self.terminate = True + if self.th and self.th.is_alive(): + self.th.join() + =} +} \ No newline at end of file diff --git a/llm/src/llm_game_federated.lf b/llm/src/llm_game_federated.lf new 
file mode 100644 index 0000000..0a4dcd0 --- /dev/null +++ b/llm/src/llm_game_federated.lf @@ -0,0 +1,40 @@ +### llm.py file needs to be in the same directory +target Python { keepalive: true, files: ["llm_a.py", "llm_b.py"] } +// import KeyboardInput from "llm_base_class_federate.lf" +import LlmA from "llm_base_class_federate.lf" +import LlmB from "llm_base_class_federate.lf" +import Judge from "llm_base_class_federate.lf" + +preamble {= + import threading + import time + from llm_a import agent1 + from llm_b import agent2 +=} + + +federated reactor llm_game_federated at 10.218.100.95 { + // llma_response_f = new LlmA() + // llmb_response_f = new LlmB() + // keyboard_f = new KeyboardInput() + // j_f = new Judge() + + // keyboard_f.prompt -> j_f.query + // j_f.ask -> llma_response_f.user_in + // j_f.ask -> llmb_response_f.user_in + // llma_response_f.answer -> j_f.llma + // llmb_response_f.answer -> j_f.llmb + j = new Judge() + llma = new LlmA() + llmb = new LlmB() + + j.ask -> llma.user_in + j.ask -> llmb.user_in + llma.answer -> j.llma + llmb.answer -> j.llmb + + llma.ready_out -> j.ready_a + llmb.ready_out -> j.ready_b + +} + diff --git a/llm/src/llm_quiz_game.lf b/llm/src/llm_quiz_game.lf index cd9584a..7ba9d6b 100644 --- a/llm/src/llm_quiz_game.lf +++ b/llm/src/llm_quiz_game.lf @@ -1,188 +1,17 @@ ### llm.py file needs to be in the same directory target Python { keepalive: true, files: ["llm.py"] } +import KeyboardInput from "llm_base_class.lf" +import LlmA from "llm_base_class.lf" +import LlmB from "llm_base_class.lf" +import Judge from "llm_base_class.lf" + preamble {= import threading import time from llm import agent1, agent2 - - def keyboard_prompt(reactor, action): - while True: - time.sleep(5) - action.schedule(None) =} -### Reactor for handling user keyboard input -reactor KeyboardInput { - state th - state terminate = False - state eof = False - state buffer = "" - - physical action line - output prompt - output quit - - reaction(startup) -> line {= - def reader(): - while not self.terminate: - - s = input("Enter the quiz question\n") - if s == "": - self.eof = True - line.schedule(0) - break - elif s.lower().strip() == "quit": - self.eof = True - line.schedule(0) - break - else: - self.buffer = s - line.schedule(1) - self.th = threading.Thread(target=reader, daemon=True) - self.th.start() - =} - - reaction(line) -> prompt, quit {= - if self.eof: - quit.set() - environment().sync_shutdown() - else: - prompt.set(self.buffer) - =} - - reaction(shutdown) {= - self.terminate = True - if self.th and self.th.is_alive(): - self.th.join() - =} -} - -### Reactor for calling agent 1 -reactor LlmA { - state th - state running = False - state out_buffer = "" - - input user_in - physical action done - output answer - - - reaction(user_in) -> done {= - if self.running: - return - self.running = True - query = user_in.value - def agentA(): - try: - self.out_buffer = agent1(query) - finally: - done.schedule(1) - self.th = threading.Thread(target=agentA, daemon=True) - self.th.start() - =} - - reaction(done) -> answer {= - self.running = False - answer.set(self.out_buffer) - =} -} - - -### Reactor for calling agent 2 -reactor LlmB { - state th - state running = False - state out_buffer = "" - input user_in - output answer - - physical action done - - reaction(user_in)->done{= - if self.running: - return - self.running = True - query = user_in.value - def agentB(): - try: - self.out_buffer = agent2(query) - finally: - done.schedule(1) - self.th = threading.Thread(target=agentB, daemon=True) - 
self.th.start() - =} - - reaction(done)->answer{= - self.running = False - answer.set(self.out_buffer) - =} - -} - -###Judge reactor to determine which agent responds first -reactor Judge{ - input query - input llma - input llmb - output ask - - state waiting = False - state logical_base_time = 0 - state physical_base_time = 0 - state winner = "" - - logical action timeout(60 sec) - - reaction(query) -> timeout, ask {= - self.waiting = True - self.winner = "" - self.logical_base_time = lf.time.logical_elapsed() - self.physical_base_time = lf.time.physical_elapsed() - timeout.schedule(0) - print(f"\n\n\nQuery: {query.value}\n") - print("waiting...\n") - ask.set(query.value) - =} - - reaction(llma) {= - if not self.waiting: - return - self.waiting = False - logical_now = lf.time.logical_elapsed() - physical_now = lf.time.physical_elapsed() - logical_ms = int((logical_now - self.logical_base_time) / 1000000) - physical_ms = int((physical_now - self.physical_base_time) / 1000000) - print(f" Winner: LLM-A | logical {logical_ms} ms | physical {physical_ms} ms") - print(f"{llma.value}") - =} - - reaction(llmb) {= - if not self.waiting: - return - self.waiting = False - logical_now = lf.time.logical_elapsed() - physical_now = lf.time.physical_elapsed() - logical_ms = int((logical_now - self.logical_base_time) / 1000000) - physical_ms = int((physical_now - self.physical_base_time) / 1000000) - print(f"Winner: LLM-B | logical {logical_ms} ms | physical {physical_ms} ms") - print(f"{llmb.value}") - =} - - reaction(timeout) {= - if not self.waiting: - return - self.waiting = False - logical_now = lf.time.logical_elapsed() - physical_now = lf.time.physical_elapsed() - logical_ms = int((logical_now - self.logical_base_time) / 1000000) - physical_ms = int((physical_now - self.physical_base_time) / 1000000) - print(f"TIMEOUT (60 s) | logical {logical_ms} ms | physical {physical_ms} ms") - =} -} - - main reactor { llma_response = new LlmA() llmb_response = new LlmB() @@ -194,4 +23,180 @@ main reactor { j.ask -> llmb_response.user_in llma_response.answer -> j.llma llmb_response.answer -> j.llmb -} \ No newline at end of file +} + + +// def keyboard_prompt(reactor, action): + // while True: + // time.sleep(5) + // action.schedule(None) + +// ### Reactor for handling user keyboard input +// reactor KeyboardInput { +// state th +// state terminate = False +// state eof = False +// state buffer = "" + +// physical action line +// output prompt +// output quit + +// reaction(startup) -> line {= +// def reader(): +// while not self.terminate: + +// s = input("Enter the quiz question\n") +// if s == "": +// self.eof = True +// line.schedule(0) +// break +// elif s.lower().strip() == "quit": +// self.eof = True +// line.schedule(0) +// break +// else: +// self.buffer = s +// line.schedule(1) +// self.th = threading.Thread(target=reader, daemon=True) +// self.th.start() +// =} + +// reaction(line) -> prompt, quit {= +// if self.eof: +// quit.set() +// environment().sync_shutdown() +// else: +// prompt.set(self.buffer) +// =} + +// reaction(shutdown) {= +// self.terminate = True +// if self.th and self.th.is_alive(): +// self.th.join() +// =} +// } + +// ### Reactor for calling agent 1 +// reactor LlmA { +// state th +// state running = False +// state out_buffer = "" + +// input user_in +// physical action done +// output answer + + +// reaction(user_in) -> done {= +// if self.running: +// return +// self.running = True +// query = user_in.value +// def agentA(): +// try: +// self.out_buffer = agent1(query) +// 
finally: +// done.schedule(1) +// self.th = threading.Thread(target=agentA, daemon=True) +// self.th.start() +// =} + +// reaction(done) -> answer {= +// self.running = False +// answer.set(self.out_buffer) +// =} +// } + + +// ### Reactor for calling agent 2 +// reactor LlmB { +// state th +// state running = False +// state out_buffer = "" +// input user_in +// output answer + +// physical action done + +// reaction(user_in)->done{= +// if self.running: +// return +// self.running = True +// query = user_in.value +// def agentB(): +// try: +// self.out_buffer = agent2(query) +// finally: +// done.schedule(1) +// self.th = threading.Thread(target=agentB, daemon=True) +// self.th.start() +// =} + +// reaction(done)->answer{= +// self.running = False +// answer.set(self.out_buffer) +// =} + +// } + +// ###Judge reactor to determine which agent responds first +// reactor Judge{ +// input query +// input llma +// input llmb +// output ask + +// state waiting = False +// state logical_base_time = 0 +// state physical_base_time = 0 +// state winner = "" + +// logical action timeout(60 sec) + +// reaction(query) -> timeout, ask {= +// self.waiting = True +// self.winner = "" +// self.logical_base_time = lf.time.logical_elapsed() +// self.physical_base_time = lf.time.physical_elapsed() +// timeout.schedule(0) +// print(f"\n\n\nQuery: {query.value}\n") +// print("waiting...\n") +// ask.set(query.value) +// =} + +// reaction(llma) {= +// if not self.waiting: +// return +// self.waiting = False +// logical_now = lf.time.logical_elapsed() +// physical_now = lf.time.physical_elapsed() +// logical_ms = int((logical_now - self.logical_base_time) / 1000000) +// physical_ms = int((physical_now - self.physical_base_time) / 1000000) +// print(f" Winner: LLM-A | logical {logical_ms} ms | physical {physical_ms} ms") +// print(f"{llma.value}") +// =} + +// reaction(llmb) {= +// if not self.waiting: +// return +// self.waiting = False +// logical_now = lf.time.logical_elapsed() +// physical_now = lf.time.physical_elapsed() +// logical_ms = int((logical_now - self.logical_base_time) / 1000000) +// physical_ms = int((physical_now - self.physical_base_time) / 1000000) +// print(f"Winner: LLM-B | logical {logical_ms} ms | physical {physical_ms} ms") +// print(f"{llmb.value}") +// =} + +// reaction(timeout) {= +// if not self.waiting: +// return +// self.waiting = False +// logical_now = lf.time.logical_elapsed() +// physical_now = lf.time.physical_elapsed() +// logical_ms = int((logical_now - self.logical_base_time) / 1000000) +// physical_ms = int((physical_now - self.physical_base_time) / 1000000) +// print(f"TIMEOUT (60 s) | logical {logical_ms} ms | physical {physical_ms} ms") +// =} +// } From 632dc8eda58f7fd244305353d6898896277733a6 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Tue, 23 Sep 2025 16:37:59 -0700 Subject: [PATCH 09/17] Adding the git ignore file --- .gitignore | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..eed972c --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +llm/fed-gen/ +llm/src-gen/ +llm/include/ +llm/bin +**__pycache__** +llm/=** \ No newline at end of file From 6c8117de13058b11e88771b69dc9bee887b8a335 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Thu, 25 Sep 2025 11:58:32 -0700 Subject: [PATCH 10/17] Fixed the issue for the judge federate to receive the signal that model is loaded --- llm/src/llm_base_class_federate.lf | 140 ++++------------------------- 
llm/src/llm_game_federated.lf | 7 +- 2 files changed, 19 insertions(+), 128 deletions(-) diff --git a/llm/src/llm_base_class_federate.lf b/llm/src/llm_base_class_federate.lf index c5638d8..235b703 100644 --- a/llm/src/llm_base_class_federate.lf +++ b/llm/src/llm_base_class_federate.lf @@ -1,60 +1,5 @@ target Python -### Reactor for handling user keyboard input - -// reactor KeyboardInput { -// state th -// state terminate = False -// state eof = False -// state buffer = "" - -// physical action line -// output prompt -// output quit - -// reaction(startup) -> line {= -// import sys -// import threading -// import time - -// def reader(): -// while not self.terminate: -// s = input("Enter the quiz question\n") -// if s == "": -// self.eof = True -// try: line.schedule(0) -// except Exception as e: print("[keyboard] schedule EOF failed:", e, flush=True) -// break -// elif s.lower().strip() == "quit": -// self.eof = True -// try: line.schedule(0) -// except Exception as e: print("[keyboard] schedule quit failed:", e, flush=True) -// break -// else: -// self.buffer = s -// try: line.schedule(1) # small logical hop -// except Exception as e: -// print("[keyboard] schedule failed:", e, flush=True) -// break -// self.th = threading.Thread(target=reader, daemon=True) -// self.th.start() -// =} - -// reaction(line) -> prompt, quit {= -// if self.eof: -// quit.set() -// environment().sync_shutdown() -// else: -// prompt.set(self.buffer) -// =} - -// reaction(shutdown) {= -// self.terminate = True -// if self.th and self.th.is_alive(): -// self.th.join() -// =} -// } - ### Reactor for calling agent 1 reactor LlmA { state th @@ -64,20 +9,22 @@ reactor LlmA { input user_in physical action done - physical action notify_ready + physical action notify_ready output answer output ready_out - reaction(startup) {= - import os, sys, importlib.util, threading + reaction(startup) -> notify_ready {= + import os, sys, importlib.util, threading, traceback + act = notify_ready def _load(): try: here = os.path.dirname(__file__) if here not in sys.path: sys.path.insert(0, here) from llm_a import agent1 - notify_ready.schedule(0) + act.schedule(1) except Exception as e: print("[LlmA] Preload failed:", e, flush=True) + traceback.print_exc() threading.Thread(target=_load, daemon=True).start() =} @@ -118,20 +65,22 @@ reactor LlmB { input user_in physical action done - physical action notify_ready + physical action notify_ready output answer output ready_out - reaction(startup) {= - import os, sys, importlib.util, threading + reaction(startup) -> notify_ready {= + import os, sys, importlib.util, threading, traceback + act = notify_ready def _load(): try: here = os.path.dirname(__file__) if here not in sys.path: sys.path.insert(0, here) from llm_b import agent2 - notify_ready.schedule(0) + act.schedule(1) except Exception as e: print("[LlmB] Preload failed:", e, flush=True) + traceback.print_exc() threading.Thread(target=_load, daemon=True).start() =} @@ -163,65 +112,6 @@ reactor LlmB { =} } // ###Judge reactor to determine which agent responds first -// reactor Judge{ -// input query -// input llma -// input llmb -// output ask - -// state waiting = False -// state logical_base_time = 0 -// state physical_base_time = 0 -// state winner = "" - -// logical action timeout(60 sec) - -// reaction(query) -> timeout, ask {= -// self.waiting = True -// self.winner = "" -// self.logical_base_time = lf.time.logical_elapsed() -// self.physical_base_time = lf.time.physical_elapsed() -// timeout.schedule(0) -// print(f"\n\n\nQuery: 
{query.value}\n") -// print("waiting...\n") -// ask.set(query.value) -// =} - -// reaction(llma) {= -// if not self.waiting: -// return -// self.waiting = False -// logical_now = lf.time.logical_elapsed() -// physical_now = lf.time.physical_elapsed() -// logical_ms = int((logical_now - self.logical_base_time) / 1000000) -// physical_ms = int((physical_now - self.physical_base_time) / 1000000) -// print(f" Winner: LLM-A | logical {logical_ms} ms | physical {physical_ms} ms") -// print(f"{llma.value}") -// =} - -// reaction(llmb) {= -// if not self.waiting: -// return -// self.waiting = False -// logical_now = lf.time.logical_elapsed() -// physical_now = lf.time.physical_elapsed() -// logical_ms = int((logical_now - self.logical_base_time) / 1000000) -// physical_ms = int((physical_now - self.physical_base_time) / 1000000) -// print(f"Winner: LLM-B | logical {logical_ms} ms | physical {physical_ms} ms") -// print(f"{llmb.value}") -// =} - -// reaction(timeout) {= -// if not self.waiting: -// return -// self.waiting = False -// logical_now = lf.time.logical_elapsed() -// physical_now = lf.time.physical_elapsed() -// logical_ms = int((logical_now - self.logical_base_time) / 1000000) -// physical_ms = int((physical_now - self.physical_base_time) / 1000000) -// print(f"TIMEOUT (60 s) | logical {logical_ms} ms | physical {physical_ms} ms") -// =} -// } reactor Judge { state th @@ -245,10 +135,10 @@ reactor Judge { output quit reaction(startup) {= - print("[Judge] Waiting for models to load...", flush=True) + print("[Judge] Waiting for models to load", flush=True) =} - reaction(ready_a) {= + reaction(ready_a)->line {= self.a_ready = True if self.a_ready and self.b_ready and not self.reader_started: import sys, threading @@ -272,7 +162,7 @@ reactor Judge { self.th.start() =} - reaction(ready_b) {= + reaction(ready_b)->line {= self.b_ready = True if self.a_ready and self.b_ready and not self.reader_started: import sys, threading diff --git a/llm/src/llm_game_federated.lf b/llm/src/llm_game_federated.lf index 0a4dcd0..5111854 100644 --- a/llm/src/llm_game_federated.lf +++ b/llm/src/llm_game_federated.lf @@ -1,9 +1,10 @@ ### llm.py file needs to be in the same directory target Python { keepalive: true, files: ["llm_a.py", "llm_b.py"] } // import KeyboardInput from "llm_base_class_federate.lf" -import LlmA from "llm_base_class_federate.lf" -import LlmB from "llm_base_class_federate.lf" -import Judge from "llm_base_class_federate.lf" +// import LlmA from "llm_base_class_federate.lf" +// import LlmB from "llm_base_class_federate.lf" +// import Judge from "llm_base_class_federate.lf" +import LlmA, LlmB, Judge from "llm_base_class_federate.lf" preamble {= import threading From 2f1a884b43f59d6cc7af7819171a3a90b4452856 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Thu, 25 Sep 2025 16:09:09 -0700 Subject: [PATCH 11/17] Added the version of files for running on different devices --- llm/src/llm_a.py | 10 +-- llm/src/llm_b.py | 10 +-- llm/src/llm_b_m2.py | 102 +++++++++++++++++++++++++++++ llm/src/llm_base_class_federate.lf | 8 +-- llm/src/llm_game_federated.lf | 26 ++------ 5 files changed, 123 insertions(+), 33 deletions(-) create mode 100644 llm/src/llm_b_m2.py diff --git a/llm/src/llm_a.py b/llm/src/llm_a.py index df5faf3..15411cd 100644 --- a/llm/src/llm_a.py +++ b/llm/src/llm_a.py @@ -1,4 +1,4 @@ -# llm_a.py — Agent 1 (7B) +# llm_a.py import torch from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig @@ -6,10 +6,10 @@ # <<< put your token here >>> hf_auth = "add token here 
" -# Model to be chosen to act as an agent +# Model model_id = "meta-llama/Llama-2-7b-chat-hf" -# Require GPU (you said it must work only on GPU) +# Require GPU has_cuda = torch.cuda.is_available() if not has_cuda: raise RuntimeError("CUDA GPU required for this configuration.") @@ -19,7 +19,7 @@ bnb_config = None if has_cuda: try: - import bitsandbytes as bnb # noqa: F401 + import bitsandbytes as bnb bnb_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="nf4", @@ -63,7 +63,7 @@ def postprocess(text: str) -> str: break return t.strip().strip(":").strip() -# Agent 1 entrypoint +# Agent 1 def agent1(q: str) -> str: prompt = f"You are a concise Q&A assistant.\n\n{q}\n" inputs = tokenizer(prompt, return_tensors="pt") diff --git a/llm/src/llm_b.py b/llm/src/llm_b.py index 513d6c2..6acb7d9 100644 --- a/llm/src/llm_b.py +++ b/llm/src/llm_b.py @@ -1,5 +1,5 @@ -# llm_b.py — Agent 2 (70B) +# llm_b.py import torch from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig @@ -7,10 +7,10 @@ # <<< put your token here >>> hf_auth = "add token here" -# Model to be chosen to act as an agent +# Model model_id_2 = "meta-llama/Llama-2-70b-chat-hf" -# Require GPU (GPU-only) +# Require GPU has_cuda = torch.cuda.is_available() if not has_cuda: raise RuntimeError("CUDA GPU required for this configuration.") @@ -20,7 +20,7 @@ bnb_config = None if has_cuda: try: - import bitsandbytes as bnb # noqa: F401 + import bitsandbytes as bnb bnb_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="nf4", @@ -64,7 +64,7 @@ def postprocess(text: str) -> str: break return t.strip().strip(":").strip() -# Agent 2 entrypoint +# Agent 2 def agent2(q: str) -> str: prompt = f"You are a concise Q&A assistant.\n\n{q}\n" inputs = tokenizer_2(prompt, return_tensors="pt") diff --git a/llm/src/llm_b_m2.py b/llm/src/llm_b_m2.py new file mode 100644 index 0000000..45bad45 --- /dev/null +++ b/llm/src/llm_b_m2.py @@ -0,0 +1,102 @@ +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + +hf_auth = "add your token here" + +model_id_2 = "google/gemma-3-270m" + +has_cuda = torch.cuda.is_available() +has_mps = torch.backends.mps.is_available() + +if has_cuda: + device = torch.device("cuda") + compute_dtype = torch.float16 +elif has_mps: + device = torch.device("mps") + compute_dtype = torch.float32 +else: + device = torch.device("cpu") + compute_dtype = torch.float32 + + +common = dict( + low_cpu_mem_usage=True, + attn_implementation="eager", +) + +#4-bit on CUDA if the device has it +if has_cuda: + try: + import bitsandbytes as bnb + common["quantization_config"] = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=compute_dtype, + ) + common["device_map"] = "auto" + except Exception: + print("[WARN] bitsandbytes not available; using full-precision fp16 on CUDA.", flush=True) + common["device_map"] = "auto" +else: + common["device_map"] = None + +# Tokenizer +tok_kwargs = dict(use_fast=True) +if hf_auth: + tok_kwargs["token"] = hf_auth +tokenizer_2 = AutoTokenizer.from_pretrained(model_id_2, **tok_kwargs) +if tokenizer_2.pad_token_id is None: + tokenizer_2.pad_token = tokenizer_2.eos_token + +# Model +mp_kwargs = dict(dtype=compute_dtype, **common) +if hf_auth: + mp_kwargs["token"] = hf_auth + +model_2 = AutoModelForCausalLM.from_pretrained(model_id_2, **mp_kwargs) +if not has_cuda: + model_2.to(device) +model_2.eval() + +# Greedy decoding +GEN_B = dict( + max_new_tokens=32, 
+ do_sample=True, + eos_token_id=tokenizer_2.eos_token_id, + pad_token_id=tokenizer_2.pad_token_id, +) + +def postprocess(text: str) -> str: + t = text.strip() + for sep in ["\n", ". ", " "]: + i = t.find(sep) + if i > 0: + t = t[:i] + break + return t.strip().strip(":").strip() + +def agent2(q: str) -> str: + prompt = f"You are a concise Q&A assistant.\n\n{q}\n" + inputs = tokenizer_2(prompt, return_tensors="pt") + + if has_cuda: + inputs = {k: v.to("cuda") for k, v in inputs.items()} + elif has_mps: + inputs = {k: v.to("mps") for k, v in inputs.items()} + else: + inputs = {k: v.to("cpu") for k, v in inputs.items()} + + with torch.inference_mode(): + out = model_2.generate(**inputs, **GEN_B) + + prompt_len = inputs["input_ids"].shape[1] + result = tokenizer_2.decode(out[0][prompt_len:], skip_special_tokens=True) + print(result) + return postprocess(result) + +# def main(): +# agent2("what is AI?") + +# if __name__ == "__main__": +# main() \ No newline at end of file diff --git a/llm/src/llm_base_class_federate.lf b/llm/src/llm_base_class_federate.lf index 235b703..194d49a 100644 --- a/llm/src/llm_base_class_federate.lf +++ b/llm/src/llm_base_class_federate.lf @@ -56,7 +56,7 @@ reactor LlmA { =} } -// ### Reactor for calling agent 2 +### Reactor for calling agent 2 reactor LlmB { state th state running = False @@ -76,7 +76,7 @@ reactor LlmB { try: here = os.path.dirname(__file__) if here not in sys.path: sys.path.insert(0, here) - from llm_b import agent2 + from llm_b_m2 import agent2 act.schedule(1) except Exception as e: print("[LlmB] Preload failed:", e, flush=True) @@ -95,7 +95,7 @@ reactor LlmB { if self.running: return self.running = True q = user_in.value - from llm_b import agent2 + from llm_b_m2 import agent2 def agentB(): try: self.out_buffer = agent2(q) @@ -111,7 +111,7 @@ reactor LlmB { answer.set(self.out_buffer) =} } -// ###Judge reactor to determine which agent responds first +###Judge reactor to determine which agent responds first reactor Judge { state th diff --git a/llm/src/llm_game_federated.lf b/llm/src/llm_game_federated.lf index 5111854..d2b0834 100644 --- a/llm/src/llm_game_federated.lf +++ b/llm/src/llm_game_federated.lf @@ -1,33 +1,21 @@ ### llm.py file needs to be in the same directory -target Python { keepalive: true, files: ["llm_a.py", "llm_b.py"] } -// import KeyboardInput from "llm_base_class_federate.lf" -// import LlmA from "llm_base_class_federate.lf" -// import LlmB from "llm_base_class_federate.lf" -// import Judge from "llm_base_class_federate.lf" +target Python { keepalive: true, files: ["llm_a.py", "llm_b_m2.py" ] } #"llm_b.py" + import LlmA, LlmB, Judge from "llm_base_class_federate.lf" preamble {= import threading import time from llm_a import agent1 - from llm_b import agent2 + from llm_b_m2 import agent2 =} federated reactor llm_game_federated at 10.218.100.95 { - // llma_response_f = new LlmA() - // llmb_response_f = new LlmB() - // keyboard_f = new KeyboardInput() - // j_f = new Judge() - - // keyboard_f.prompt -> j_f.query - // j_f.ask -> llma_response_f.user_in - // j_f.ask -> llmb_response_f.user_in - // llma_response_f.answer -> j_f.llma - // llmb_response_f.answer -> j_f.llmb - j = new Judge() - llma = new LlmA() - llmb = new LlmB() + + j = new Judge() + llma = new LlmA() + llmb = new LlmB() j.ask -> llma.user_in j.ask -> llmb.user_in From 1958fbb8aceeff3634d074a8ba8d3452e6e4337d Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Thu, 9 Oct 2025 11:01:23 -0700 Subject: [PATCH 12/17] Adding a python script for llama 3.2 1B for jetson orin 
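
This adds llm_b_jetson.py, which loads meta-llama/Llama-3.2-1B (float16 on
CUDA, float32 otherwise) and exposes an agent2() entry point for the LlmB
federate. A quick standalone check, mirroring the __main__ block at the end
of the new file; it assumes Hugging Face access to the gated model is already
configured (token or cached weights):

    # Run from llm/src. Importing llm_b_jetson loads the Llama 3.2 1B
    # weights at module import time, so the first call is slow.
    from llm_b_jetson import agent2
    print(agent2("What is the capital of Japan?"))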
--- llm/src/llm_b_jetson.py | 52 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 llm/src/llm_b_jetson.py diff --git a/llm/src/llm_b_jetson.py b/llm/src/llm_b_jetson.py new file mode 100644 index 0000000..7dc94fa --- /dev/null +++ b/llm/src/llm_b_jetson.py @@ -0,0 +1,52 @@ +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +hf_auth = "" + +model_id = "meta-llama/Llama-3.2-1B" + +has_cuda = torch.cuda.is_available() +device = torch.device("cuda" if has_cuda else "cpu") +compute_dtype = torch.float16 if has_cuda else torch.float32 + +common = dict( + low_cpu_mem_usage=True, + attn_implementation="eager", +) + +tok_kwargs = dict(use_fast=True) +if hf_auth: + tok_kwargs["token"] = hf_auth + +tokenizer = AutoTokenizer.from_pretrained(model_id, **tok_kwargs) +if tokenizer.pad_token_id is None: + tokenizer.pad_token = tokenizer.eos_token + +mp_kwargs = dict(torch_dtype=compute_dtype, **common) +if hf_auth: + mp_kwargs["token"] = hf_auth + +model = AutoModelForCausalLM.from_pretrained(model_id, **mp_kwargs) +model.to(device) +model.eval() + +GEN = dict( + max_new_tokens=64, + do_sample=True, + temperature=0.7, + top_p=0.95, + eos_token_id=tokenizer.eos_token_id, + pad_token_id=tokenizer.pad_token_id, +) + +def agent2(q: str) -> str: + prompt = f"You are a concise Q&A assistant.\n\n{q}\n" + inputs = tokenizer(prompt, return_tensors="pt").to(device) + with torch.inference_mode(): + out = model.generate(**inputs, **GEN) + gen = out[0, inputs["input_ids"].shape[1]:] + return tokenizer.decode(gen, skip_special_tokens=True).strip() + +if __name__ == "__main__": + question = "What is the capital of Japan?" + print(agent2(question)) From 60f642d11f09fc6f49a9c3fce21401f743f07674 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Thu, 9 Oct 2025 12:27:06 -0700 Subject: [PATCH 13/17] commented the code for testing --- llm/src/llm_b_jetson.py | 87 ++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 41 deletions(-) diff --git a/llm/src/llm_b_jetson.py b/llm/src/llm_b_jetson.py index 7dc94fa..40461ed 100644 --- a/llm/src/llm_b_jetson.py +++ b/llm/src/llm_b_jetson.py @@ -1,52 +1,57 @@ -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer +# import torch +# from transformers import AutoModelForCausalLM, AutoTokenizer -hf_auth = "" +# hf_auth = "" -model_id = "meta-llama/Llama-3.2-1B" +# model_id = "meta-llama/Llama-3.2-1B" -has_cuda = torch.cuda.is_available() -device = torch.device("cuda" if has_cuda else "cpu") -compute_dtype = torch.float16 if has_cuda else torch.float32 +# has_cuda = torch.cuda.is_available() +# device = torch.device("cuda" if has_cuda else "cpu") +# compute_dtype = torch.float16 if has_cuda else torch.float32 -common = dict( - low_cpu_mem_usage=True, - attn_implementation="eager", -) +# common = dict( +# low_cpu_mem_usage=True, +# attn_implementation="eager", +# ) -tok_kwargs = dict(use_fast=True) -if hf_auth: - tok_kwargs["token"] = hf_auth +# tok_kwargs = dict(use_fast=True) +# if hf_auth: +# tok_kwargs["token"] = hf_auth -tokenizer = AutoTokenizer.from_pretrained(model_id, **tok_kwargs) -if tokenizer.pad_token_id is None: - tokenizer.pad_token = tokenizer.eos_token +# tokenizer = AutoTokenizer.from_pretrained(model_id, **tok_kwargs) +# if tokenizer.pad_token_id is None: +# tokenizer.pad_token = tokenizer.eos_token -mp_kwargs = dict(torch_dtype=compute_dtype, **common) -if hf_auth: - mp_kwargs["token"] = hf_auth +# mp_kwargs = dict(torch_dtype=compute_dtype, **common) +# if 
hf_auth: +# mp_kwargs["token"] = hf_auth -model = AutoModelForCausalLM.from_pretrained(model_id, **mp_kwargs) -model.to(device) -model.eval() +# model = AutoModelForCausalLM.from_pretrained(model_id, **mp_kwargs) +# model.to(device) +# model.eval() + +# GEN = dict( +# max_new_tokens=64, +# do_sample=True, +# temperature=0.7, +# top_p=0.95, +# eos_token_id=tokenizer.eos_token_id, +# pad_token_id=tokenizer.pad_token_id, +# ) + +# def agent2(q: str) -> str: +# prompt = f"You are a concise Q&A assistant.\n\n{q}\n" +# inputs = tokenizer(prompt, return_tensors="pt").to(device) +# with torch.inference_mode(): +# out = model.generate(**inputs, **GEN) +# gen = out[0, inputs["input_ids"].shape[1]:] +# return tokenizer.decode(gen, skip_special_tokens=True).strip() + +# if __name__ == "__main__": +# question = "What is the capital of Japan?" +# print(agent2(question)) -GEN = dict( - max_new_tokens=64, - do_sample=True, - temperature=0.7, - top_p=0.95, - eos_token_id=tokenizer.eos_token_id, - pad_token_id=tokenizer.pad_token_id, -) def agent2(q: str) -> str: - prompt = f"You are a concise Q&A assistant.\n\n{q}\n" - inputs = tokenizer(prompt, return_tensors="pt").to(device) - with torch.inference_mode(): - out = model.generate(**inputs, **GEN) - gen = out[0, inputs["input_ids"].shape[1]:] - return tokenizer.decode(gen, skip_special_tokens=True).strip() - -if __name__ == "__main__": - question = "What is the capital of Japan?" - print(agent2(question)) + + return "Hello this is jetson" \ No newline at end of file From 6a26cab3fc73cff14795fee19cdd107777949023 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Thu, 9 Oct 2025 12:29:42 -0700 Subject: [PATCH 14/17] Testing Jetson --- llm/src/llm_game_federated.lf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/src/llm_game_federated.lf b/llm/src/llm_game_federated.lf index d2b0834..863abca 100644 --- a/llm/src/llm_game_federated.lf +++ b/llm/src/llm_game_federated.lf @@ -1,5 +1,5 @@ ### llm.py file needs to be in the same directory -target Python { keepalive: true, files: ["llm_a.py", "llm_b_m2.py" ] } #"llm_b.py" +target Python { keepalive: true, files: ["llm_a.py", "llm_b_jetson.py" ] } #"llm_b.py" import LlmA, LlmB, Judge from "llm_base_class_federate.lf" From aef0ac957c855200119bbab82a2e7c8dce735c59 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Thu, 9 Oct 2025 12:43:07 -0700 Subject: [PATCH 15/17] Changed the file names in base class --- llm/src/llm_base_class_federate.lf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm/src/llm_base_class_federate.lf b/llm/src/llm_base_class_federate.lf index 194d49a..14412a2 100644 --- a/llm/src/llm_base_class_federate.lf +++ b/llm/src/llm_base_class_federate.lf @@ -76,7 +76,7 @@ reactor LlmB { try: here = os.path.dirname(__file__) if here not in sys.path: sys.path.insert(0, here) - from llm_b_m2 import agent2 + from llm_b_jetson import agent2 act.schedule(1) except Exception as e: print("[LlmB] Preload failed:", e, flush=True) @@ -95,7 +95,7 @@ reactor LlmB { if self.running: return self.running = True q = user_in.value - from llm_b_m2 import agent2 + from llm_b_jetson import agent2 def agentB(): try: self.out_buffer = agent2(q) From c4c635372ea379070969773de15f8d95a1ac3e16 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Thu, 9 Oct 2025 12:49:56 -0700 Subject: [PATCH 16/17] Changed the RTI to jetson --- llm/src/llm_game_federated.lf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/src/llm_game_federated.lf b/llm/src/llm_game_federated.lf index 
863abca..3a9f677 100644
--- a/llm/src/llm_game_federated.lf
+++ b/llm/src/llm_game_federated.lf
@@ -11,7 +11,7 @@ preamble {=
 
 =}
 
-federated reactor llm_game_federated at 10.218.100.95 {
+federated reactor llm_game_federated at 10.155.214.175 {
 
   j = new Judge()
   llma = new LlmA()

From 9d503d53028c700ac9a00f1ec33b851a424fb407 Mon Sep 17 00:00:00 2001
From: Deeksha-20-99 
Date: Thu, 9 Oct 2025 14:12:41 -0700
Subject: [PATCH 17/17] Corrected the IP address for the Jetson Orin

---
 llm/src/llm_game_federated.lf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm/src/llm_game_federated.lf b/llm/src/llm_game_federated.lf
index 3a9f677..d2b745c 100644
--- a/llm/src/llm_game_federated.lf
+++ b/llm/src/llm_game_federated.lf
@@ -11,7 +11,7 @@ preamble {=
 
 =}
 
-federated reactor llm_game_federated at 10.155.214.175 {
+federated reactor llm_game_federated at 10.155.241.175 {
 
   j = new Judge()
   llma = new LlmA()
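
Note: in these later patches, switching the LlmB federate between llm_b.py,
llm_b_m2.py, and llm_b_jetson.py is done by editing the import inside
llm_base_class_federate.lf and the files list in llm_game_federated.lf. Below
is a minimal sketch of one alternative, assuming the backend scripts stay
importable from the source directory; the LLM_B_BACKEND environment variable
and the load_agent2() helper are illustrative only, not part of these patches:

    # Illustrative only: pick the agent-2 backend module at run time instead
    # of editing the reactor each time. Defaults to the Jetson stub from
    # patch 13.
    import importlib
    import os

    def load_agent2():
        # Return the agent2() callable from the module named by LLM_B_BACKEND.
        backend = os.environ.get("LLM_B_BACKEND", "llm_b_jetson")
        return importlib.import_module(backend).agent2

    if __name__ == "__main__":
        agent2 = load_agent2()
        print(agent2("What is the capital of Japan?"))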