From e29fac3b9a01da16fa41fbdaf8e17e88fda0df5b Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Tue, 16 Sep 2025 15:44:27 -0700 Subject: [PATCH 01/17] Add README for LF LLM demo --- llm/README.md | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 llm/README.md diff --git a/llm/README.md b/llm/README.md new file mode 100644 index 0000000..3b7b658 --- /dev/null +++ b/llm/README.md @@ -0,0 +1,2 @@ +# LLM Demo + From 053b8d713906ee55d9535aab92ae4fdc34285329 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Tue, 16 Sep 2025 16:33:49 -0700 Subject: [PATCH 02/17] Adding work in progress code files for an llm example. Files: llm.py, which calls the llama-2-7b-chat model for simple question and answer, agent_llm.lf, which takes in the user input calls llm agent 1 and llm agent 2. --- llm/src/agent_llm.lf | 44 +++++++++++++++++++++ llm/src/llm.py | 94 ++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 138 insertions(+) create mode 100644 llm/src/agent_llm.lf create mode 100644 llm/src/llm.py diff --git a/llm/src/agent_llm.lf b/llm/src/agent_llm.lf new file mode 100644 index 0000000..e5c7f90 --- /dev/null +++ b/llm/src/agent_llm.lf @@ -0,0 +1,44 @@ +target Python{ + files: llm_textgeneration.py +}; + +preamble{= + from llm import agent1, agent2 + +=} + +reactor llm_a{ + + output user_in + reaction (startup)-> user_in{= + txt = input("Hey there!") + user_in.set(txt) + =} +} + +reactor llm_b{ + input llm_a_in + output llm_b_out + reaction (llm_a_in)-> llm_b_out{= + llm_b_out.set(llm_a_in.value) + =} +} + +main reactor{ + state response + user_response = new llm_a() + llm_response = new llm_b() + // call llm a to respond to user + reaction (user_response.user_in)->llm_response.llm_a_in{= + + response = agent1(user_response.user_in.value) + llm_response.llm_a_in.set(response) + =} + + //llm b to respond to what llm a generated + reaction (llm_response.llm_b_out){= + # llm_response.llm_a_in = response + agent2(llm_response.llm_b_out.value) + =} + +} \ No newline at end of file diff --git a/llm/src/llm.py b/llm/src/llm.py new file mode 100644 index 0000000..63b6234 --- /dev/null +++ b/llm/src/llm.py @@ -0,0 +1,94 @@ +### Import Libraries +import transformers +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig +from torch import cuda, bfloat16 + +### Add Your hugging face token here +hf_auth = "Add here" + +### Model to be chosen to act as an agent +model_id = "meta-llama/Llama-2-7b-chat-hf" + +### To check if there is GPU +has_cuda = torch.cuda.is_available() + +### To convert the model into 4bit quantization +bnb_config = None +if has_cuda: + try: + import bitsandbytes as bnb + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=torch.bfloat16, + ) + except Exception: + bnb_config = None + +### calling pre-trained tokenizer +tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_auth, use_fast=True) + + +### calling pre-trained model +model = AutoModelForCausalLM.from_pretrained( + model_id, + token=hf_auth, + device_map="auto" if has_cuda else None, + torch_dtype=torch.bfloat16 if has_cuda else torch.float32, + quantization_config=bnb_config, + low_cpu_mem_usage=True, +) + +model.eval() + +### agent 1 +def agent1(a): + user_query = a + + prompt = f"You are a helpful assistant.\n\n{user_query}\n" + + inputs = tokenizer(prompt, return_tensors="pt") + + if has_cuda: + inputs = {k: v.to("cuda") for k, v in inputs.items()} + + with torch.no_grad(): + 
outputs = model.generate( + **inputs, + max_new_tokens=100, + do_sample=True, + temperature=0.3, + ) + + gen_tokens = outputs[0] + prompt_len = inputs["input_ids"].shape[1] + response = tokenizer.decode(gen_tokens[prompt_len:], skip_special_tokens=True) + + print("LLM A response:", response) + return response + +### agent 2 +def agent2(b): + user_query = b + + prompt = f"Just summarize what the agent1 said: \n\n{user_query}\n\n" + + inputs = tokenizer(prompt, return_tensors="pt") + + if has_cuda: + inputs = {k: v.to("cuda") for k, v in inputs.items()} + + with torch.no_grad(): + outputs = model.generate( + **inputs, + max_new_tokens=100, + do_sample=True, + temperature=0.3, + ) + + gen_tokens = outputs[0] + prompt_len = inputs["input_ids"].shape[1] + response = tokenizer.decode(gen_tokens[prompt_len:], skip_special_tokens=True) + print("LLM B response:", response) \ No newline at end of file From 473c81f2978a7465e780c9e914c4ca869655d173 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Tue, 16 Sep 2025 16:37:32 -0700 Subject: [PATCH 03/17] changed the file name of the file to be included in agent_llm.lf --- llm/src/agent_llm.lf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/src/agent_llm.lf b/llm/src/agent_llm.lf index e5c7f90..35176fd 100644 --- a/llm/src/agent_llm.lf +++ b/llm/src/agent_llm.lf @@ -1,5 +1,5 @@ target Python{ - files: llm_textgeneration.py + files: llm.py }; preamble{= From 46522a14c9020089c117f67f39c5635c2e720bf0 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Fri, 19 Sep 2025 11:00:19 -0700 Subject: [PATCH 04/17] Added a quiz game. It is a game between two LLM models answering user questions and the model to respond the fastest wins --- llm/src/agent_llm.lf | 8 +- llm/src/llm.py | 114 +++++++++++----------- llm/src/llm_quiz_game.lf | 197 +++++++++++++++++++++++++++++++++++++++ 3 files changed, 257 insertions(+), 62 deletions(-) create mode 100644 llm/src/llm_quiz_game.lf diff --git a/llm/src/agent_llm.lf b/llm/src/agent_llm.lf index 35176fd..5b5ab8e 100644 --- a/llm/src/agent_llm.lf +++ b/llm/src/agent_llm.lf @@ -7,7 +7,7 @@ preamble{= =} -reactor llm_a{ +reactor LLM_a{ output user_in reaction (startup)-> user_in{= @@ -16,7 +16,7 @@ reactor llm_a{ =} } -reactor llm_b{ +reactor LLM_b{ input llm_a_in output llm_b_out reaction (llm_a_in)-> llm_b_out{= @@ -26,8 +26,8 @@ reactor llm_b{ main reactor{ state response - user_response = new llm_a() - llm_response = new llm_b() + user_response = new LLM_a() + llm_response = new LLM_b() // call llm a to respond to user reaction (user_response.user_in)->llm_response.llm_a_in{= diff --git a/llm/src/llm.py b/llm/src/llm.py index 63b6234..93322f1 100644 --- a/llm/src/llm.py +++ b/llm/src/llm.py @@ -5,16 +5,19 @@ from torch import cuda, bfloat16 ### Add Your hugging face token here -hf_auth = "Add here" +hf_auth = "Add your token here" ### Model to be chosen to act as an agent model_id = "meta-llama/Llama-2-7b-chat-hf" +model_id_2 = "meta-llama/Llama-2-70b-chat-hf" -### To check if there is GPU +### To check if there is GPU and convert it into float 16 has_cuda = torch.cuda.is_available() +dtype = torch.bfloat16 if has_cuda else torch.float32 ### To convert the model into 4bit quantization bnb_config = None +### if there is cuda then the model is converted to 4bit quantization if has_cuda: try: import bitsandbytes as bnb @@ -22,73 +25,68 @@ load_in_4bit=True, bnb_4bit_quant_type="nf4", bnb_4bit_use_double_quant=True, - bnb_4bit_compute_dtype=torch.bfloat16, + bnb_4bit_compute_dtype=dtype, ) except Exception: 
bnb_config = None ### calling pre-trained tokenizer -tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_auth, use_fast=True) - - -### calling pre-trained model -model = AutoModelForCausalLM.from_pretrained( - model_id, - token=hf_auth, +tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_auth, use_fast=True) +tokenizer_2 = AutoTokenizer.from_pretrained(model_id_2, token=hf_auth, use_fast=True) +for tok in (tokenizer, tokenizer_2): + if tok.pad_token_id is None: + tok.pad_token = tok.eos_token + +### since both the models have same device map and using 4bit quantization for both +common = dict( device_map="auto" if has_cuda else None, - torch_dtype=torch.bfloat16 if has_cuda else torch.float32, - quantization_config=bnb_config, + dtype=dtype, low_cpu_mem_usage=True, ) +if bnb_config is not None: + common["quantization_config"] = bnb_config -model.eval() - -### agent 1 -def agent1(a): - user_query = a - - prompt = f"You are a helpful assistant.\n\n{user_query}\n" - +### calling pre-trained model +model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_auth, **common) +model_2 = AutoModelForCausalLM.from_pretrained(model_id_2, token=hf_auth, **common) +model.eval(); model_2.eval() + + + +### arguments for both the models +GEN_A = dict(max_new_tokens=24, do_sample=False, temperature=0.1, + eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id) +GEN_B = dict(max_new_tokens=24, do_sample=False, temperature=0.1, + eos_token_id=tokenizer_2.eos_token_id, pad_token_id=tokenizer_2.pad_token_id) + +###to resturn only one line answers +def postprocess(text: str) -> str: + t = text.strip() + for sep in ["\n", ". ", " "]: + idx = t.find(sep) + if idx > 0: + t = t[:idx] + break + return t.strip().strip(":").strip() + +###Calling agent1 from .lf code +def agent1(q: str) -> str: + prompt = f"You are a concise Q&A assistant.\n\n{q}\n" inputs = tokenizer(prompt, return_tensors="pt") - - if has_cuda: - inputs = {k: v.to("cuda") for k, v in inputs.items()} - + if has_cuda: inputs = {k: v.to("cuda") for k, v in inputs.items()} with torch.no_grad(): - outputs = model.generate( - **inputs, - max_new_tokens=100, - do_sample=True, - temperature=0.3, - ) - - gen_tokens = outputs[0] + out = model.generate(**inputs, **GEN_A) prompt_len = inputs["input_ids"].shape[1] - response = tokenizer.decode(gen_tokens[prompt_len:], skip_special_tokens=True) - - print("LLM A response:", response) - return response - -### agent 2 -def agent2(b): - user_query = b - - prompt = f"Just summarize what the agent1 said: \n\n{user_query}\n\n" - - inputs = tokenizer(prompt, return_tensors="pt") - - if has_cuda: - inputs = {k: v.to("cuda") for k, v in inputs.items()} - + result = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True) + return postprocess(result) + +###Calling agent2 from .lf code +def agent2(q: str) -> str: + prompt = f"You are a concise Q&A assistant.\n\n{q}\n" + inputs = tokenizer_2(prompt, return_tensors="pt") + if has_cuda: inputs = {k: v.to("cuda") for k, v in inputs.items()} with torch.no_grad(): - outputs = model.generate( - **inputs, - max_new_tokens=100, - do_sample=True, - temperature=0.3, - ) - - gen_tokens = outputs[0] + out = model_2.generate(**inputs, **GEN_B) prompt_len = inputs["input_ids"].shape[1] - response = tokenizer.decode(gen_tokens[prompt_len:], skip_special_tokens=True) - print("LLM B response:", response) \ No newline at end of file + result = tokenizer_2.decode(out[0][prompt_len:], skip_special_tokens=True) + return postprocess(result) \ No 
newline at end of file diff --git a/llm/src/llm_quiz_game.lf b/llm/src/llm_quiz_game.lf new file mode 100644 index 0000000..85e89a4 --- /dev/null +++ b/llm/src/llm_quiz_game.lf @@ -0,0 +1,197 @@ +### llm.py file needs to be in the same directory +target Python { keepalive: true, files: ["llm.py"] } + +preamble {= + import threading + import time + from llm import agent1, agent2 # your Python functions + + def keyboard_prompt(reactor, action): + while True: + time.sleep(5) + action.schedule(None) +=} + +### Reactor for handling user keyboard input +reactor KeyboardInput { + state th + state terminate = False + state eof = False + state buffer = "" + + physical action line + output prompt + output quit + + reaction(startup) -> line {= + def reader(): + while not self.terminate: + + s = input("Enter the quiz question\n") + if s == "": + self.eof = True + line.schedule(0) + break + elif s.lower().strip() == "quit": + self.eof = True + line.schedule(0) + break + else: + self.buffer = s + line.schedule(1) + self.th = threading.Thread(target=reader, daemon=True) + self.th.start() + =} + + reaction(line) -> prompt, quit {= + if self.eof: + quit.set() + environment().sync_shutdown() + else: + prompt.set(self.buffer) + =} + + reaction(shutdown) {= + self.terminate = True + if self.th and self.th.is_alive(): + self.th.join() + =} +} + +### Reactor for calling agent 1 +reactor LlmA { + state th + state running = False + state out_buffer = "" + + input user_in + physical action done + output answer + + + reaction(user_in) -> done {= + if self.running: + return + self.running = True + query = user_in.value + def agentA(): + try: + self.out_buffer = agent1(query) + finally: + done.schedule(1) + self.th = threading.Thread(target=agentA, daemon=True) + self.th.start() + =} + + reaction(done) -> answer {= + self.running = False + answer.set(self.out_buffer) + =} +} + + +### Reactor for calling agent 2 +reactor LlmB { + state th + state running = False + state out_buffer = "" + input user_in + output answer + + physical action done + + reaction(user_in)->done{= + if self.running: + return + self.running = True + query = user_in.value + def agentB(): + try: + self.out_buffer = agent2(query) + finally: + done.schedule(1) + self.th = threading.Thread(target=agentB, daemon=True) + self.th.start() + =} + + reaction(done)->answer{= + self.running = False + answer.set(self.out_buffer) + =} + +} + +###Judge reactor to determine which agent responds first +reactor Judge{ + input query + input llma + input llmb + output ask + + state waiting = False + state logical_base_time = 0 + state physical_base_time = 0 + state winner = "" + + logical action timeout(60 sec) + + reaction(query) -> timeout, ask {= + self.waiting = True + self.winner = "" + self.logical_base_time = lf.time.logical_elapsed() + self.physical_base_time = lf.time.physical_elapsed() + timeout.schedule(0) + print(f"\n\n\nQuery: {query.value}\n") + print("waiting...\n") + ask.set(query.value) + =} + + reaction(llma) {= + if not self.waiting: + return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f" Winner: LLM-A | logical {logical_ms} ms | physical {physical_ms} ms") + print(f"{llma.value}") + =} + + reaction(llmb) {= + if not self.waiting: + return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = 
lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f"Winner: LLM-B | logical {logical_ms} ms | physical {physical_ms} ms") + print(f"{llmb.value}") + =} + + reaction(timeout) {= + if not self.waiting: + return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f"TIMEOUT (60 s) | logical {logical_ms} ms | physical {physical_ms} ms") + =} +} + + +main reactor { + llma_response = new LlmA() + llmb_response = new LlmB() + keyboard = new KeyboardInput() + j = new Judge() + + keyboard.prompt -> j.query + j.ask -> llma_response.user_in + j.ask -> llmb_response.user_in + llma_response.answer -> j.llma + llmb_response.answer -> j.llmb +} \ No newline at end of file From 9d9ee262ac2adcc677d09d4ecac9e27669c2e864 Mon Sep 17 00:00:00 2001 From: Deeksha Prahlad <112724341+Deeksha-20-99@users.noreply.github.com> Date: Fri, 19 Sep 2025 11:26:33 -0700 Subject: [PATCH 05/17] Updated the README.md for instructions to run the quiz game --- llm/README.md | 93 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 93 insertions(+) diff --git a/llm/README.md b/llm/README.md index 3b7b658..c7b4000 100644 --- a/llm/README.md +++ b/llm/README.md @@ -1,2 +1,95 @@ # LLM Demo +# Overview +This is a quiz-style game between two LLM agents. For each user question typed at the keyboard, both agents answer in parallel. The Judge announces whichever answer arrives first (or a timeout if neither responds within 60 sec), and prints per-question elapsed logical and physical times. + +# Pre-requisites + +You need Python installed, as llm.py is written in Python. + +## Library Dependencies +To run this project, the following dependencies are required. The model used in this repository has been quantized using 4-bit precision (bnb_4bit) and relies on bitsandbytes for efficient matrix operations and memory optimization. So specific versions of bitsandbytes, torch, and torchvision are mandatory for compatibility. +While newer versions of other dependencies may work, the specific versions listed below have been tested and are recommended for optimal performance. + +It is highly recommended to create a Python virtual environment or a Conda environment to manage dependencies. The available options for environment setup are listed below. + +``` +pip install accelerate +pip install transformers +pip install tokenizers +pip install bitsandbytes>=0.43.0 +pip install torch +pip install torchvision +``` + +## System Requirements + +To ensure optimal performance, the following hardware and software requirements are utilized. \ +**Note:** To replicate this model, you can use any equivalent hardware that meets the computational requirements. 
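+
+To confirm that PyTorch can actually see a suitable GPU before loading the models, a minimal check such as the following can be used (a sketch only; the memory actually required depends on which models you load):
+
+```python
+import torch
+
+# Report whether a CUDA device is visible and how much memory it has.
+print("CUDA available:", torch.cuda.is_available())
+if torch.cuda.is_available():
+    print("Device:", torch.cuda.get_device_name(0))
+    free, total = torch.cuda.mem_get_info()
+    print(f"Free / total GPU memory: {free / 1e9:.1f} / {total / 1e9:.1f} GB")
+```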
+ +### Hardware Requirements +- **GPU**: NVIDIA RTX A6000 + +### Software Requirements +- **Python** (Ensure Python is installed) +- **CUDA Version**: 12.8 +- **NVIDIA-SMI**: For monitoring GPU performance and memory utilization + +### Model Dependencies +- **Pre-trained Models**: [meta-llama/Llama-2-7b-chat-hf](https://huggingface.co/meta-llama/Llama-2-7b-chat-hf) [meta-llama/Llama-2-70b-chat-hf](https://huggingface.co/meta-llama/Llama-2-70b-chat-hf) +**Note:** Please access and use the pre-trained models, authentication keys must be obtained from the [Hugging Face repository](https://huggingface.co/settings/tokens). Ensure you have a valid API token and configure authentication. + +Make sure the environment is properly configured to use CUDA for optimal GPU acceleration. + +# Files and directories in this repository + - **`llm.py`** - Contains the logic to load and call LLM models from the Hugging Face pretrained hub. + - **`llm_quiz_game.lf`** - Lingua Franca program that defines the quiz game reactors (Keyboard input, LLM agents, and Judge). + +# Execution Workflow + +### Step 1: +Run the **`llm_quiz_game.lf`**. + +**Note:** +- Ensure that you specify the correct file paths + +Run the following commands: + +``` +lfc src/llm_quiz_game.lf +``` + +### Step 2: Run the binary file and input the quiz question +Run the following commands: + +``` +./bin/llm_quiz_game +``` + +The system will ask for entering the quiz question which is to be obtained from the keyboard input. + +Example output printed on the terminal: + +
+
+```
+--------------------------------------------------
+---- System clock resolution: 1 nsec
+---- Start execution on Fri Sep 19 10:46:31 2025 ---- plus 772215861 nanoseconds
+Enter the quiz question
+What is the capital of South Korea?
+Query: What is the capital of South Korea?
+
+waiting...
+
+Winner: LLM-B | logical 1184 ms | physical 1184 ms
+Answer: Seoul.
+--------------------------------------------------
+```
+
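+To stop the game, type `quit` (or press Enter on an empty line) at the question prompt; the `KeyboardInput` reactor then requests a synchronized shutdown:
+
+```
+Enter the quiz question
+quit
+```
+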
+ +### Step 3: Monitoring GPU Performance (Optional) +In another terminal, monitor GPU performance and memory utilization while running the scripts, please use NVIDIA-SMI: +``` +nvidia-smi +``` +# Contributors From fe1f6054081268e9272d552c75342aaf9a8de9e1 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Fri, 19 Sep 2025 12:09:07 -0700 Subject: [PATCH 06/17] Removing the older version of the file agent_llm.lf --- llm/src/agent_llm.lf | 44 -------------------------------------------- 1 file changed, 44 deletions(-) delete mode 100644 llm/src/agent_llm.lf diff --git a/llm/src/agent_llm.lf b/llm/src/agent_llm.lf deleted file mode 100644 index 5b5ab8e..0000000 --- a/llm/src/agent_llm.lf +++ /dev/null @@ -1,44 +0,0 @@ -target Python{ - files: llm.py -}; - -preamble{= - from llm import agent1, agent2 - -=} - -reactor LLM_a{ - - output user_in - reaction (startup)-> user_in{= - txt = input("Hey there!") - user_in.set(txt) - =} -} - -reactor LLM_b{ - input llm_a_in - output llm_b_out - reaction (llm_a_in)-> llm_b_out{= - llm_b_out.set(llm_a_in.value) - =} -} - -main reactor{ - state response - user_response = new LLM_a() - llm_response = new LLM_b() - // call llm a to respond to user - reaction (user_response.user_in)->llm_response.llm_a_in{= - - response = agent1(user_response.user_in.value) - llm_response.llm_a_in.set(response) - =} - - //llm b to respond to what llm a generated - reaction (llm_response.llm_b_out){= - # llm_response.llm_a_in = response - agent2(llm_response.llm_b_out.value) - =} - -} \ No newline at end of file From b0206643f229046eb82e3b59249a0ff493e3efef Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Mon, 22 Sep 2025 12:16:07 -0700 Subject: [PATCH 07/17] Modified comments to the program --- llm/src/llm_quiz_game.lf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/src/llm_quiz_game.lf b/llm/src/llm_quiz_game.lf index 85e89a4..cd9584a 100644 --- a/llm/src/llm_quiz_game.lf +++ b/llm/src/llm_quiz_game.lf @@ -4,7 +4,7 @@ target Python { keepalive: true, files: ["llm.py"] } preamble {= import threading import time - from llm import agent1, agent2 # your Python functions + from llm import agent1, agent2 def keyboard_prompt(reactor, action): while True: From cc0a08a5e504e562e4faeccbbfeef829627d25dd Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Tue, 23 Sep 2025 16:28:33 -0700 Subject: [PATCH 08/17] created the files for quiz game between two llm models using main reactor and also added a federated execution --- llm/src/llm_a.py | 77 +++++++ llm/src/llm_b.py | 78 +++++++ llm/src/llm_base_class.lf | 176 ++++++++++++++ llm/src/llm_base_class_federate.lf | 354 ++++++++++++++++++++++++++++ llm/src/llm_game_federated.lf | 40 ++++ llm/src/llm_quiz_game.lf | 359 +++++++++++++++-------------- 6 files changed, 907 insertions(+), 177 deletions(-) create mode 100644 llm/src/llm_a.py create mode 100644 llm/src/llm_b.py create mode 100644 llm/src/llm_base_class.lf create mode 100644 llm/src/llm_base_class_federate.lf create mode 100644 llm/src/llm_game_federated.lf diff --git a/llm/src/llm_a.py b/llm/src/llm_a.py new file mode 100644 index 0000000..df5faf3 --- /dev/null +++ b/llm/src/llm_a.py @@ -0,0 +1,77 @@ +# llm_a.py — Agent 1 (7B) + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + +# <<< put your token here >>> +hf_auth = "add token here " + +# Model to be chosen to act as an agent +model_id = "meta-llama/Llama-2-7b-chat-hf" + +# Require GPU (you said it must work only on GPU) +has_cuda = torch.cuda.is_available() 
+if not has_cuda: + raise RuntimeError("CUDA GPU required for this configuration.") +dtype = torch.bfloat16 if has_cuda else torch.float32 + +# 4-bit quantization +bnb_config = None +if has_cuda: + try: + import bitsandbytes as bnb # noqa: F401 + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=dtype, + ) + except Exception: + bnb_config = None + +# Tokenizer +tokenizer = AutoTokenizer.from_pretrained(model_id, token=hf_auth, use_fast=True) +if tokenizer.pad_token_id is None: + tokenizer.pad_token = tokenizer.eos_token + +# Shared kwargs +common = dict( + device_map="auto" if has_cuda else None, + dtype=dtype, + low_cpu_mem_usage=True, +) +if bnb_config is not None: + common["quantization_config"] = bnb_config + +# Model +model = AutoModelForCausalLM.from_pretrained(model_id, token=hf_auth, **common) +model.eval() + +# Generation args +GEN_A = dict( + max_new_tokens=24, do_sample=False, temperature=0.1, + eos_token_id=tokenizer.eos_token_id, pad_token_id=tokenizer.pad_token_id +) + +# One-line postprocess +def postprocess(text: str) -> str: + t = text.strip() + for sep in ["\n", ". ", " "]: + idx = t.find(sep) + if idx > 0: + t = t[:idx] + break + return t.strip().strip(":").strip() + +# Agent 1 entrypoint +def agent1(q: str) -> str: + prompt = f"You are a concise Q&A assistant.\n\n{q}\n" + inputs = tokenizer(prompt, return_tensors="pt") + if has_cuda: + inputs = {k: v.to("cuda") for k, v in inputs.items()} + with torch.no_grad(): + out = model.generate(**inputs, **GEN_A) + prompt_len = inputs["input_ids"].shape[1] + result = tokenizer.decode(out[0][prompt_len:], skip_special_tokens=True) + print(result) + return postprocess(result) \ No newline at end of file diff --git a/llm/src/llm_b.py b/llm/src/llm_b.py new file mode 100644 index 0000000..513d6c2 --- /dev/null +++ b/llm/src/llm_b.py @@ -0,0 +1,78 @@ + +# llm_b.py — Agent 2 (70B) + +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + +# <<< put your token here >>> +hf_auth = "add token here" + +# Model to be chosen to act as an agent +model_id_2 = "meta-llama/Llama-2-70b-chat-hf" + +# Require GPU (GPU-only) +has_cuda = torch.cuda.is_available() +if not has_cuda: + raise RuntimeError("CUDA GPU required for this configuration.") +dtype = torch.bfloat16 if has_cuda else torch.float32 + +# 4-bit quantization +bnb_config = None +if has_cuda: + try: + import bitsandbytes as bnb # noqa: F401 + bnb_config = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=dtype, + ) + except Exception: + bnb_config = None + +# Tokenizer +tokenizer_2 = AutoTokenizer.from_pretrained(model_id_2, token=hf_auth, use_fast=True) +if tokenizer_2.pad_token_id is None: + tokenizer_2.pad_token = tokenizer_2.eos_token + +# Shared kwargs +common = dict( + device_map="auto" if has_cuda else None, + dtype=dtype, + low_cpu_mem_usage=True, +) +if bnb_config is not None: + common["quantization_config"] = bnb_config + +# Model +model_2 = AutoModelForCausalLM.from_pretrained(model_id_2, token=hf_auth, **common) +model_2.eval() + +# Generation args +GEN_B = dict( + max_new_tokens=24, do_sample=False, temperature=0.1, + eos_token_id=tokenizer_2.eos_token_id, pad_token_id=tokenizer_2.pad_token_id +) + +# One-line postprocess +def postprocess(text: str) -> str: + t = text.strip() + for sep in ["\n", ". 
", " "]: + idx = t.find(sep) + if idx > 0: + t = t[:idx] + break + return t.strip().strip(":").strip() + +# Agent 2 entrypoint +def agent2(q: str) -> str: + prompt = f"You are a concise Q&A assistant.\n\n{q}\n" + inputs = tokenizer_2(prompt, return_tensors="pt") + if has_cuda: + inputs = {k: v.to("cuda") for k, v in inputs.items()} + with torch.no_grad(): + out = model_2.generate(**inputs, **GEN_B) + prompt_len = inputs["input_ids"].shape[1] + result = tokenizer_2.decode(out[0][prompt_len:], skip_special_tokens=True) + print(result) + return postprocess(result) \ No newline at end of file diff --git a/llm/src/llm_base_class.lf b/llm/src/llm_base_class.lf new file mode 100644 index 0000000..d1eae4e --- /dev/null +++ b/llm/src/llm_base_class.lf @@ -0,0 +1,176 @@ +target Python + +### Reactor for handling user keyboard input +reactor KeyboardInput { + state th + state terminate = False + state eof = False + state buffer = "" + + physical action line + output prompt + output quit + + reaction(startup) -> line {= + def reader(): + while not self.terminate: + + s = input("Enter the quiz question\n") + if s == "": + self.eof = True + line.schedule(0) + break + elif s.lower().strip() == "quit": + self.eof = True + line.schedule(0) + break + else: + self.buffer = s + line.schedule(1) + self.th = threading.Thread(target=reader, daemon=True) + self.th.start() + =} + + reaction(line) -> prompt, quit {= + if self.eof: + quit.set() + environment().sync_shutdown() + else: + prompt.set(self.buffer) + =} + + reaction(shutdown) {= + self.terminate = True + if self.th and self.th.is_alive(): + self.th.join() + =} +} + + + +### Reactor for calling agent 1 +reactor LlmA { + state th + state running = False + state out_buffer = "" + + input user_in + physical action done + output answer + + + reaction(user_in) -> done {= + if self.running: + return + self.running = True + query = user_in.value + def agentA(): + try: + self.out_buffer = agent1(query) + finally: + done.schedule(1) + self.th = threading.Thread(target=agentA, daemon=True) + self.th.start() + =} + + reaction(done) -> answer {= + self.running = False + answer.set(self.out_buffer) + =} +} + + + +// ### Reactor for calling agent 2 +reactor LlmB { + state th + state running = False + state out_buffer = "" + input user_in + output answer + + physical action done + + reaction(user_in)->done{= + if self.running: + return + self.running = True + query = user_in.value + def agentB(): + try: + self.out_buffer = agent2(query) + finally: + done.schedule(1) + self.th = threading.Thread(target=agentB, daemon=True) + self.th.start() + =} + + reaction(done)->answer{= + self.running = False + answer.set(self.out_buffer) + =} + +} + + + +// ###Judge reactor to determine which agent responds first +reactor Judge{ + input query + input llma + input llmb + output ask + + state waiting = False + state logical_base_time = 0 + state physical_base_time = 0 + state winner = "" + + logical action timeout(60 sec) + + reaction(query) -> timeout, ask {= + self.waiting = True + self.winner = "" + self.logical_base_time = lf.time.logical_elapsed() + self.physical_base_time = lf.time.physical_elapsed() + timeout.schedule(0) + print(f"\n\n\nQuery: {query.value}\n") + print("waiting...\n") + ask.set(query.value) + =} + + reaction(llma) {= + if not self.waiting: + return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - 
self.physical_base_time) / 1000000) + print(f" Winner: LLM-A | logical {logical_ms} ms | physical {physical_ms} ms") + print(f"{llma.value}") + =} + + reaction(llmb) {= + if not self.waiting: + return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f"Winner: LLM-B | logical {logical_ms} ms | physical {physical_ms} ms") + print(f"{llmb.value}") + =} + + reaction(timeout) {= + if not self.waiting: + return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f"TIMEOUT (60 s) | logical {logical_ms} ms | physical {physical_ms} ms") + =} +} diff --git a/llm/src/llm_base_class_federate.lf b/llm/src/llm_base_class_federate.lf new file mode 100644 index 0000000..c5638d8 --- /dev/null +++ b/llm/src/llm_base_class_federate.lf @@ -0,0 +1,354 @@ +target Python + +### Reactor for handling user keyboard input + +// reactor KeyboardInput { +// state th +// state terminate = False +// state eof = False +// state buffer = "" + +// physical action line +// output prompt +// output quit + +// reaction(startup) -> line {= +// import sys +// import threading +// import time + +// def reader(): +// while not self.terminate: +// s = input("Enter the quiz question\n") +// if s == "": +// self.eof = True +// try: line.schedule(0) +// except Exception as e: print("[keyboard] schedule EOF failed:", e, flush=True) +// break +// elif s.lower().strip() == "quit": +// self.eof = True +// try: line.schedule(0) +// except Exception as e: print("[keyboard] schedule quit failed:", e, flush=True) +// break +// else: +// self.buffer = s +// try: line.schedule(1) # small logical hop +// except Exception as e: +// print("[keyboard] schedule failed:", e, flush=True) +// break +// self.th = threading.Thread(target=reader, daemon=True) +// self.th.start() +// =} + +// reaction(line) -> prompt, quit {= +// if self.eof: +// quit.set() +// environment().sync_shutdown() +// else: +// prompt.set(self.buffer) +// =} + +// reaction(shutdown) {= +// self.terminate = True +// if self.th and self.th.is_alive(): +// self.th.join() +// =} +// } + +### Reactor for calling agent 1 +reactor LlmA { + state th + state running = False + state out_buffer = "" + state ready = False + + input user_in + physical action done + physical action notify_ready + output answer + output ready_out + + reaction(startup) {= + import os, sys, importlib.util, threading + def _load(): + try: + here = os.path.dirname(__file__) + if here not in sys.path: sys.path.insert(0, here) + from llm_a import agent1 + notify_ready.schedule(0) + except Exception as e: + print("[LlmA] Preload failed:", e, flush=True) + threading.Thread(target=_load, daemon=True).start() + =} + + reaction(notify_ready) -> ready_out {= + self.ready = True + ready_out.set(True) + =} + + reaction(user_in) -> done {= + import threading + if not self.ready: return + if self.running: return + self.running = True + q = user_in.value + from llm_a import agent1 + def agentA(): + try: + self.out_buffer = agent1(q) + finally: + try: done.schedule(5) + except Exception as e: print("[LlmA] schedule failed:", e, flush=True) + self.th = threading.Thread(target=agentA, daemon=True) + self.th.start() + =} + 
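+
+    ### `done` is a physical action scheduled from the worker thread, so the reaction
+    ### below runs at a fresh tag and publishes the buffered result on `answer`.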
+ reaction(done) -> answer {= + self.running = False + answer.set(self.out_buffer) + =} +} + +// ### Reactor for calling agent 2 +reactor LlmB { + state th + state running = False + state out_buffer = "" + state ready = False + + input user_in + physical action done + physical action notify_ready + output answer + output ready_out + + reaction(startup) {= + import os, sys, importlib.util, threading + def _load(): + try: + here = os.path.dirname(__file__) + if here not in sys.path: sys.path.insert(0, here) + from llm_b import agent2 + notify_ready.schedule(0) + except Exception as e: + print("[LlmB] Preload failed:", e, flush=True) + threading.Thread(target=_load, daemon=True).start() + =} + + reaction(notify_ready) -> ready_out {= + self.ready = True + ready_out.set(True) + =} + + reaction(user_in) -> done {= + import threading + if not self.ready: return + if self.running: return + self.running = True + q = user_in.value + from llm_b import agent2 + def agentB(): + try: + self.out_buffer = agent2(q) + finally: + try: done.schedule(5) + except Exception as e: print("[LlmB] schedule failed:", e, flush=True) + self.th = threading.Thread(target=agentB, daemon=True) + self.th.start() + =} + + reaction(done) -> answer {= + self.running = False + answer.set(self.out_buffer) + =} +} +// ###Judge reactor to determine which agent responds first +// reactor Judge{ +// input query +// input llma +// input llmb +// output ask + +// state waiting = False +// state logical_base_time = 0 +// state physical_base_time = 0 +// state winner = "" + +// logical action timeout(60 sec) + +// reaction(query) -> timeout, ask {= +// self.waiting = True +// self.winner = "" +// self.logical_base_time = lf.time.logical_elapsed() +// self.physical_base_time = lf.time.physical_elapsed() +// timeout.schedule(0) +// print(f"\n\n\nQuery: {query.value}\n") +// print("waiting...\n") +// ask.set(query.value) +// =} + +// reaction(llma) {= +// if not self.waiting: +// return +// self.waiting = False +// logical_now = lf.time.logical_elapsed() +// physical_now = lf.time.physical_elapsed() +// logical_ms = int((logical_now - self.logical_base_time) / 1000000) +// physical_ms = int((physical_now - self.physical_base_time) / 1000000) +// print(f" Winner: LLM-A | logical {logical_ms} ms | physical {physical_ms} ms") +// print(f"{llma.value}") +// =} + +// reaction(llmb) {= +// if not self.waiting: +// return +// self.waiting = False +// logical_now = lf.time.logical_elapsed() +// physical_now = lf.time.physical_elapsed() +// logical_ms = int((logical_now - self.logical_base_time) / 1000000) +// physical_ms = int((physical_now - self.physical_base_time) / 1000000) +// print(f"Winner: LLM-B | logical {logical_ms} ms | physical {physical_ms} ms") +// print(f"{llmb.value}") +// =} + +// reaction(timeout) {= +// if not self.waiting: +// return +// self.waiting = False +// logical_now = lf.time.logical_elapsed() +// physical_now = lf.time.physical_elapsed() +// logical_ms = int((logical_now - self.logical_base_time) / 1000000) +// physical_ms = int((physical_now - self.physical_base_time) / 1000000) +// print(f"TIMEOUT (60 s) | logical {logical_ms} ms | physical {physical_ms} ms") +// =} +// } + +reactor Judge { + state th + state reader_started = False + state terminate = False + state eof = False + state buffer = "" + state waiting = False + state logical_base_time = 0 + state physical_base_time = 0 + input ready_a + input ready_b + state a_ready = False + state b_ready = False + physical action line + physical action tick + logical 
action timeout(60 sec) + output ask + input llma + input llmb + output quit + + reaction(startup) {= + print("[Judge] Waiting for models to load...", flush=True) + =} + + reaction(ready_a) {= + self.a_ready = True + if self.a_ready and self.b_ready and not self.reader_started: + import sys, threading + def reader(): + while not self.terminate: + s = input("Enter the quiz question (or 'quit')\n") + if s == "" or s.lower().strip() == "quit": + self.eof = True + try: line.schedule(0) + except Exception as e: print("[Judge] schedule EOF failed:", e, flush=True) + break + else: + self.buffer = s + try: line.schedule(1) + except Exception as e: + print("[Judge] schedule line failed:", e, flush=True) + break + self.reader_started = True + print("[Judge] Models ready. You can ask questions now.", flush=True) + self.th = threading.Thread(target=reader, daemon=True) + self.th.start() + =} + + reaction(ready_b) {= + self.b_ready = True + if self.a_ready and self.b_ready and not self.reader_started: + import sys, threading + def reader(): + while not self.terminate: + s = input("Enter the quiz question (or 'quit')\n") + if s == "" or s.lower().strip() == "quit": + self.eof = True + try: line.schedule(0) + except Exception as e: print("[Judge] schedule EOF failed:", e, flush=True) + break + else: + self.buffer = s + try: line.schedule(1) + except Exception as e: + print("[Judge] schedule line failed:", e, flush=True) + break + self.reader_started = True + print("[Judge] Models ready. You can ask questions now.", flush=True) + self.th = threading.Thread(target=reader, daemon=True) + self.th.start() + =} + + reaction(line) -> tick, ask, timeout, quit {= + if self.eof: + quit.set() + environment().sync_shutdown() + else: + self.waiting = True + self.logical_base_time = lf.time.logical_elapsed() + self.physical_base_time = lf.time.physical_elapsed() + timeout.schedule(0) + print(f"\n\n\nQuery: {self.buffer}\n", flush=True) + print("waiting...\n", flush=True) + tick.schedule(5) + =} + + reaction(tick) -> ask {= + ask.set(self.buffer) + =} + + reaction(llma) {= + if not self.waiting: return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f" Winner: LLM-A | logical {logical_ms} ms | physical {physical_ms} ms", flush=True) + print(f"{llma.value}", flush=True) + =} + + reaction(llmb) {= + if not self.waiting: return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f"Winner: LLM-B | logical {logical_ms} ms | physical {physical_ms} ms", flush=True) + print(f"{llmb.value}", flush=True) + =} + + reaction(timeout) {= + if not self.waiting: return + self.waiting = False + logical_now = lf.time.logical_elapsed() + physical_now = lf.time.physical_elapsed() + logical_ms = int((logical_now - self.logical_base_time) / 1000000) + physical_ms = int((physical_now - self.physical_base_time) / 1000000) + print(f"TIMEOUT (60 s) | logical {logical_ms} ms | physical {physical_ms} ms", flush=True) + =} + + reaction(shutdown) {= + self.terminate = True + if self.th and self.th.is_alive(): + self.th.join() + =} +} \ No newline at end of file diff --git a/llm/src/llm_game_federated.lf b/llm/src/llm_game_federated.lf new 
file mode 100644 index 0000000..0a4dcd0 --- /dev/null +++ b/llm/src/llm_game_federated.lf @@ -0,0 +1,40 @@ +### llm.py file needs to be in the same directory +target Python { keepalive: true, files: ["llm_a.py", "llm_b.py"] } +// import KeyboardInput from "llm_base_class_federate.lf" +import LlmA from "llm_base_class_federate.lf" +import LlmB from "llm_base_class_federate.lf" +import Judge from "llm_base_class_federate.lf" + +preamble {= + import threading + import time + from llm_a import agent1 + from llm_b import agent2 +=} + + +federated reactor llm_game_federated at 10.218.100.95 { + // llma_response_f = new LlmA() + // llmb_response_f = new LlmB() + // keyboard_f = new KeyboardInput() + // j_f = new Judge() + + // keyboard_f.prompt -> j_f.query + // j_f.ask -> llma_response_f.user_in + // j_f.ask -> llmb_response_f.user_in + // llma_response_f.answer -> j_f.llma + // llmb_response_f.answer -> j_f.llmb + j = new Judge() + llma = new LlmA() + llmb = new LlmB() + + j.ask -> llma.user_in + j.ask -> llmb.user_in + llma.answer -> j.llma + llmb.answer -> j.llmb + + llma.ready_out -> j.ready_a + llmb.ready_out -> j.ready_b + +} + diff --git a/llm/src/llm_quiz_game.lf b/llm/src/llm_quiz_game.lf index cd9584a..7ba9d6b 100644 --- a/llm/src/llm_quiz_game.lf +++ b/llm/src/llm_quiz_game.lf @@ -1,188 +1,17 @@ ### llm.py file needs to be in the same directory target Python { keepalive: true, files: ["llm.py"] } +import KeyboardInput from "llm_base_class.lf" +import LlmA from "llm_base_class.lf" +import LlmB from "llm_base_class.lf" +import Judge from "llm_base_class.lf" + preamble {= import threading import time from llm import agent1, agent2 - - def keyboard_prompt(reactor, action): - while True: - time.sleep(5) - action.schedule(None) =} -### Reactor for handling user keyboard input -reactor KeyboardInput { - state th - state terminate = False - state eof = False - state buffer = "" - - physical action line - output prompt - output quit - - reaction(startup) -> line {= - def reader(): - while not self.terminate: - - s = input("Enter the quiz question\n") - if s == "": - self.eof = True - line.schedule(0) - break - elif s.lower().strip() == "quit": - self.eof = True - line.schedule(0) - break - else: - self.buffer = s - line.schedule(1) - self.th = threading.Thread(target=reader, daemon=True) - self.th.start() - =} - - reaction(line) -> prompt, quit {= - if self.eof: - quit.set() - environment().sync_shutdown() - else: - prompt.set(self.buffer) - =} - - reaction(shutdown) {= - self.terminate = True - if self.th and self.th.is_alive(): - self.th.join() - =} -} - -### Reactor for calling agent 1 -reactor LlmA { - state th - state running = False - state out_buffer = "" - - input user_in - physical action done - output answer - - - reaction(user_in) -> done {= - if self.running: - return - self.running = True - query = user_in.value - def agentA(): - try: - self.out_buffer = agent1(query) - finally: - done.schedule(1) - self.th = threading.Thread(target=agentA, daemon=True) - self.th.start() - =} - - reaction(done) -> answer {= - self.running = False - answer.set(self.out_buffer) - =} -} - - -### Reactor for calling agent 2 -reactor LlmB { - state th - state running = False - state out_buffer = "" - input user_in - output answer - - physical action done - - reaction(user_in)->done{= - if self.running: - return - self.running = True - query = user_in.value - def agentB(): - try: - self.out_buffer = agent2(query) - finally: - done.schedule(1) - self.th = threading.Thread(target=agentB, daemon=True) - 
self.th.start() - =} - - reaction(done)->answer{= - self.running = False - answer.set(self.out_buffer) - =} - -} - -###Judge reactor to determine which agent responds first -reactor Judge{ - input query - input llma - input llmb - output ask - - state waiting = False - state logical_base_time = 0 - state physical_base_time = 0 - state winner = "" - - logical action timeout(60 sec) - - reaction(query) -> timeout, ask {= - self.waiting = True - self.winner = "" - self.logical_base_time = lf.time.logical_elapsed() - self.physical_base_time = lf.time.physical_elapsed() - timeout.schedule(0) - print(f"\n\n\nQuery: {query.value}\n") - print("waiting...\n") - ask.set(query.value) - =} - - reaction(llma) {= - if not self.waiting: - return - self.waiting = False - logical_now = lf.time.logical_elapsed() - physical_now = lf.time.physical_elapsed() - logical_ms = int((logical_now - self.logical_base_time) / 1000000) - physical_ms = int((physical_now - self.physical_base_time) / 1000000) - print(f" Winner: LLM-A | logical {logical_ms} ms | physical {physical_ms} ms") - print(f"{llma.value}") - =} - - reaction(llmb) {= - if not self.waiting: - return - self.waiting = False - logical_now = lf.time.logical_elapsed() - physical_now = lf.time.physical_elapsed() - logical_ms = int((logical_now - self.logical_base_time) / 1000000) - physical_ms = int((physical_now - self.physical_base_time) / 1000000) - print(f"Winner: LLM-B | logical {logical_ms} ms | physical {physical_ms} ms") - print(f"{llmb.value}") - =} - - reaction(timeout) {= - if not self.waiting: - return - self.waiting = False - logical_now = lf.time.logical_elapsed() - physical_now = lf.time.physical_elapsed() - logical_ms = int((logical_now - self.logical_base_time) / 1000000) - physical_ms = int((physical_now - self.physical_base_time) / 1000000) - print(f"TIMEOUT (60 s) | logical {logical_ms} ms | physical {physical_ms} ms") - =} -} - - main reactor { llma_response = new LlmA() llmb_response = new LlmB() @@ -194,4 +23,180 @@ main reactor { j.ask -> llmb_response.user_in llma_response.answer -> j.llma llmb_response.answer -> j.llmb -} \ No newline at end of file +} + + +// def keyboard_prompt(reactor, action): + // while True: + // time.sleep(5) + // action.schedule(None) + +// ### Reactor for handling user keyboard input +// reactor KeyboardInput { +// state th +// state terminate = False +// state eof = False +// state buffer = "" + +// physical action line +// output prompt +// output quit + +// reaction(startup) -> line {= +// def reader(): +// while not self.terminate: + +// s = input("Enter the quiz question\n") +// if s == "": +// self.eof = True +// line.schedule(0) +// break +// elif s.lower().strip() == "quit": +// self.eof = True +// line.schedule(0) +// break +// else: +// self.buffer = s +// line.schedule(1) +// self.th = threading.Thread(target=reader, daemon=True) +// self.th.start() +// =} + +// reaction(line) -> prompt, quit {= +// if self.eof: +// quit.set() +// environment().sync_shutdown() +// else: +// prompt.set(self.buffer) +// =} + +// reaction(shutdown) {= +// self.terminate = True +// if self.th and self.th.is_alive(): +// self.th.join() +// =} +// } + +// ### Reactor for calling agent 1 +// reactor LlmA { +// state th +// state running = False +// state out_buffer = "" + +// input user_in +// physical action done +// output answer + + +// reaction(user_in) -> done {= +// if self.running: +// return +// self.running = True +// query = user_in.value +// def agentA(): +// try: +// self.out_buffer = agent1(query) +// 
finally: +// done.schedule(1) +// self.th = threading.Thread(target=agentA, daemon=True) +// self.th.start() +// =} + +// reaction(done) -> answer {= +// self.running = False +// answer.set(self.out_buffer) +// =} +// } + + +// ### Reactor for calling agent 2 +// reactor LlmB { +// state th +// state running = False +// state out_buffer = "" +// input user_in +// output answer + +// physical action done + +// reaction(user_in)->done{= +// if self.running: +// return +// self.running = True +// query = user_in.value +// def agentB(): +// try: +// self.out_buffer = agent2(query) +// finally: +// done.schedule(1) +// self.th = threading.Thread(target=agentB, daemon=True) +// self.th.start() +// =} + +// reaction(done)->answer{= +// self.running = False +// answer.set(self.out_buffer) +// =} + +// } + +// ###Judge reactor to determine which agent responds first +// reactor Judge{ +// input query +// input llma +// input llmb +// output ask + +// state waiting = False +// state logical_base_time = 0 +// state physical_base_time = 0 +// state winner = "" + +// logical action timeout(60 sec) + +// reaction(query) -> timeout, ask {= +// self.waiting = True +// self.winner = "" +// self.logical_base_time = lf.time.logical_elapsed() +// self.physical_base_time = lf.time.physical_elapsed() +// timeout.schedule(0) +// print(f"\n\n\nQuery: {query.value}\n") +// print("waiting...\n") +// ask.set(query.value) +// =} + +// reaction(llma) {= +// if not self.waiting: +// return +// self.waiting = False +// logical_now = lf.time.logical_elapsed() +// physical_now = lf.time.physical_elapsed() +// logical_ms = int((logical_now - self.logical_base_time) / 1000000) +// physical_ms = int((physical_now - self.physical_base_time) / 1000000) +// print(f" Winner: LLM-A | logical {logical_ms} ms | physical {physical_ms} ms") +// print(f"{llma.value}") +// =} + +// reaction(llmb) {= +// if not self.waiting: +// return +// self.waiting = False +// logical_now = lf.time.logical_elapsed() +// physical_now = lf.time.physical_elapsed() +// logical_ms = int((logical_now - self.logical_base_time) / 1000000) +// physical_ms = int((physical_now - self.physical_base_time) / 1000000) +// print(f"Winner: LLM-B | logical {logical_ms} ms | physical {physical_ms} ms") +// print(f"{llmb.value}") +// =} + +// reaction(timeout) {= +// if not self.waiting: +// return +// self.waiting = False +// logical_now = lf.time.logical_elapsed() +// physical_now = lf.time.physical_elapsed() +// logical_ms = int((logical_now - self.logical_base_time) / 1000000) +// physical_ms = int((physical_now - self.physical_base_time) / 1000000) +// print(f"TIMEOUT (60 s) | logical {logical_ms} ms | physical {physical_ms} ms") +// =} +// } From 632dc8eda58f7fd244305353d6898896277733a6 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Tue, 23 Sep 2025 16:37:59 -0700 Subject: [PATCH 09/17] Adding the git ignore file --- .gitignore | 6 ++++++ 1 file changed, 6 insertions(+) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..eed972c --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +llm/fed-gen/ +llm/src-gen/ +llm/include/ +llm/bin +**__pycache__** +llm/=** \ No newline at end of file From 6c8117de13058b11e88771b69dc9bee887b8a335 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Thu, 25 Sep 2025 11:58:32 -0700 Subject: [PATCH 10/17] Fixed the issue for the judge federate to receive the signal that model is loaded --- llm/src/llm_base_class_federate.lf | 140 ++++------------------------- 
llm/src/llm_game_federated.lf | 7 +- 2 files changed, 19 insertions(+), 128 deletions(-) diff --git a/llm/src/llm_base_class_federate.lf b/llm/src/llm_base_class_federate.lf index c5638d8..235b703 100644 --- a/llm/src/llm_base_class_federate.lf +++ b/llm/src/llm_base_class_federate.lf @@ -1,60 +1,5 @@ target Python -### Reactor for handling user keyboard input - -// reactor KeyboardInput { -// state th -// state terminate = False -// state eof = False -// state buffer = "" - -// physical action line -// output prompt -// output quit - -// reaction(startup) -> line {= -// import sys -// import threading -// import time - -// def reader(): -// while not self.terminate: -// s = input("Enter the quiz question\n") -// if s == "": -// self.eof = True -// try: line.schedule(0) -// except Exception as e: print("[keyboard] schedule EOF failed:", e, flush=True) -// break -// elif s.lower().strip() == "quit": -// self.eof = True -// try: line.schedule(0) -// except Exception as e: print("[keyboard] schedule quit failed:", e, flush=True) -// break -// else: -// self.buffer = s -// try: line.schedule(1) # small logical hop -// except Exception as e: -// print("[keyboard] schedule failed:", e, flush=True) -// break -// self.th = threading.Thread(target=reader, daemon=True) -// self.th.start() -// =} - -// reaction(line) -> prompt, quit {= -// if self.eof: -// quit.set() -// environment().sync_shutdown() -// else: -// prompt.set(self.buffer) -// =} - -// reaction(shutdown) {= -// self.terminate = True -// if self.th and self.th.is_alive(): -// self.th.join() -// =} -// } - ### Reactor for calling agent 1 reactor LlmA { state th @@ -64,20 +9,22 @@ reactor LlmA { input user_in physical action done - physical action notify_ready + physical action notify_ready output answer output ready_out - reaction(startup) {= - import os, sys, importlib.util, threading + reaction(startup) -> notify_ready {= + import os, sys, importlib.util, threading, traceback + act = notify_ready def _load(): try: here = os.path.dirname(__file__) if here not in sys.path: sys.path.insert(0, here) from llm_a import agent1 - notify_ready.schedule(0) + act.schedule(1) except Exception as e: print("[LlmA] Preload failed:", e, flush=True) + traceback.print_exc() threading.Thread(target=_load, daemon=True).start() =} @@ -118,20 +65,22 @@ reactor LlmB { input user_in physical action done - physical action notify_ready + physical action notify_ready output answer output ready_out - reaction(startup) {= - import os, sys, importlib.util, threading + reaction(startup) -> notify_ready {= + import os, sys, importlib.util, threading, traceback + act = notify_ready def _load(): try: here = os.path.dirname(__file__) if here not in sys.path: sys.path.insert(0, here) from llm_b import agent2 - notify_ready.schedule(0) + act.schedule(1) except Exception as e: print("[LlmB] Preload failed:", e, flush=True) + traceback.print_exc() threading.Thread(target=_load, daemon=True).start() =} @@ -163,65 +112,6 @@ reactor LlmB { =} } // ###Judge reactor to determine which agent responds first -// reactor Judge{ -// input query -// input llma -// input llmb -// output ask - -// state waiting = False -// state logical_base_time = 0 -// state physical_base_time = 0 -// state winner = "" - -// logical action timeout(60 sec) - -// reaction(query) -> timeout, ask {= -// self.waiting = True -// self.winner = "" -// self.logical_base_time = lf.time.logical_elapsed() -// self.physical_base_time = lf.time.physical_elapsed() -// timeout.schedule(0) -// print(f"\n\n\nQuery: 
{query.value}\n") -// print("waiting...\n") -// ask.set(query.value) -// =} - -// reaction(llma) {= -// if not self.waiting: -// return -// self.waiting = False -// logical_now = lf.time.logical_elapsed() -// physical_now = lf.time.physical_elapsed() -// logical_ms = int((logical_now - self.logical_base_time) / 1000000) -// physical_ms = int((physical_now - self.physical_base_time) / 1000000) -// print(f" Winner: LLM-A | logical {logical_ms} ms | physical {physical_ms} ms") -// print(f"{llma.value}") -// =} - -// reaction(llmb) {= -// if not self.waiting: -// return -// self.waiting = False -// logical_now = lf.time.logical_elapsed() -// physical_now = lf.time.physical_elapsed() -// logical_ms = int((logical_now - self.logical_base_time) / 1000000) -// physical_ms = int((physical_now - self.physical_base_time) / 1000000) -// print(f"Winner: LLM-B | logical {logical_ms} ms | physical {physical_ms} ms") -// print(f"{llmb.value}") -// =} - -// reaction(timeout) {= -// if not self.waiting: -// return -// self.waiting = False -// logical_now = lf.time.logical_elapsed() -// physical_now = lf.time.physical_elapsed() -// logical_ms = int((logical_now - self.logical_base_time) / 1000000) -// physical_ms = int((physical_now - self.physical_base_time) / 1000000) -// print(f"TIMEOUT (60 s) | logical {logical_ms} ms | physical {physical_ms} ms") -// =} -// } reactor Judge { state th @@ -245,10 +135,10 @@ reactor Judge { output quit reaction(startup) {= - print("[Judge] Waiting for models to load...", flush=True) + print("[Judge] Waiting for models to load", flush=True) =} - reaction(ready_a) {= + reaction(ready_a)->line {= self.a_ready = True if self.a_ready and self.b_ready and not self.reader_started: import sys, threading @@ -272,7 +162,7 @@ reactor Judge { self.th.start() =} - reaction(ready_b) {= + reaction(ready_b)->line {= self.b_ready = True if self.a_ready and self.b_ready and not self.reader_started: import sys, threading diff --git a/llm/src/llm_game_federated.lf b/llm/src/llm_game_federated.lf index 0a4dcd0..5111854 100644 --- a/llm/src/llm_game_federated.lf +++ b/llm/src/llm_game_federated.lf @@ -1,9 +1,10 @@ ### llm.py file needs to be in the same directory target Python { keepalive: true, files: ["llm_a.py", "llm_b.py"] } // import KeyboardInput from "llm_base_class_federate.lf" -import LlmA from "llm_base_class_federate.lf" -import LlmB from "llm_base_class_federate.lf" -import Judge from "llm_base_class_federate.lf" +// import LlmA from "llm_base_class_federate.lf" +// import LlmB from "llm_base_class_federate.lf" +// import Judge from "llm_base_class_federate.lf" +import LlmA, LlmB, Judge from "llm_base_class_federate.lf" preamble {= import threading From 2f1a884b43f59d6cc7af7819171a3a90b4452856 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Thu, 25 Sep 2025 16:09:09 -0700 Subject: [PATCH 11/17] Added the version of files for running on different devices --- llm/src/llm_a.py | 10 +-- llm/src/llm_b.py | 10 +-- llm/src/llm_b_m2.py | 102 +++++++++++++++++++++++++++++ llm/src/llm_base_class_federate.lf | 8 +-- llm/src/llm_game_federated.lf | 26 ++------ 5 files changed, 123 insertions(+), 33 deletions(-) create mode 100644 llm/src/llm_b_m2.py diff --git a/llm/src/llm_a.py b/llm/src/llm_a.py index df5faf3..15411cd 100644 --- a/llm/src/llm_a.py +++ b/llm/src/llm_a.py @@ -1,4 +1,4 @@ -# llm_a.py — Agent 1 (7B) +# llm_a.py import torch from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig @@ -6,10 +6,10 @@ # <<< put your token here >>> hf_auth = "add token here 
" -# Model to be chosen to act as an agent +# Model model_id = "meta-llama/Llama-2-7b-chat-hf" -# Require GPU (you said it must work only on GPU) +# Require GPU has_cuda = torch.cuda.is_available() if not has_cuda: raise RuntimeError("CUDA GPU required for this configuration.") @@ -19,7 +19,7 @@ bnb_config = None if has_cuda: try: - import bitsandbytes as bnb # noqa: F401 + import bitsandbytes as bnb bnb_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="nf4", @@ -63,7 +63,7 @@ def postprocess(text: str) -> str: break return t.strip().strip(":").strip() -# Agent 1 entrypoint +# Agent 1 def agent1(q: str) -> str: prompt = f"You are a concise Q&A assistant.\n\n{q}\n" inputs = tokenizer(prompt, return_tensors="pt") diff --git a/llm/src/llm_b.py b/llm/src/llm_b.py index 513d6c2..6acb7d9 100644 --- a/llm/src/llm_b.py +++ b/llm/src/llm_b.py @@ -1,5 +1,5 @@ -# llm_b.py — Agent 2 (70B) +# llm_b.py import torch from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig @@ -7,10 +7,10 @@ # <<< put your token here >>> hf_auth = "add token here" -# Model to be chosen to act as an agent +# Model model_id_2 = "meta-llama/Llama-2-70b-chat-hf" -# Require GPU (GPU-only) +# Require GPU has_cuda = torch.cuda.is_available() if not has_cuda: raise RuntimeError("CUDA GPU required for this configuration.") @@ -20,7 +20,7 @@ bnb_config = None if has_cuda: try: - import bitsandbytes as bnb # noqa: F401 + import bitsandbytes as bnb bnb_config = BitsAndBytesConfig( load_in_4bit=True, bnb_4bit_quant_type="nf4", @@ -64,7 +64,7 @@ def postprocess(text: str) -> str: break return t.strip().strip(":").strip() -# Agent 2 entrypoint +# Agent 2 def agent2(q: str) -> str: prompt = f"You are a concise Q&A assistant.\n\n{q}\n" inputs = tokenizer_2(prompt, return_tensors="pt") diff --git a/llm/src/llm_b_m2.py b/llm/src/llm_b_m2.py new file mode 100644 index 0000000..45bad45 --- /dev/null +++ b/llm/src/llm_b_m2.py @@ -0,0 +1,102 @@ +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig + +hf_auth = "add your token here" + +model_id_2 = "google/gemma-3-270m" + +has_cuda = torch.cuda.is_available() +has_mps = torch.backends.mps.is_available() + +if has_cuda: + device = torch.device("cuda") + compute_dtype = torch.float16 +elif has_mps: + device = torch.device("mps") + compute_dtype = torch.float32 +else: + device = torch.device("cpu") + compute_dtype = torch.float32 + + +common = dict( + low_cpu_mem_usage=True, + attn_implementation="eager", +) + +#4-bit on CUDA if the device has it +if has_cuda: + try: + import bitsandbytes as bnb + common["quantization_config"] = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_quant_type="nf4", + bnb_4bit_use_double_quant=True, + bnb_4bit_compute_dtype=compute_dtype, + ) + common["device_map"] = "auto" + except Exception: + print("[WARN] bitsandbytes not available; using full-precision fp16 on CUDA.", flush=True) + common["device_map"] = "auto" +else: + common["device_map"] = None + +# Tokenizer +tok_kwargs = dict(use_fast=True) +if hf_auth: + tok_kwargs["token"] = hf_auth +tokenizer_2 = AutoTokenizer.from_pretrained(model_id_2, **tok_kwargs) +if tokenizer_2.pad_token_id is None: + tokenizer_2.pad_token = tokenizer_2.eos_token + +# Model +mp_kwargs = dict(dtype=compute_dtype, **common) +if hf_auth: + mp_kwargs["token"] = hf_auth + +model_2 = AutoModelForCausalLM.from_pretrained(model_id_2, **mp_kwargs) +if not has_cuda: + model_2.to(device) +model_2.eval() + +# Greedy decoding +GEN_B = dict( + max_new_tokens=32, 
+ do_sample=True, + eos_token_id=tokenizer_2.eos_token_id, + pad_token_id=tokenizer_2.pad_token_id, +) + +def postprocess(text: str) -> str: + t = text.strip() + for sep in ["\n", ". ", " "]: + i = t.find(sep) + if i > 0: + t = t[:i] + break + return t.strip().strip(":").strip() + +def agent2(q: str) -> str: + prompt = f"You are a concise Q&A assistant.\n\n{q}\n" + inputs = tokenizer_2(prompt, return_tensors="pt") + + if has_cuda: + inputs = {k: v.to("cuda") for k, v in inputs.items()} + elif has_mps: + inputs = {k: v.to("mps") for k, v in inputs.items()} + else: + inputs = {k: v.to("cpu") for k, v in inputs.items()} + + with torch.inference_mode(): + out = model_2.generate(**inputs, **GEN_B) + + prompt_len = inputs["input_ids"].shape[1] + result = tokenizer_2.decode(out[0][prompt_len:], skip_special_tokens=True) + print(result) + return postprocess(result) + +# def main(): +# agent2("what is AI?") + +# if __name__ == "__main__": +# main() \ No newline at end of file diff --git a/llm/src/llm_base_class_federate.lf b/llm/src/llm_base_class_federate.lf index 235b703..194d49a 100644 --- a/llm/src/llm_base_class_federate.lf +++ b/llm/src/llm_base_class_federate.lf @@ -56,7 +56,7 @@ reactor LlmA { =} } -// ### Reactor for calling agent 2 +### Reactor for calling agent 2 reactor LlmB { state th state running = False @@ -76,7 +76,7 @@ reactor LlmB { try: here = os.path.dirname(__file__) if here not in sys.path: sys.path.insert(0, here) - from llm_b import agent2 + from llm_b_m2 import agent2 act.schedule(1) except Exception as e: print("[LlmB] Preload failed:", e, flush=True) @@ -95,7 +95,7 @@ reactor LlmB { if self.running: return self.running = True q = user_in.value - from llm_b import agent2 + from llm_b_m2 import agent2 def agentB(): try: self.out_buffer = agent2(q) @@ -111,7 +111,7 @@ reactor LlmB { answer.set(self.out_buffer) =} } -// ###Judge reactor to determine which agent responds first +###Judge reactor to determine which agent responds first reactor Judge { state th diff --git a/llm/src/llm_game_federated.lf b/llm/src/llm_game_federated.lf index 5111854..d2b0834 100644 --- a/llm/src/llm_game_federated.lf +++ b/llm/src/llm_game_federated.lf @@ -1,33 +1,21 @@ ### llm.py file needs to be in the same directory -target Python { keepalive: true, files: ["llm_a.py", "llm_b.py"] } -// import KeyboardInput from "llm_base_class_federate.lf" -// import LlmA from "llm_base_class_federate.lf" -// import LlmB from "llm_base_class_federate.lf" -// import Judge from "llm_base_class_federate.lf" +target Python { keepalive: true, files: ["llm_a.py", "llm_b_m2.py" ] } #"llm_b.py" + import LlmA, LlmB, Judge from "llm_base_class_federate.lf" preamble {= import threading import time from llm_a import agent1 - from llm_b import agent2 + from llm_b_m2 import agent2 =} federated reactor llm_game_federated at 10.218.100.95 { - // llma_response_f = new LlmA() - // llmb_response_f = new LlmB() - // keyboard_f = new KeyboardInput() - // j_f = new Judge() - - // keyboard_f.prompt -> j_f.query - // j_f.ask -> llma_response_f.user_in - // j_f.ask -> llmb_response_f.user_in - // llma_response_f.answer -> j_f.llma - // llmb_response_f.answer -> j_f.llmb - j = new Judge() - llma = new LlmA() - llmb = new LlmB() + + j = new Judge() + llma = new LlmA() + llmb = new LlmB() j.ask -> llma.user_in j.ask -> llmb.user_in From 1958fbb8aceeff3634d074a8ba8d3452e6e4337d Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Thu, 9 Oct 2025 11:01:23 -0700 Subject: [PATCH 12/17] Adding a python script for llama 3.2 1B for jetson orin 
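
This adds llm_b_jetson.py, which loads meta-llama/Llama-3.2-1B (float16 on
CUDA, float32 otherwise) and exposes an agent2() entry point for the LlmB
federate. A quick standalone check, mirroring the __main__ block at the end
of the new file; it assumes Hugging Face access to the gated model is already
configured (token or cached weights):

    # Run from llm/src. Importing llm_b_jetson loads the Llama 3.2 1B
    # weights at module import time, so the first call is slow.
    from llm_b_jetson import agent2
    print(agent2("What is the capital of Japan?"))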
--- llm/src/llm_b_jetson.py | 52 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 llm/src/llm_b_jetson.py diff --git a/llm/src/llm_b_jetson.py b/llm/src/llm_b_jetson.py new file mode 100644 index 0000000..7dc94fa --- /dev/null +++ b/llm/src/llm_b_jetson.py @@ -0,0 +1,52 @@ +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer + +hf_auth = "" + +model_id = "meta-llama/Llama-3.2-1B" + +has_cuda = torch.cuda.is_available() +device = torch.device("cuda" if has_cuda else "cpu") +compute_dtype = torch.float16 if has_cuda else torch.float32 + +common = dict( + low_cpu_mem_usage=True, + attn_implementation="eager", +) + +tok_kwargs = dict(use_fast=True) +if hf_auth: + tok_kwargs["token"] = hf_auth + +tokenizer = AutoTokenizer.from_pretrained(model_id, **tok_kwargs) +if tokenizer.pad_token_id is None: + tokenizer.pad_token = tokenizer.eos_token + +mp_kwargs = dict(torch_dtype=compute_dtype, **common) +if hf_auth: + mp_kwargs["token"] = hf_auth + +model = AutoModelForCausalLM.from_pretrained(model_id, **mp_kwargs) +model.to(device) +model.eval() + +GEN = dict( + max_new_tokens=64, + do_sample=True, + temperature=0.7, + top_p=0.95, + eos_token_id=tokenizer.eos_token_id, + pad_token_id=tokenizer.pad_token_id, +) + +def agent2(q: str) -> str: + prompt = f"You are a concise Q&A assistant.\n\n{q}\n" + inputs = tokenizer(prompt, return_tensors="pt").to(device) + with torch.inference_mode(): + out = model.generate(**inputs, **GEN) + gen = out[0, inputs["input_ids"].shape[1]:] + return tokenizer.decode(gen, skip_special_tokens=True).strip() + +if __name__ == "__main__": + question = "What is the capital of Japan?" + print(agent2(question)) From 60f642d11f09fc6f49a9c3fce21401f743f07674 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Thu, 9 Oct 2025 12:27:06 -0700 Subject: [PATCH 13/17] commented the code for testing --- llm/src/llm_b_jetson.py | 87 ++++++++++++++++++++++------------------- 1 file changed, 46 insertions(+), 41 deletions(-) diff --git a/llm/src/llm_b_jetson.py b/llm/src/llm_b_jetson.py index 7dc94fa..40461ed 100644 --- a/llm/src/llm_b_jetson.py +++ b/llm/src/llm_b_jetson.py @@ -1,52 +1,57 @@ -import torch -from transformers import AutoModelForCausalLM, AutoTokenizer +# import torch +# from transformers import AutoModelForCausalLM, AutoTokenizer -hf_auth = "" +# hf_auth = "" -model_id = "meta-llama/Llama-3.2-1B" +# model_id = "meta-llama/Llama-3.2-1B" -has_cuda = torch.cuda.is_available() -device = torch.device("cuda" if has_cuda else "cpu") -compute_dtype = torch.float16 if has_cuda else torch.float32 +# has_cuda = torch.cuda.is_available() +# device = torch.device("cuda" if has_cuda else "cpu") +# compute_dtype = torch.float16 if has_cuda else torch.float32 -common = dict( - low_cpu_mem_usage=True, - attn_implementation="eager", -) +# common = dict( +# low_cpu_mem_usage=True, +# attn_implementation="eager", +# ) -tok_kwargs = dict(use_fast=True) -if hf_auth: - tok_kwargs["token"] = hf_auth +# tok_kwargs = dict(use_fast=True) +# if hf_auth: +# tok_kwargs["token"] = hf_auth -tokenizer = AutoTokenizer.from_pretrained(model_id, **tok_kwargs) -if tokenizer.pad_token_id is None: - tokenizer.pad_token = tokenizer.eos_token +# tokenizer = AutoTokenizer.from_pretrained(model_id, **tok_kwargs) +# if tokenizer.pad_token_id is None: +# tokenizer.pad_token = tokenizer.eos_token -mp_kwargs = dict(torch_dtype=compute_dtype, **common) -if hf_auth: - mp_kwargs["token"] = hf_auth +# mp_kwargs = dict(torch_dtype=compute_dtype, **common) +# if 
hf_auth: +# mp_kwargs["token"] = hf_auth -model = AutoModelForCausalLM.from_pretrained(model_id, **mp_kwargs) -model.to(device) -model.eval() +# model = AutoModelForCausalLM.from_pretrained(model_id, **mp_kwargs) +# model.to(device) +# model.eval() + +# GEN = dict( +# max_new_tokens=64, +# do_sample=True, +# temperature=0.7, +# top_p=0.95, +# eos_token_id=tokenizer.eos_token_id, +# pad_token_id=tokenizer.pad_token_id, +# ) + +# def agent2(q: str) -> str: +# prompt = f"You are a concise Q&A assistant.\n\n{q}\n" +# inputs = tokenizer(prompt, return_tensors="pt").to(device) +# with torch.inference_mode(): +# out = model.generate(**inputs, **GEN) +# gen = out[0, inputs["input_ids"].shape[1]:] +# return tokenizer.decode(gen, skip_special_tokens=True).strip() + +# if __name__ == "__main__": +# question = "What is the capital of Japan?" +# print(agent2(question)) -GEN = dict( - max_new_tokens=64, - do_sample=True, - temperature=0.7, - top_p=0.95, - eos_token_id=tokenizer.eos_token_id, - pad_token_id=tokenizer.pad_token_id, -) def agent2(q: str) -> str: - prompt = f"You are a concise Q&A assistant.\n\n{q}\n" - inputs = tokenizer(prompt, return_tensors="pt").to(device) - with torch.inference_mode(): - out = model.generate(**inputs, **GEN) - gen = out[0, inputs["input_ids"].shape[1]:] - return tokenizer.decode(gen, skip_special_tokens=True).strip() - -if __name__ == "__main__": - question = "What is the capital of Japan?" - print(agent2(question)) + + return "Hello this is jetson" \ No newline at end of file From 6a26cab3fc73cff14795fee19cdd107777949023 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Thu, 9 Oct 2025 12:29:42 -0700 Subject: [PATCH 14/17] Testing Jetson --- llm/src/llm_game_federated.lf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/src/llm_game_federated.lf b/llm/src/llm_game_federated.lf index d2b0834..863abca 100644 --- a/llm/src/llm_game_federated.lf +++ b/llm/src/llm_game_federated.lf @@ -1,5 +1,5 @@ ### llm.py file needs to be in the same directory -target Python { keepalive: true, files: ["llm_a.py", "llm_b_m2.py" ] } #"llm_b.py" +target Python { keepalive: true, files: ["llm_a.py", "llm_b_jetson.py" ] } #"llm_b.py" import LlmA, LlmB, Judge from "llm_base_class_federate.lf" From aef0ac957c855200119bbab82a2e7c8dce735c59 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Thu, 9 Oct 2025 12:43:07 -0700 Subject: [PATCH 15/17] Changed the file names in base class --- llm/src/llm_base_class_federate.lf | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm/src/llm_base_class_federate.lf b/llm/src/llm_base_class_federate.lf index 194d49a..14412a2 100644 --- a/llm/src/llm_base_class_federate.lf +++ b/llm/src/llm_base_class_federate.lf @@ -76,7 +76,7 @@ reactor LlmB { try: here = os.path.dirname(__file__) if here not in sys.path: sys.path.insert(0, here) - from llm_b_m2 import agent2 + from llm_b_jetson import agent2 act.schedule(1) except Exception as e: print("[LlmB] Preload failed:", e, flush=True) @@ -95,7 +95,7 @@ reactor LlmB { if self.running: return self.running = True q = user_in.value - from llm_b_m2 import agent2 + from llm_b_jetson import agent2 def agentB(): try: self.out_buffer = agent2(q) From c4c635372ea379070969773de15f8d95a1ac3e16 Mon Sep 17 00:00:00 2001 From: Deeksha-20-99 Date: Thu, 9 Oct 2025 12:49:56 -0700 Subject: [PATCH 16/17] Changed the RTI to jetson --- llm/src/llm_game_federated.lf | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/src/llm_game_federated.lf b/llm/src/llm_game_federated.lf index 
863abca..3a9f677 100644
--- a/llm/src/llm_game_federated.lf
+++ b/llm/src/llm_game_federated.lf
@@ -11,7 +11,7 @@ preamble {=
 
 =}
 
-federated reactor llm_game_federated at 10.218.100.95 {
+federated reactor llm_game_federated at 10.155.214.175 {
 
   j = new Judge()
   llma = new LlmA()

From 9d503d53028c700ac9a00f1ec33b851a424fb407 Mon Sep 17 00:00:00 2001
From: Deeksha-20-99 
Date: Thu, 9 Oct 2025 14:12:41 -0700
Subject: [PATCH 17/17] Corrected the IP address for the Jetson Orin

---
 llm/src/llm_game_federated.lf | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm/src/llm_game_federated.lf b/llm/src/llm_game_federated.lf
index 3a9f677..d2b745c 100644
--- a/llm/src/llm_game_federated.lf
+++ b/llm/src/llm_game_federated.lf
@@ -11,7 +11,7 @@ preamble {=
 
 =}
 
-federated reactor llm_game_federated at 10.155.214.175 {
+federated reactor llm_game_federated at 10.155.241.175 {
 
   j = new Judge()
   llma = new LlmA()
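
Note: in these later patches, switching the LlmB federate between llm_b.py,
llm_b_m2.py, and llm_b_jetson.py is done by editing the import inside
llm_base_class_federate.lf and the files list in llm_game_federated.lf. Below
is a minimal sketch of one alternative, assuming the backend scripts stay
importable from the source directory; the LLM_B_BACKEND environment variable
and the load_agent2() helper are illustrative only, not part of these patches:

    # Illustrative only: pick the agent-2 backend module at run time instead
    # of editing the reactor each time. Defaults to the Jetson stub from
    # patch 13.
    import importlib
    import os

    def load_agent2():
        # Return the agent2() callable from the module named by LLM_B_BACKEND.
        backend = os.environ.get("LLM_B_BACKEND", "llm_b_jetson")
        return importlib.import_module(backend).agent2

    if __name__ == "__main__":
        agent2 = load_agent2()
        print(agent2("What is the capital of Japan?"))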