From 874e5b18180f3af6c733bb2897405bf1c6f12267 Mon Sep 17 00:00:00 2001
From: Berkay <berkay.bayar21@outlook.com>
Date: Fri, 15 Aug 2025 11:51:57 +0300
Subject: [PATCH] added ollama support.

---
 README.md    |  24 ++++-
 ollama_dj.py | 266 +++++++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 287 insertions(+), 3 deletions(-)
 create mode 100644 ollama_dj.py
diff --git a/README.md b/README.md
index 8208092..57a1e97 100644
--- a/README.md
+++ b/README.md
@@ -20,6 +20,7 @@ Infinite Radio generates endless music that automatically changes based on your
 ## Prerequisites
 
 For running the music model locally, you will need:
+
 - **Docker** with GPU support
 - **NVIDIA GPU** with CUDA support
 - **[NVIDIA Container Toolkit](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html)**
@@ -27,6 +28,7 @@ For running the music model locally, you will need:
 ## Music Model
 
 1. **Run the Docker Container from [Dockerhub](https://hub.docker.com/repository/docker/lauriewired/musicbeats/general):**
+
    ```bash
    docker run --gpus all --network host lauriewired/musicbeats:latest
    ```
@@ -34,7 +36,7 @@ For running the music model locally, you will need:
 2. **Access the web interface:**
    - Open your browser and navigate to `http://127.0.0.1:8080` or the IP where the music container is running
    - Click the play button to start streaming
-  
+
 ## Running a DJ
 
 ## Option 1: Running the DJ on MacOS
@@ -42,6 +44,7 @@ For running the music model locally, you will need:
 The Mac application can start the Process DJ or connect to the LLM DJ. It lives as a tray application to easily configure and examine the music control. **Note:** When using the Mac application, you may need to provide additional permissions to allow the DJ to examine your screen to dynamically select the genre.
 
 1. **Download the latest release:**
+
    - Go to the releases page and download the [latest version](https://github.com/LaurieWired/InfiniteRadio/releases/download/v1.0/InfiniteRadio.zip)
    - Run the .app file and Infinite Radio will appear in your tray
 
@@ -49,7 +52,7 @@ The Mac application can start the Process DJ or connect to the LLM DJ. It lives
 
 3. **Select and run your DJ of choice**
    - You can run the process DJ immediately or choose the LLM DJ
-   - If selecting the LLM DJ, ensure the model server is running already in [LM Studio](https://lmstudio.ai) (See *Option 3* below for an example although you may skip the python step when using the Mac app)
+   - If selecting the LLM DJ, ensure the model server is running already in [LM Studio](https://lmstudio.ai) (See _Option 3_ below for an example although you may skip the python step when using the Mac app)
 
 ## Option 2: Running Process DJ with Python
 
@@ -66,7 +69,7 @@ The LLM DJ analyzes the data on your screen to automatically configure the genre
 1. **Run the LLM in LM Studio:**
    - Download [InternVL3](https://huggingface.co/OpenGVLab/InternVL3-2B) (or any image to text model)
    - Start the server in LM Studio
-  
+
 <img src="images/lm_studio.png" alt="lm_studio" width="400"/>
 
 2. **Run the Python Connection:**
@@ -74,6 +77,21 @@ The LLM DJ analyzes the data on your screen to automatically configure the genre
    python llm_dj.py 127.0.0.1 8080 # Point this to the IP and port of the music model
    ```
 
+## Option 4: Running the LLM DJ with Ollama (Alternative)
+
+The Ollama DJ provides the same functionality as the LLM DJ (Option 3), but uses Ollama, a lighter, easier-to-use local LLM server, instead of LM Studio.
+
+1. **Pull and run the InternVL3 model** (`ollama run` downloads the model the first time it is used):
+
+   ```bash
+   ollama run hf.co/mradermacher/InternVL3-2B-GGUF:Q8_0
+   ```
+
+2. **Run the Python Connection:**
+   ```bash
+   python ollama_dj.py 127.0.0.1 8080 # Point this to the IP and port of the music model
+   ```
+
 # API Reference
 
 ## Change Genre
diff --git a/ollama_dj.py b/ollama_dj.py
new file mode 100644
index 0000000..34493af
--- /dev/null
+++ b/ollama_dj.py
@@ -0,0 +1,266 @@
+#!/usr/bin/env python3
+"""
+Ollama DJ - Uses Ollama to determine music genre based on activity
+Alternative to llm_dj.py that works with Ollama instead of LM Studio
+"""
+
+import time
+import sys
+import requests
+import argparse
+import json
+import base64
+from io import BytesIO
+from PIL import Image
+import mss
+from openai import OpenAI
+
+
+def examine_activity(debug=False, monitor_index=0):
+    """Grab one monitor via mss, shrink it to fit 1024x1024, and return the PNG as a base64 string (None on failure); debug=True also opens a preview."""
+    try:
+        with mss.mss() as sct:
+            # Take screenshot to share to the LLM
+            # monitor_index 0 = all monitors combined, 1+ = specific monitor
+            if monitor_index >= len(sct.monitors):  # only the upper bound is checked here
+                print(f"   WARNING: Monitor {monitor_index} not found, using all monitors")
+                monitor_index = 0
+            
+            monitor = sct.monitors[monitor_index]
+            if monitor_index == 0:
+                print(f"   Examining all monitors combined")
+            else:
+                print(f"   Examining monitor {monitor_index}")
+            screenshot = sct.grab(monitor)
+            
+            # Convert the raw BGRA pixel buffer from mss into an RGB PIL Image
+            img = Image.frombytes("RGB", screenshot.size, screenshot.bgra, "raw", "BGRX")
+            
+            # Resize image to reduce file size (optional, but recommended for LLM processing)
+            # Keep aspect ratio but limit max dimension to 1024px
+            max_size = 1024
+            if img.width > max_size or img.height > max_size:
+                img.thumbnail((max_size, max_size), Image.Resampling.LANCZOS)  # resizes img in place
+            
+            if debug:
+                print("   DEBUG: Opening screenshot preview...")
+                img.show()
+            
+            # Encode as PNG into an in-memory buffer, then base64 the bytes into a str
+            buffer = BytesIO()
+            img.save(buffer, format="PNG")
+            img_str = base64.b64encode(buffer.getvalue()).decode()
+            
+            return img_str
+    except Exception as e:  # any capture/encode error is reported and turned into a None result
+        print(f"ERROR: Failed to take screenshot: {e}")
+        return None
+
+
+def get_genre_from_ollama(client, model_name, screenshot_b64):
+    """Send the base64 PNG to model_name through client's chat-completions API and return the genre string it names, or None if the call or parsing fails."""
+    try:
+        print(f"-> Analyzing activity with Ollama model '{model_name}'...")
+        
+        # The JSON shape is requested only through the system prompt; no schema/response_format argument is passed
+        response = client.chat.completions.create(
+            model=model_name,
+            messages=[
+                {
+                    "role": "system",
+                    "content": "### SYSTEM\nYou are given one image.\n\n### INSTRUCTION\n1. Silently infer what the user is doing in the screenshot.\n2. Pick one 1-2-word music genre that fits the activity.\n   *Think step-by-step internally only.*\n3. Return a JSON object that conforms to the provided schema.\n   **Do not output anything else.**\n\n### RESPONSE FORMAT\n{\"music_genre\": \"<genre>\"}"
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": f"data:image/png;base64,{screenshot_b64}"
+                            }
+                        }
+                    ]
+                }
+            ],
+            max_tokens=50,
+            temperature=0.0
+        )
+        
+        content = response.choices[0].message.content
+        
+        try:
+            # First, strip any markdown code blocks that might wrap the JSON
+            import re
+            # Remove ```json and ``` markers (MULTILINE: matched at the start/end of any line, not just the whole text)
+            cleaned_content = re.sub(r'^```(?:json)?\s*\n?', '', content.strip(), flags=re.MULTILINE)
+            cleaned_content = re.sub(r'\n?```\s*$', '', cleaned_content, flags=re.MULTILINE)
+            
+            genre_data = json.loads(cleaned_content.strip())
+            if "music_genre" in genre_data and isinstance(genre_data["music_genre"], str):
+                return genre_data["music_genre"]
+            else:
+                print(f"   WARNING: 'music_genre' key missing or invalid in Ollama response: {content}")
+                return None
+        except json.JSONDecodeError:
+            print(f"   WARNING: Could not parse JSON from Ollama response: {content}")
+            # Fallback: search the original (uncleaned) text for an embedded {"music_genre": "..."} object
+            import re
+            match = re.search(r'\{"music_genre":\s*"([^"]+)"\}', content)
+            if match:
+                return match.group(1)
+            return None
+        
+    except Exception as e:  # also catches API/client errors and non-JSON failures raised inside the inner try
+        print(f"   ERROR: Failed to get genre from Ollama: {e}")
+        return None
+
+
+def change_server_genre(server_ip, server_port, genre):
+    """POST the genre as JSON to http://<server_ip>:<server_port>/genre; return True if the request succeeded, False on any requests exception."""
+    url = f"http://{server_ip}:{server_port}/genre"
+    payload = {"genre": genre}
+    print(f"-> Attempting to change genre to '{genre}'...")
+    try:
+        response = requests.post(url, json=payload, timeout=5)  # give up after 5 seconds
+        response.raise_for_status()  # HTTP error statuses are routed to the except branch below
+        print(f"   SUCCESS: Genre changed to '{response.json().get('genre', genre)}'.")  # prefer the genre echoed by the server
+        return True
+    except requests.exceptions.RequestException as e:  # the message says "connect" but this covers every requests error
+        print(f"   ERROR: Could not connect to the music server at {url}. Details: {e}")
+        return False
+
+
+def check_ollama_connection(ollama_url, model_name):
+    """Return True if GET <ollama_url>/api/tags succeeds and lists model_name (with or without an explicit ':latest' tag), else print a hint and return False."""
+    try:
+        # Check if Ollama is running
+        response = requests.get(f"{ollama_url}/api/tags", timeout=5)
+        response.raise_for_status()
+        
+        # Check if the model is available ("or []" also covers a "models" key that is present but null)
+        models = response.json().get("models") or []
+        model_names = [model.get("name", "") for model in models]
+        
+        if model_name not in model_names and f"{model_name}:latest" not in model_names:  # untagged name means ':latest'
+            print(f"   WARNING: Model '{model_name}' not found in Ollama.")
+            print(f"   Available models: {', '.join(model_names)}")
+            print(f"   You can pull the model with: ollama pull {model_name}")
+            return False
+        
+        print(f"   SUCCESS: Ollama is running and model '{model_name}' is available.")
+        return True
+        
+    except requests.exceptions.RequestException as e:  # connection failure, timeout, or HTTP error status
+        print(f"   ERROR: Could not connect to Ollama at {ollama_url}. Details: {e}")
+        print(f"   Make sure Ollama is running with: ollama serve")
+        return False
+
+
+def main(args):
+    """Run the DJ loop: verify Ollama, then every args.interval seconds screenshot -> ask the model for a genre -> POST it to the music server when it changed."""
+    ollama_url = f"http://{args.ollama_host}:{args.ollama_port}"
+    
+    print("--- Ollama DJ Starting ---")
+    print(f"Screen Activity Analysis every {args.interval} seconds")
+    print(f"Ollama URL: {ollama_url}")
+    print(f"Ollama Model: {args.model}")
+    print(f"Music Server: http://{args.music_ip}:{args.music_port}/genre")
+    
+    # Show monitor info (informational only; a failure here is printed and startup continues)
+    try:
+        with mss.mss() as sct:
+            if args.monitor == 0:
+                print(f"Monitor: All monitors combined")
+            elif args.monitor < len(sct.monitors):
+                monitor = sct.monitors[args.monitor]
+                print(f"Monitor: Monitor {args.monitor} ({monitor['width']}x{monitor['height']})")
+            else:
+                print(f"Monitor: {args.monitor} (will fallback to all monitors)")
+    except Exception as e:
+        print(f"Monitor: Unable to detect monitor info - {e}")
+    
+    print("Press Ctrl+C to stop.")
+    
+    # Check Ollama connection and model availability; abort with exit status 1 if the check fails
+    if not check_ollama_connection(ollama_url, args.model):
+        print("   Exiting due to Ollama connection issues.")
+        sys.exit(1)
+    
+    # Initialize the OpenAI client to point to Ollama
+    client = OpenAI(
+        base_url=f"{ollama_url}/v1", 
+        api_key="ollama"  # Ollama doesn't require a real API key
+    )
+    
+    last_genre = None  # last genre the music server accepted; None until the first successful change
+    
+    try:
+        while True:
+            print(f"\n--- Screen Activity Analysis cycle at {time.strftime('%H:%M:%S')} ---")
+            
+            # Take screenshot; a falsy result means capture failed, so wait and retry next cycle
+            screenshot_b64 = examine_activity(debug=args.debug, monitor_index=args.monitor)
+            if not screenshot_b64:
+                print("   Skipping this cycle due to screenshot failure.")
+                time.sleep(args.interval)
+                continue
+            
+            # Pass the client instance to the function
+            suggested_genre = get_genre_from_ollama(client, args.model, screenshot_b64)
+            if not suggested_genre:
+                print("   No genre suggestion received from Ollama.")
+                time.sleep(args.interval)
+                continue
+            
+            print(f"   Ollama suggested genre: '{suggested_genre}'")
+            
+            # Only change if it's different from the last genre (case-insensitive; an initial None compares as "none")
+            if suggested_genre.lower() != str(last_genre).lower():
+                if change_server_genre(args.music_ip, args.music_port, suggested_genre):
+                    last_genre = suggested_genre  # remembered only after the server accepted it, so failures are retried
+                else:
+                    print("   Failed to change genre on music server.")
+            else:
+                print("   Genre unchanged, skipping server update.")
+            
+            # Wait for next cycle
+            time.sleep(args.interval)
+            
+    except KeyboardInterrupt:  # Ctrl+C ends the loop without an error exit
+        print("\n--- Ollama DJ Stopping ---")
+    except Exception as e:  # anything else escaping the loop is reported and exits with status 1
+        print(f"\nAn unexpected error occurred: {e}")
+        sys.exit(1)
+
+
+if __name__ == "__main__":  # script entry point: parse CLI arguments, optionally list monitors, then run main()
+    parser = argparse.ArgumentParser(
+        description="Uses Ollama to determine music genre from screen activity."
+    )
+    parser.add_argument("music_ip", help="IP address of the music server")
+    parser.add_argument("music_port", type=int, help="Port of the music server")
+    parser.add_argument("--model", default="hf.co/mradermacher/InternVL3-2B-GGUF:Q8_0", help="Ollama model to use (default: 'hf.co/mradermacher/InternVL3-2B-GGUF:Q8_0')")
+    parser.add_argument("--ollama-host", default="localhost", help="Ollama host (default: localhost)")
+    parser.add_argument("--ollama-port", type=int, default=11434, help="Ollama port (default: 11434)")
+    parser.add_argument("--interval", type=int, default=10, help="Interval in seconds between screen analysis (default: 10)")
+    parser.add_argument("--monitor", type=int, default=1, help="Monitor to capture (0=all monitors, 1=first monitor, 2=second monitor, etc.)")  # NOTE(review): CLI default is 1, while examine_activity() itself defaults to 0
+    parser.add_argument("--list-monitors", action="store_true", help="List available monitors and exit")
+    parser.add_argument("--debug", action="store_true", help="Show screenshot preview before sending to Ollama")
+    
+    parsed_args = parser.parse_args()
+    
+    # Handle monitor listing. NOTE(review): this runs after parse_args(), so music_ip and music_port must still be given.
+    if parsed_args.list_monitors:
+        print("Available monitors:")
+        try:
+            with mss.mss() as sct:
+                for i, monitor in enumerate(sct.monitors):
+                    if i == 0:  # index 0 is the combined view of all monitors
+                        print(f"  {i}: All monitors combined ({monitor['width']}x{monitor['height']})")
+                    else:
+                        print(f"  {i}: Monitor {i} ({monitor['width']}x{monitor['height']} at {monitor['left']},{monitor['top']})")
+        except Exception as e:
+            print(f"Error listing monitors: {e}")
+        sys.exit(0)  # exits with status 0 even when listing failed
+    
+    main(parsed_args)