openai compatible worker #314

Draft · wants to merge 30 commits into main

Commits (30)
1720b78
LiveKit Pipeline Agent (#4)
benxu3 Nov 26, 2024
de2a7cb
add token in qr
benxu3 Nov 26, 2024
76f0847
fix dependency issues
benxu3 Nov 26, 2024
37198d8
Merge branch 'main' of https://github.com/benxu3/01
benxu3 Nov 26, 2024
2f53be0
update livekit server and profile docs
benxu3 Dec 9, 2024
f6c13a1
send multimodal message on startup of multimodal agent
benxu3 Dec 9, 2024
07672f4
add voice assistant state communication and clear chat context
benxu3 Dec 9, 2024
178ffc8
update logging with debug env variable
benxu3 Dec 9, 2024
c2de04a
update profiles to be compatible with new interpreter
benxu3 Dec 9, 2024
bba33db
update server to use new interpreter
benxu3 Dec 9, 2024
ba2813d
upgrade interpreter and livekit agents
benxu3 Dec 9, 2024
cedda96
use participant token in meet_url
benxu3 Dec 9, 2024
8f6d5fd
remove assistant fnc
benxu3 Dec 9, 2024
4e77a57
remove duplicate fnc_ctx declaration
benxu3 Dec 9, 2024
84e05db
add local setup docs
benxu3 Dec 30, 2024
095b704
add basic interrupt logic
benxu3 Dec 30, 2024
6084e25
refactor logging outside logic
benxu3 Dec 30, 2024
bd6f530
replace hosted livekit meet with local meet link
benxu3 Dec 30, 2024
6110e70
add local stt & tts, add anticipation logic, remove video context acc…
benxu3 Dec 30, 2024
4c271b1
remove separate transcriptions
benxu3 Dec 30, 2024
f68f83c
update local and default profile
benxu3 Dec 30, 2024
3f6ba52
add meet flag and better error handling
benxu3 Dec 30, 2024
7207add
run worker in dev mode
benxu3 Dec 30, 2024
0c6a2cd
fix error on local tts docs
benxu3 Dec 31, 2024
f989731
move tts and stt to 9001 and 9002
benxu3 Dec 31, 2024
ab8055e
draft main cli
benxu3 Jan 1, 2025
a2f86af
make request based on updated chat ctx in anticipation
benxu3 Jan 1, 2025
ce52aa6
fix cli bug in main
benxu3 Jan 1, 2025
16fb2b3
remove test.py
benxu3 Jan 1, 2025
8c89960
revert anticipation to default
benxu3 Jan 1, 2025
Changes from 1 commit:
add basic interrupt logic
benxu3 committed Dec 30, 2024

commit 095b704da488b47716dbc0531e6f166f41e40c01
157 changes: 157 additions & 0 deletions software/source/server/livekit/anticipation.py
@@ -0,0 +1,157 @@
from typing import Any, Dict
import json
import base64
import traceback
import io
from PIL import Image as PIL_Image

from openai import OpenAI
from livekit import rtc
from livekit.agents.llm import ChatContext
from livekit.agents.pipeline import VoicePipelineAgent
from source.server.livekit.logger import log_message


# Prompt used to check each sampled video frame for instruction violations
INSTRUCTIONS_PROMPT = """Given the conversation context and the current video frame, evaluate if any instructions have been violated.
Rate the severity of violation from 0-10, where 10 is most severe.
Instructions to check:
1. Ensure that the screenshot is NOT YOUTUBE or other video content
Respond in the following JSON format:
{
    "violation_detected": boolean,
    "severity_rating": number,
    "violation_summary": string,
    "recommendations": string
}
"""


# Callback invoked by the video processor whenever a frame should be checked
async def handle_instruction_check(
    assistant: VoicePipelineAgent,
    video_frame: rtc.VideoFrame,
):
    """Handle a safety-check callback from the video processor"""
    log_message("Starting instruction check process...")

    try:
        log_message("Calling check_instruction_violation...")
        result = await check_instruction_violation(
            chat_ctx=assistant.chat_ctx,
            video_frame=video_frame,
        )

        log_message(f"Instruction check result: {json.dumps(result, indent=2)}")

        if result["violation_detected"] and result["severity_rating"] >= 7:
            log_message(f"Violation detected with severity {result['severity_rating']}, triggering assistant response")

            # Append the violation to the chat context
            violation_text = f"Safety violation detected: {result['violation_summary']}\nRecommendations: {result['recommendations']}"
            assistant.chat_ctx.append(
                role="user",
                text=violation_text
            )
            log_message(f"Added violation to chat context: {violation_text}")

            # Build the spoken response (currently predetermined; see TODO below)
            response = f"I noticed that {result['violation_summary']}. {result['recommendations']}"
            log_message(f"Triggering assistant response: {response}")

            # TODO: instead of saying the predetermined response, trigger a full assistant response here;
            # we can append the current video frame that triggered the violation to the chat context
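            # Hypothetical sketch (not part of this commit): livekit-agents'
            # ChatImage accepts an rtc.VideoFrame, so the triggering frame
            # could be attached to the context before the LLM call, e.g.:
            #   from livekit.agents.llm import ChatImage
            #   assistant.chat_ctx.append(role="user", images=[ChatImage(image=video_frame)])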
            # livekit-agents' LLM.chat() requires a chat context; pass the
            # assistant's current one so the response reflects the appended violation
            stream = assistant.llm.chat(chat_ctx=assistant.chat_ctx)

            await assistant.say(stream)
        else:
            log_message("No significant violations detected or severity below threshold")
    except Exception as e:
        log_message(f"Error in handle_instruction_check: {str(e)}")
        log_message(f"Error traceback: {traceback.format_exc()}")


# Build and send the vision-model request for a single frame
async def check_instruction_violation(
    chat_ctx: ChatContext,
    video_frame: rtc.VideoFrame,
) -> Dict[str, Any]:
    """Call an OpenAI vision model to check the current frame for instruction violations"""
    log_message("Creating new context for instruction check...")

    try:
        client = OpenAI()

        try:
            # Get the raw RGBA data from the video frame
            frame_data = video_frame.data.tobytes()

            # Create a PIL image from the RGBA data
            image = PIL_Image.frombytes('RGBA', (video_frame.width, video_frame.height), frame_data)

            # Convert RGBA to RGB
            rgb_image = image.convert('RGB')

            # Save as JPEG
            buffer = io.BytesIO()
            rgb_image.save(buffer, format='JPEG')
            jpeg_bytes = buffer.getvalue()

            log_message(f"Got frame data, size: {len(jpeg_bytes)} bytes")
            base64_image = base64.b64encode(jpeg_bytes).decode("utf-8")
            log_message("Successfully encoded frame to base64")
        except Exception as e:
            log_message(f"Error encoding frame: {str(e)}")
            raise

        # Get the response
        log_message("Making call to LLM for instruction check...")
        try:
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    # TODO: append the chat context to the prompt without images -- we'll need to parse them out
                    {
                        "role": "user",
                        "content": [
                            {"type": "text", "text": INSTRUCTIONS_PROMPT},
                            {
                                "type": "image_url",
                                "image_url": {
                                    "url": f"data:image/jpeg;base64,{base64_image}",
                                },
                            },
                        ],
                    }
                ],
                max_tokens=300,
            )
            log_message(f"Raw LLM response: {response}")
        except Exception as e:
            log_message(f"Error making LLM call: {str(e)}")
            raise

        try:
            # Parse the response content (the prompt asks the model for bare JSON)
            result = json.loads(response.choices[0].message.content)
            log_message(f"Successfully parsed LLM response: {json.dumps(result, indent=2)}")
            return result
        except Exception as e:
            log_message(f"Error parsing LLM response: {str(e)}")
            raise

    except Exception as e:
        log_message(f"Failed to process instruction check: {str(e)}")
        log_message(f"Error traceback: {traceback.format_exc()}")
        default_response = {
            "violation_detected": False,
            "severity_rating": 0,
            "violation_summary": f"Error processing instruction check: {str(e)}",
            "recommendations": "None"
        }
        log_message(f"Returning default response: {json.dumps(default_response, indent=2)}")
        return default_response
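
The diff above defines handle_instruction_check but does not show a call site. A minimal sketch of how a video processor might invoke it follows, assuming frames are sampled on a fixed interval; the watch_video_track name, the 5-second interval, and the track/assistant wiring are illustrative assumptions, not part of this commit.

import asyncio

from livekit import rtc
from livekit.agents.pipeline import VoicePipelineAgent

from source.server.livekit.anticipation import handle_instruction_check


async def watch_video_track(assistant: VoicePipelineAgent, track: rtc.RemoteVideoTrack):
    """Hypothetical caller: sample a frame every few seconds and run the check"""
    video_stream = rtc.VideoStream(track)
    check_interval = 5.0  # assumed sampling interval, in seconds
    last_check = 0.0

    async for event in video_stream:
        now = asyncio.get_running_loop().time()
        if now - last_check >= check_interval:
            last_check = now
            # Run the check concurrently so frame consumption isn't blocked
            # while the vision call is in flight
            asyncio.create_task(handle_instruction_check(assistant, event.frame))

Scheduling the check with asyncio.create_task keeps the frame loop responsive, which matters because check_instruction_violation makes a network round trip per sampled frame.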