Two-Weeks-Team · ComBba · Feb 26, 2026 · Feb 26, 2026 · Feb 26, 2026 · chatgpt-codex-connector
diff --git a/internal/live/proxy.go b/internal/live/proxy.go
@@ -274,17 +274,36 @@ func (p *Proxy) handleServerContent(content *genai.LiveServerContent) {
 				p.sendBinary(part.InlineData.Data)
 			}
 			if part.Text != "" && !part.Thought {
-				// Capture non-thinking transcript for analyze_user context.
+				// Capture non-thinking transcript for tool context (analyze_user).
+				// Browser display uses OutputTranscription to avoid duplicates.
 				p.toolHandler.AddTranscript("model", part.Text)
-				// Forward transcript as JSON (skip model thinking/reasoning text).
-				p.sendJSON(map[string]any{
-					"type": "transcript",
-					"role": "model",
-					"text": part.Text,
-				})
 			}
 		}
 	}
+
+	// Forward input transcription (what the user said).
+	if content.InputTranscription != nil && content.InputTranscription.Text != "" {
+		// Only persist finalized user speech to tool context.
+		if content.InputTranscription.Finished {
+			p.toolHandler.AddTranscript("user", content.InputTranscription.Text)
+		}
+		p.sendJSON(map[string]any{
+			"type":     "transcript",
+			"role":     "user",
+			"text":     content.InputTranscription.Text,
+			"finished": content.InputTranscription.Finished,
+		})
+	}
+
+	// Forward output transcription (what the model said, as text).
+	if content.OutputTranscription != nil && content.OutputTranscription.Text != "" {
+		p.sendJSON(map[string]any{
+			"type":     "transcript",
+			"role":     "model",
+			"text":     content.OutputTranscription.Text,
+			"finished": content.OutputTranscription.Finished,
+		})
+	}
-	// Forward input transcription (what the user said).
-	if content.InputTranscription != nil && content.InputTranscription.Text != "" {
-		p.toolHandler.AddTranscript("user", content.InputTranscription.Text)
-		p.sendJSON(map[string]any{
-			"type":     "transcript",
-			"role":     "user",
-			"text":     content.InputTranscription.Text,
-			"finished": content.InputTranscription.Finished,
-		})
-	}
-
-	// Forward output transcription (what the model said, as text).
-	if content.OutputTranscription != nil && content.OutputTranscription.Text != "" {
-		p.sendJSON(map[string]any{
-			"type":     "transcript",
-			"role":     "model",
-			"text":     content.OutputTranscription.Text,
-			"finished": content.OutputTranscription.Finished,
-		})
-	}
+	// Forward input transcription (what the user said).
+	if content.InputTranscription != nil && content.InputTranscription.Text != "" {
+		if content.InputTranscription.Finished {
+			p.toolHandler.AddTranscript("user", content.InputTranscription.Text)
+		}
+		p.sendJSON(map[string]any{
+			"type":     "transcript",
+			"role":     "user",
+			"text":     content.InputTranscription.Text,
+			"finished": content.InputTranscription.Finished,
+		})
+	}
+
+	// Forward output transcription (what the model said, as text).
+	// Prefer a single model transcript source to avoid duplicates with ModelTurn.Part.Text.
+	if content.OutputTranscription != nil && content.OutputTranscription.Text != "" {
+		p.toolHandler.AddTranscript("model", content.OutputTranscription.Text)
+		p.sendJSON(map[string]any{
+			"type":     "transcript",
+			"role":     "model",
+			"text":     content.OutputTranscription.Text,
+			"finished": content.OutputTranscription.Finished,
+		})
+	}
-	// Forward input transcription (what the user said).
-	if content.InputTranscription != nil && content.InputTranscription.Text != "" {
-		p.toolHandler.AddTranscript("user", content.InputTranscription.Text)
-		p.sendJSON(map[string]any{
-			"type":     "transcript",
-			"role":     "user",
-			"text":     content.InputTranscription.Text,
-			"finished": content.InputTranscription.Finished,
-		})
-	}
-
-	// Forward output transcription (what the model said, as text).
-	if content.OutputTranscription != nil && content.OutputTranscription.Text != "" {
-		p.sendJSON(map[string]any{
-			"type":     "transcript",
-			"role":     "model",
-			"text":     content.OutputTranscription.Text,
-			"finished": content.OutputTranscription.Finished,
-		})
-	}
+	// Forward input transcription (what the user said).
+	if content.InputTranscription != nil && content.InputTranscription.Text != "" {
+		if content.InputTranscription.Finished {
+			p.toolHandler.AddTranscript("user", content.InputTranscription.Text)
+		}
+		p.sendJSON(map[string]any{
+			"type":     "transcript",
+			"role":     "user",
+			"text":     content.InputTranscription.Text,
+			"finished": content.InputTranscription.Finished,
+		})
+	}
+
+	// Forward output transcription (what the model said, as text).
+	// Prefer a single model transcript source to avoid duplicates with ModelTurn.Part.Text.
+	if content.OutputTranscription != nil && content.OutputTranscription.Text != "" {
+		p.toolHandler.AddTranscript("model", content.OutputTranscription.Text)
+		p.sendJSON(map[string]any{
+			"type":     "transcript",
+			"role":     "model",
+			"text":     content.OutputTranscription.Text,
+			"finished": content.OutputTranscription.Finished,
+		})
+	}
 }
 
 // handleToolCall executes a tool and sends the response back to Live API.

diff --git a/internal/session/manager.go b/internal/session/manager.go
@@ -168,6 +168,8 @@ func (m *Manager) BuildOnboardingConfig() *genai.LiveConnectConfig {
 				},
 			},
 		},
+		InputAudioTranscription:  &genai.AudioTranscriptionConfig{},
+		OutputAudioTranscription: &genai.AudioTranscriptionConfig{},
 		Proactivity: &genai.ProactivityConfig{
 			ProactiveAudio: &enableProactive,
 		},
@@ -231,7 +233,9 @@ func (m *Manager) BuildReunionConfig() *genai.LiveConnectConfig {
 				},
 			},
 		},
-		EnableAffectiveDialog: &enableAffective,
+		InputAudioTranscription:  &genai.AudioTranscriptionConfig{},
+		OutputAudioTranscription: &genai.AudioTranscriptionConfig{},
+		EnableAffectiveDialog:    &enableAffective,
 		Proactivity: &genai.ProactivityConfig{
 			ProactiveAudio: &enableProactive,
 		},

diff --git a/internal/session/manager_test.go b/internal/session/manager_test.go
@@ -45,6 +45,14 @@ func TestManager_StartOnboarding_Config(t *testing.T) {
 		t.Fatalf("expected English greeting in system instruction")
 	}
 
+	// Must have audio transcription enabled.
+	if cfg.InputAudioTranscription == nil {
+		t.Fatal("expected InputAudioTranscription config")
+	}
+	if cfg.OutputAudioTranscription == nil {
+		t.Fatal("expected OutputAudioTranscription config")
+	}
+
 	// Must have tools declared.
 	if len(cfg.Tools) == 0 {
 		t.Fatal("expected tools")
@@ -150,6 +158,14 @@ func TestManager_BuildReunionConfig(t *testing.T) {
 		t.Fatalf("expected personality in system instruction")
 	}
 
+	// Must have audio transcription.
+	if cfg.InputAudioTranscription == nil {
+		t.Fatal("expected InputAudioTranscription in reunion config")
+	}
+	if cfg.OutputAudioTranscription == nil {
+		t.Fatal("expected OutputAudioTranscription in reunion config")
+	}
+
 	// Must have tools.
 	if len(cfg.Tools) == 0 || len(cfg.Tools[0].FunctionDeclarations) == 0 {
 		t.Fatal("expected reunion tools")

diff --git a/web/app/page.tsx b/web/app/page.tsx
@@ -9,24 +9,22 @@ import SceneDisplay from '../components/SceneDisplay';
 import SessionTransition from '../components/SessionTransition';
 import OnboardingFlow, { type OnboardingStage } from '../components/OnboardingFlow';
 import BGMPlayer from '../components/BGMPlayer';
+import ChatPanel, { type ChatMessage } from '../components/ChatPanel';
+import StatusHUD from '../components/StatusHUD';
+import ActionsHUD from '../components/ActionsHUD';
 import type { YouTubeVideo } from '../components/YouTubeGrid';
 import type { Highlight } from '../components/HighlightCard';
 
 type TransitionPhase = 'idle' | 'transitioning' | 'ready';
 
-const CONNECTION_COLORS: Record<string, string> = {
-  connected: '#4ade80',
-  connecting: '#fbbf24',
-  disconnected: '#ef4444',
-  error: '#ef4444',
-};
-
 export default function Home() {
   const [started, setStarted] = useState(false);
   const [previewSrc, setPreviewSrc] = useState<string | null>(null);
   const [finalSrc, setFinalSrc] = useState<string | null>(null);
   const [transition, setTransition] = useState<TransitionPhase>('idle');
-  const [transcript, setTranscript] = useState<string>('');
+  const [chatMessages, setChatMessages] = useState<ChatMessage[]>([]);
+  const pendingMsgRef = useRef<{ model: string | null; user: string | null }>({ model: null, user: null });
+  const msgIdRef = useRef(0);
   const readyTimerRef = useRef<ReturnType<typeof setTimeout> | null>(null);
 
   // Onboarding state
@@ -38,7 +36,7 @@ export default function Home() {
   const [analysisPercent, setAnalysisPercent] = useState(0);
   const [bgmUrl, setBgmUrl] = useState<string | null>(null);
 
-  const { initAudioContext, playPCM, cleanup: cleanupAudio } = useAudio();
+  const { initAudioContext, playPCM, isPlaying, cleanup: cleanupAudio } = useAudio();
   const mic = useMicrophone();
 
   const handleMessage = useCallback((msg: ServerMessage) => {
@@ -60,9 +58,48 @@ export default function Home() {
         setTransition('ready');
         setOnboardingStage('reunion');
         break;
-      case 'transcript':
-        setTranscript(stripMarkdown(msg.text));
+      case 'transcript': {
+        const role = (msg as { role: string }).role as 'model' | 'user';
+        const text = stripMarkdown(msg.text);
+        const finished = (msg as { finished?: boolean }).finished ?? false;
+
+        if (finished) {
+          // Finalize: flush pending partial text into a completed message.
+          const pending = pendingMsgRef.current[role];
+          const finalText = pending ? pending + text : text;
+          pendingMsgRef.current[role] = null;
+          if (finalText) {
+            const id = String(msgIdRef.current++);
+            setChatMessages((prev) => {
+              // Remove the in-progress placeholder for this role if present.
+              const cleaned = prev.filter(
+                (m) => !(m.role === role && !m.finished),
+              );
+              return [...cleaned, { id, role, text: finalText, finished: true }];
+            });
+          } else {
+            // Empty finalize — just clean up the placeholder.
+            setChatMessages((prev) =>
+              prev.filter((m) => !(m.role === role && !m.finished)),
+            );
+          }
+        } else {
+          // Streaming partial: accumulate and show placeholder.
+          const accumulated = (pendingMsgRef.current[role] ?? '') + text;
+          pendingMsgRef.current[role] = accumulated;
+          const id = `pending-${role}`;
+          setChatMessages((prev) => {
+            const cleaned = prev.filter(
+              (m) => !(m.role === role && !m.finished),
+            );
+            return [
+              ...cleaned,
+              { id, role, text: accumulated, finished: false },
+            ];
+          });
+        }
         break;
+      }
       case 'youtube_videos':
         setVideos(msg.videos as YouTubeVideo[]);
         setOnboardingStage('youtube_grid');
@@ -143,7 +180,8 @@ export default function Home() {
     setPreviewSrc(null);
     setFinalSrc(null);
     setTransition('idle');
-    setTranscript('');
+    setChatMessages([]);
+    pendingMsgRef.current = { model: null, user: null };
     setOnboardingStage('welcome');
     setVideos([]);
     setPersonCrops([]);
@@ -371,64 +409,14 @@ export default function Home() {
         onSelectPerson={handleSelectPerson}
       />
 
-      {/* Connection indicator */}
-      <div
-        style={{
-          position: 'absolute',
-          top: '1rem',
-          right: '1rem',
-          display: 'flex',
-          alignItems: 'center',
-          gap: '0.5rem',
-          zIndex: 10,
-        }}
-      >
-        <div
-          style={{
-            width: 8,
-            height: 8,
-            borderRadius: '50%',
-            background: CONNECTION_COLORS[state] ?? '#ef4444',
-          }}
-        />
-        <span style={{ fontSize: '0.75rem', color: 'var(--color-muted)' }}>
-          {state}
-        </span>
-        {mic.isRecording && (
-          <div
-            style={{
-              width: 8,
-              height: 8,
-              borderRadius: '50%',
-              background: '#ef4444',
-              animation: 'pulse 1.5s infinite',
-            }}
-            title="Microphone active"
-          />
-        )}
-      </div>
-
-      {/* Transcript overlay */}
-      {transcript && (
-        <div
-          style={{
-            position: 'absolute',
-            bottom: '6rem',
-            left: '50%',
-            transform: 'translateX(-50%)',
-            maxWidth: '80%',
-            padding: '0.75rem 1.5rem',
-            background: 'rgba(0,0,0,0.6)',
-            borderRadius: '1rem',
-            color: 'var(--color-text)',
-            fontSize: '1rem',
-            textAlign: 'center',
-            zIndex: 10,
-          }}
-        >
-          {transcript}
-        </div>
-      )}
+      <StatusHUD
+        connection={state}
+        isRecording={mic.isRecording}
+        isPlaying={isPlaying}
+        sessionState={onboardingStage}
+      />
+      <ActionsHUD sessionState={onboardingStage} />
+      <ChatPanel messages={chatMessages} />
 
       {/* Stop button */}
       <button

diff --git a/web/components/ActionsHUD.tsx b/web/components/ActionsHUD.tsx
@@ -0,0 +1,58 @@
+'use client';
+
+type ActionsHUDProps = { // Props accepted by the ActionsHUD component.
+  sessionState: string; // Compared against 'reunion' to pick which action list is rendered.
+};
+
+type ActionItem = { // One entry in the "You can..." list.
+  label: string; // Short action name; rendered in medium weight and used as the React list key.
+  hint: string; // Secondary text rendered under the label in the muted color.
+};
+
+const ONBOARDING_ACTIONS: ActionItem[] = [ // Listed whenever sessionState is anything other than 'reunion'.
+  { label: 'Talk', hint: 'Tell missless who you miss' },
+  { label: 'Share Video', hint: 'Paste a YouTube link' },
+];
+
+const REUNION_ACTIONS: ActionItem[] = [ // Listed only when sessionState === 'reunion'.
+  { label: 'Talk', hint: 'Have a conversation' },
+  { label: 'Scene', hint: '"Paint me a picture of..."' }, // Double-quoted hints look like example phrases for the user — presumably spoken; confirm with UX.
+  { label: 'Music', hint: '"Play something peaceful"' },
+  { label: 'Memory', hint: '"Remember when we..."' },
+  { label: 'Album', hint: '"Save this moment"' },
+];
+
+export default function ActionsHUD({ sessionState }: ActionsHUDProps) { // Renders a "You can..." panel listing the actions for the given session state.
+  const actions = sessionState === 'reunion' ? REUNION_ACTIONS : ONBOARDING_ACTIONS; // Every non-'reunion' state falls back to the onboarding list.
+
+  return ( // A single absolutely positioned column: heading first, then one label/hint pair per action.
+    <div
+      style={{
+        position: 'absolute', // Positioned against the nearest positioned ancestor, 1rem from its top and right edges.
+        top: '1rem',
+        right: '1rem',
+        background: 'rgba(0,0,0,0.5)', // Half-transparent black; combined with the blur below.
+        backdropFilter: 'blur(12px)', // Blurs whatever is rendered behind the panel.
+        borderRadius: '0.75rem',
+        padding: '0.625rem 0.875rem',
+        display: 'flex',
+        flexDirection: 'column', // Stack heading and action entries vertically.
+        gap: '0.375rem',
+        fontSize: '0.75rem',
+        color: 'var(--color-text)', // Theme color supplied via CSS custom property defined outside this file.
+        zIndex: 20,
+        minWidth: '140px',
+      }}
+    >
+      <div style={{ fontWeight: 600, fontSize: '0.8125rem', marginBottom: '0.125rem' }}>
+        You can...
+      </div>
+      {actions.map((a) => ( // One entry per action, keyed by label (labels are unique within each list).
+        <div key={a.label} style={{ display: 'flex', flexDirection: 'column', gap: '0.0625rem' }}>
+          <span style={{ fontWeight: 500 }}>{a.label}</span>
+          <span style={{ color: 'var(--color-muted)', fontSize: '0.6875rem' }}>{a.hint}</span>
+        </div>
+      ))}
+    </div>
+  );
+}