Skip to content

Commit 7487ab2

Browse files
Merge pull request #556 from jinnigu:feature/single-agent-live-audio-transcription
PiperOrigin-RevId: 827251208
2 parents ea8b527 + b56149c commit 7487ab2

File tree

6 files changed

+277
-6
lines changed

6 files changed

+277
-6
lines changed

core/src/main/java/com/google/adk/runner/Runner.java

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -397,9 +397,7 @@ private void copySessionStates(Session source, Session target) {
397397
private InvocationContext newInvocationContextForLive(
398398
Session session, Optional<LiveRequestQueue> liveRequestQueue, RunConfig runConfig) {
399399
RunConfig.Builder runConfigBuilder = RunConfig.builder(runConfig);
400-
if (liveRequestQueue.isPresent() && !this.agent.subAgents().isEmpty()) {
401-
// Parity with Python: apply modality defaults and transcription settings
402-
// only for multi-agent live scenarios.
400+
if (liveRequestQueue.isPresent()) {
403401
// Default to AUDIO modality if not specified.
404402
if (CollectionUtils.isNullOrEmpty(runConfig.responseModalities())) {
405403
runConfigBuilder.setResponseModalities(

core/src/test/java/com/google/adk/runner/InputAudioTranscriptionTest.java

Lines changed: 52 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -57,8 +57,7 @@ private InvocationContext invokeNewInvocationContextForLive(
5757
}
5858

5959
@Test
60-
public void newInvocationContextForLive_multiAgent_autoConfiguresInputAudioTranscription()
61-
throws Exception {
60+
public void newInvocationContextForLive_autoConfiguresInputAudioTranscription() throws Exception {
6261
TestLlm testLlm = createTestLlm(createLlmResponse(createContent("response")));
6362
LlmAgent subAgent = createTestAgentBuilder(testLlm).name("sub_agent").build();
6463
LlmAgent rootAgent =
@@ -86,7 +85,7 @@ public void newInvocationContextForLive_multiAgent_autoConfiguresInputAudioTrans
8685
}
8786

8887
@Test
89-
public void newInvocationContextForLive_explicitConfig_preservesUserInputAudioTranscription()
88+
public void newInvocationContextForLive_multiAgent_preservesUserInputAudioTranscription()
9089
throws Exception {
9190
TestLlm testLlm = createTestLlm(createLlmResponse(createContent("response")));
9291
LlmAgent subAgent = createTestAgentBuilder(testLlm).name("sub_agent").build();
@@ -113,4 +112,54 @@ public void newInvocationContextForLive_explicitConfig_preservesUserInputAudioTr
113112

114113
assertThat(context.runConfig().inputAudioTranscription()).isSameInstanceAs(userConfig);
115114
}
115+
116+
@Test
117+
public void newInvocationContextForLive_singleAgent_autoConfiguresInputAudioTranscription()
118+
throws Exception {
119+
TestLlm testLlm = createTestLlm(createLlmResponse(createContent("response")));
120+
// Single agent with NO sub-agents
121+
LlmAgent singleAgent = createTestAgentBuilder(testLlm).name("weather_agent").build();
122+
123+
Runner runner = new InMemoryRunner(singleAgent, "test", ImmutableList.of());
124+
Session session = runner.sessionService().createSession("test", "user").blockingGet();
125+
126+
RunConfig initialConfig =
127+
RunConfig.builder()
128+
.setResponseModalities(ImmutableList.of(new Modality(Modality.Known.AUDIO)))
129+
.setStreamingMode(RunConfig.StreamingMode.BIDI)
130+
.build();
131+
132+
assertThat(initialConfig.inputAudioTranscription()).isNull();
133+
134+
LiveRequestQueue liveQueue = new LiveRequestQueue();
135+
InvocationContext context =
136+
invokeNewInvocationContextForLive(runner, session, liveQueue, initialConfig);
137+
138+
assertThat(context.runConfig().inputAudioTranscription()).isNotNull();
139+
}
140+
141+
@Test
142+
public void newInvocationContextForLive_singleAgent_preservesUserInputAudioTranscription()
143+
throws Exception {
144+
TestLlm testLlm = createTestLlm(createLlmResponse(createContent("response")));
145+
// Single agent with NO sub-agents
146+
LlmAgent singleAgent = createTestAgentBuilder(testLlm).name("weather_agent").build();
147+
148+
Runner runner = new InMemoryRunner(singleAgent, "test", ImmutableList.of());
149+
Session session = runner.sessionService().createSession("test", "user").blockingGet();
150+
151+
AudioTranscriptionConfig userConfig = AudioTranscriptionConfig.builder().build();
152+
RunConfig configWithUserSetting =
153+
RunConfig.builder()
154+
.setResponseModalities(ImmutableList.of(new Modality(Modality.Known.AUDIO)))
155+
.setStreamingMode(RunConfig.StreamingMode.BIDI)
156+
.setInputAudioTranscription(userConfig)
157+
.build();
158+
159+
LiveRequestQueue liveQueue = new LiveRequestQueue();
160+
InvocationContext context =
161+
invokeNewInvocationContextForLive(runner, session, liveQueue, configWithUserSetting);
162+
163+
assertThat(context.runConfig().inputAudioTranscription()).isSameInstanceAs(userConfig);
164+
}
116165
}

pom.xml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
<module>contrib/spring-ai</module>
3333
<module>contrib/samples</module>
3434
<module>tutorials/city-time-weather</module>
35+
<module>tutorials/live-audio-single-agent</module>
3536
<module>a2a</module>
3637
<module>a2a/webservice</module>
3738
</modules>
Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,64 @@
1+
# Live Audio Single-Agent
2+
3+
A tutorial demonstrating how the ADK (Agent Development Kit) automatically configures **inputAudioTranscription** and **outputAudioTranscription** for single-agent live scenarios. This tutorial showcases that the feature now works for all live scenarios, not just multi-agent scenarios.
4+
5+
## What This Demonstrates
6+
7+
This tutorial verifies the feature change in `Runner.java` that enables automatic transcription configuration for all live scenarios:
8+
9+
**Before:** Only multi-agent scenarios got automatic transcription
10+
```java
11+
if (liveRequestQueue.isPresent() && !this.agent.subAgents().isEmpty())
12+
```
13+
14+
**After:** All live scenarios (including single-agent) get automatic transcription
15+
```java
16+
if (liveRequestQueue.isPresent())
17+
```
18+
19+
When you use this single agent with live audio, the ADK automatically configures:
20+
- **inputAudioTranscription** - Transcribes user speech to text
21+
- **outputAudioTranscription** - Transcribes agent speech to text
22+
23+
## Set Up the API Key
24+
25+
```shell
26+
export GOOGLE_GENAI_API_KEY={YOUR-KEY}
27+
```
28+
29+
## Go to Tutorial Directory
30+
31+
```shell
32+
cd tutorials/live-audio-single-agent
33+
```
34+
35+
## Running the Agent
36+
37+
Start the server:
38+
39+
```shell
40+
mvn exec:java
41+
```
42+
43+
This starts the ADK web server with a single weather agent (`weather_agent`) that supports live audio using the `gemini-2.0-flash-live-001` model.
44+
45+
## Usage
46+
47+
Once running, you can interact with the agent through:
48+
- **Web interface:** `http://localhost:8080`
49+
- **Agent name:** `weather_agent`
50+
- **Try asking:** "What's the weather in Tokyo?" or "How's the weather in New York?"
51+
52+
### Testing with Live Audio
53+
54+
1. Open the web interface at `http://localhost:8080`
55+
2. Enable your microphone
56+
3. Speak to the agent: "What's the weather in Tokyo?"
57+
4. The agent will:
58+
- Automatically transcribe your speech to text (inputAudioTranscription)
59+
- Process the request and call the `getWeather` tool
60+
- Respond with audio (automatically transcribed via outputAudioTranscription)
61+
62+
## Learn More
63+
64+
See https://google.github.io/adk-docs/get-started/quickstart/#java for more information.
Lines changed: 52 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
<?xml version="1.0" encoding="UTF-8"?>
2+
<!--
3+
Copyright 2025 Google LLC
4+
5+
Licensed under the Apache License, Version 2.0 (the "License");
6+
you may not use this file except in compliance with the License.
7+
You may obtain a copy of the License at
8+
9+
http://www.apache.org/licenses/LICENSE-2.0
10+
11+
Unless required by applicable law or agreed to in writing, software
12+
distributed under the License is distributed on an "AS IS" BASIS,
13+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
See the License for the specific language governing permissions and
15+
limitations under the License.
16+
-->
17+
<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
18+
<modelVersion>4.0.0</modelVersion>
19+
20+
<parent>
21+
<groupId>com.google.adk</groupId>
22+
<artifactId>google-adk-parent</artifactId>
23+
<version>0.3.1-SNAPSHOT</version><!-- {x-version-update:google-adk:current} -->
24+
<relativePath>../../pom.xml</relativePath>
25+
</parent>
26+
27+
<artifactId>google-adk-tutorials-live-audio-single-agent</artifactId>
28+
<name>Agent Development Kit - Tutorial: Live Audio Single-Agent</name>
29+
30+
<properties>
31+
<exec.mainClass>com.google.adk.tutorials.LiveAudioSingleAgent</exec.mainClass>
32+
</properties>
33+
34+
<dependencies>
35+
<dependency>
36+
<groupId>com.google.adk</groupId>
37+
<artifactId>google-adk-dev</artifactId>
38+
<version>${project.version}</version>
39+
<exclusions>
40+
<exclusion>
41+
<groupId>ch.qos.logback</groupId>
42+
<artifactId>logback-classic</artifactId>
43+
</exclusion>
44+
</exclusions>
45+
</dependency>
46+
<dependency>
47+
<groupId>org.slf4j</groupId>
48+
<artifactId>slf4j-simple</artifactId>
49+
</dependency>
50+
</dependencies>
51+
</project>
52+
Lines changed: 107 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,107 @@
1+
/*
2+
* Copyright 2025 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
package com.google.adk.tutorials;
17+
18+
import com.google.adk.agents.BaseAgent;
19+
import com.google.adk.agents.LlmAgent;
20+
import com.google.adk.tools.Annotations.Schema;
21+
import com.google.adk.tools.FunctionTool;
22+
import com.google.adk.web.AdkWebServer;
23+
import java.util.Map;
24+
25+
public class LiveAudioSingleAgent {
26+
27+
public static final BaseAgent WEATHER_AGENT =
28+
LlmAgent.builder()
29+
.name("weather_agent")
30+
.model("gemini-2.0-flash-live-001")
31+
.description("A helpful weather assistant that provides weather information.")
32+
.instruction(
33+
"You are a friendly weather assistant. When users ask about weather, "
34+
+ "you MUST call the getWeather tool with the location name. "
35+
+ "Extract the location from the user's question. "
36+
+ "ALWAYS use the getWeather tool to get accurate information - never make up weather data. "
37+
+ "After getting the tool result, provide a friendly and descriptive response. "
38+
+ "For general conversation or greetings, respond naturally and helpfully. "
39+
+ "Do NOT use code execution for anything.")
40+
.tools(FunctionTool.create(LiveAudioSingleAgent.class, "getWeather"))
41+
.build();
42+
43+
public static Map<String, String> getWeather(
44+
@Schema(name = "location", description = "The location for which to retrieve weather")
45+
String location) {
46+
47+
Map<String, Map<String, String>> weatherData =
48+
Map.of(
49+
"new york",
50+
Map.of(
51+
"status",
52+
"success",
53+
"temperature",
54+
"72°F (22°C)",
55+
"condition",
56+
"Partly cloudy",
57+
"report",
58+
"The weather in New York is partly cloudy with a temperature of 72°F (22°C)."),
59+
"london",
60+
Map.of(
61+
"status",
62+
"success",
63+
"temperature",
64+
"59°F (15°C)",
65+
"condition",
66+
"Rainy",
67+
"report",
68+
"The weather in London is rainy with a temperature of 59°F (15°C)."),
69+
"tokyo",
70+
Map.of(
71+
"status",
72+
"success",
73+
"temperature",
74+
"68°F (20°C)",
75+
"condition",
76+
"Clear",
77+
"report",
78+
"The weather in Tokyo is clear with a temperature of 68°F (20°C)."),
79+
"sydney",
80+
Map.of(
81+
"status",
82+
"success",
83+
"temperature",
84+
"77°F (25°C)",
85+
"condition",
86+
"Sunny",
87+
"report",
88+
"The weather in Sydney is sunny with a temperature of 77°F (25°C)."));
89+
90+
String normalizedLocation = location.toLowerCase().trim();
91+
92+
return weatherData.getOrDefault(
93+
normalizedLocation,
94+
Map.of(
95+
"status",
96+
"error",
97+
"report",
98+
String.format(
99+
"Weather information for '%s' is not available. Try New York, London, Tokyo, or"
100+
+ " Sydney.",
101+
location)));
102+
}
103+
104+
public static void main(String[] args) {
105+
AdkWebServer.start(WEATHER_AGENT);
106+
}
107+
}

0 commit comments

Comments
 (0)