Skip to content

Commit 230f750

Browse files
Merge pull request #463 from jinnigu:feature/inputAudioTranscription
PiperOrigin-RevId: 824680535
2 parents b177111 + 408913d commit 230f750

File tree

6 files changed: 179 additions and 3 deletions

core/src/main/java/com/google/adk/agents/RunConfig.java

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,8 +48,12 @@ public enum StreamingMode {
4848

4949
public abstract @Nullable AudioTranscriptionConfig outputAudioTranscription();
5050

51+
public abstract @Nullable AudioTranscriptionConfig inputAudioTranscription();
52+
5153
public abstract int maxLlmCalls();
5254

55+
public abstract Builder toBuilder();
56+
5357
public static Builder builder() {
5458
return new AutoValue_RunConfig.Builder()
5559
.setSaveInputBlobsAsArtifacts(false)
@@ -65,7 +69,8 @@ public static Builder builder(RunConfig runConfig) {
6569
.setMaxLlmCalls(runConfig.maxLlmCalls())
6670
.setResponseModalities(runConfig.responseModalities())
6771
.setSpeechConfig(runConfig.speechConfig())
68-
.setOutputAudioTranscription(runConfig.outputAudioTranscription());
72+
.setOutputAudioTranscription(runConfig.outputAudioTranscription())
73+
.setInputAudioTranscription(runConfig.inputAudioTranscription());
6974
}
7075

7176
/** Builder for {@link RunConfig}. */
@@ -88,6 +93,10 @@ public abstract static class Builder {
8893
public abstract Builder setOutputAudioTranscription(
8994
AudioTranscriptionConfig outputAudioTranscription);
9095

96+
@CanIgnoreReturnValue
97+
public abstract Builder setInputAudioTranscription(
98+
AudioTranscriptionConfig inputAudioTranscription);
99+
91100
@CanIgnoreReturnValue
92101
public abstract Builder setMaxLlmCalls(int maxLlmCalls);
93102

core/src/main/java/com/google/adk/flows/llmflows/Basic.java

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,8 @@ public Single<RequestProcessor.RequestProcessingResult> processRequest(
4848
.ifPresent(liveConnectConfigBuilder::speechConfig);
4949
Optional.ofNullable(context.runConfig().outputAudioTranscription())
5050
.ifPresent(liveConnectConfigBuilder::outputAudioTranscription);
51+
Optional.ofNullable(context.runConfig().inputAudioTranscription())
52+
.ifPresent(liveConnectConfigBuilder::inputAudioTranscription);
5153

5254
LlmRequest.Builder builder =
5355
request.toBuilder()

core/src/main/java/com/google/adk/runner/Runner.java

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -397,8 +397,9 @@ private void copySessionStates(Session source, Session target) {
397397
private InvocationContext newInvocationContextForLive(
398398
Session session, Optional<LiveRequestQueue> liveRequestQueue, RunConfig runConfig) {
399399
RunConfig.Builder runConfigBuilder = RunConfig.builder(runConfig);
400-
if (!CollectionUtils.isNullOrEmpty(runConfig.responseModalities())
401-
&& liveRequestQueue.isPresent()) {
400+
if (liveRequestQueue.isPresent() && !this.agent.subAgents().isEmpty()) {
401+
// Parity with Python: apply modality defaults and transcription settings
402+
// only for multi-agent live scenarios.
402403
// Default to AUDIO modality if not specified.
403404
if (CollectionUtils.isNullOrEmpty(runConfig.responseModalities())) {
404405
runConfigBuilder.setResponseModalities(
@@ -411,6 +412,10 @@ private InvocationContext newInvocationContextForLive(
411412
runConfigBuilder.setOutputAudioTranscription(AudioTranscriptionConfig.builder().build());
412413
}
413414
}
415+
// Need input transcription for agent transferring in live mode.
416+
if (runConfig.inputAudioTranscription() == null) {
417+
runConfigBuilder.setInputAudioTranscription(AudioTranscriptionConfig.builder().build());
418+
}
414419
}
415420
return newInvocationContext(
416421
session, /* newMessage= */ Optional.empty(), liveRequestQueue, runConfigBuilder.build());

core/src/test/java/com/google/adk/agents/RunConfigTest.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ public void testBuilderWithVariousValues() {
2525
.setSaveInputBlobsAsArtifacts(true)
2626
.setStreamingMode(RunConfig.StreamingMode.SSE)
2727
.setOutputAudioTranscription(audioTranscriptionConfig)
28+
.setInputAudioTranscription(audioTranscriptionConfig)
2829
.setMaxLlmCalls(10)
2930
.build();
3031

@@ -33,6 +34,7 @@ public void testBuilderWithVariousValues() {
3334
assertThat(runConfig.saveInputBlobsAsArtifacts()).isTrue();
3435
assertThat(runConfig.streamingMode()).isEqualTo(RunConfig.StreamingMode.SSE);
3536
assertThat(runConfig.outputAudioTranscription()).isEqualTo(audioTranscriptionConfig);
37+
assertThat(runConfig.inputAudioTranscription()).isEqualTo(audioTranscriptionConfig);
3638
assertThat(runConfig.maxLlmCalls()).isEqualTo(10);
3739
}
3840

@@ -45,6 +47,7 @@ public void testBuilderDefaults() {
4547
assertThat(runConfig.saveInputBlobsAsArtifacts()).isFalse();
4648
assertThat(runConfig.streamingMode()).isEqualTo(RunConfig.StreamingMode.NONE);
4749
assertThat(runConfig.outputAudioTranscription()).isNull();
50+
assertThat(runConfig.inputAudioTranscription()).isNull();
4851
assertThat(runConfig.maxLlmCalls()).isEqualTo(500);
4952
}
5053

@@ -66,6 +69,7 @@ public void testBuilderWithDifferentValues() {
6669
.setSaveInputBlobsAsArtifacts(true)
6770
.setStreamingMode(RunConfig.StreamingMode.BIDI)
6871
.setOutputAudioTranscription(audioTranscriptionConfig)
72+
.setInputAudioTranscription(audioTranscriptionConfig)
6973
.setMaxLlmCalls(20)
7074
.build();
7175

@@ -74,6 +78,24 @@ public void testBuilderWithDifferentValues() {
7478
assertThat(runConfig.saveInputBlobsAsArtifacts()).isTrue();
7579
assertThat(runConfig.streamingMode()).isEqualTo(RunConfig.StreamingMode.BIDI);
7680
assertThat(runConfig.outputAudioTranscription()).isEqualTo(audioTranscriptionConfig);
81+
assertThat(runConfig.inputAudioTranscription()).isEqualTo(audioTranscriptionConfig);
7782
assertThat(runConfig.maxLlmCalls()).isEqualTo(20);
7883
}
84+
85+
@Test
86+
public void testInputAudioTranscriptionOnly() {
87+
AudioTranscriptionConfig inputTranscriptionConfig = AudioTranscriptionConfig.builder().build();
88+
89+
RunConfig runConfig =
90+
RunConfig.builder()
91+
.setStreamingMode(RunConfig.StreamingMode.BIDI)
92+
.setResponseModalities(ImmutableList.of(new Modality(Modality.Known.AUDIO)))
93+
.setInputAudioTranscription(inputTranscriptionConfig)
94+
.build();
95+
96+
assertThat(runConfig.inputAudioTranscription()).isEqualTo(inputTranscriptionConfig);
97+
assertThat(runConfig.outputAudioTranscription()).isNull();
98+
assertThat(runConfig.streamingMode()).isEqualTo(RunConfig.StreamingMode.BIDI);
99+
assertThat(runConfig.responseModalities()).containsExactly(new Modality(Modality.Known.AUDIO));
100+
}
79101
}

core/src/test/java/com/google/adk/flows/llmflows/BasicTest.java

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -220,13 +220,33 @@ public void processRequest_buildsLiveConnectConfigFromRunConfig_outputAudioTrans
220220
assertThat(result.events()).isEmpty();
221221
}
222222

223+
@Test
224+
public void processRequest_buildsLiveConnectConfigFromRunConfig_inputAudioTranscription() {
225+
RunConfig runConfig =
226+
RunConfig.builder().setInputAudioTranscription(TEST_AUDIO_TRANSCRIPTION_CONFIG).build();
227+
LlmAgent agentWithConfig = LlmAgent.builder().name("agentWithConfig").model(testLlm).build();
228+
InvocationContext contextWithRunConfig = createInvocationContext(agentWithConfig, runConfig);
229+
230+
RequestProcessingResult result =
231+
basicProcessor.processRequest(contextWithRunConfig, initialRequest).blockingGet();
232+
233+
LlmRequest updatedRequest = result.updatedRequest();
234+
assertThat(updatedRequest.liveConnectConfig()).isNotNull();
235+
assertThat(updatedRequest.liveConnectConfig().responseModalities().get()).isEmpty();
236+
assertThat(updatedRequest.liveConnectConfig().speechConfig()).isEmpty();
237+
assertThat(updatedRequest.liveConnectConfig().inputAudioTranscription())
238+
.hasValue(TEST_AUDIO_TRANSCRIPTION_CONFIG);
239+
assertThat(result.events()).isEmpty();
240+
}
241+
223242
@Test
224243
public void processRequest_buildsLiveConnectConfigFromRunConfig_allFields() {
225244
RunConfig runConfig =
226245
RunConfig.builder()
227246
.setResponseModalities(ImmutableList.of(new Modality(Modality.Known.AUDIO)))
228247
.setSpeechConfig(TEST_SPEECH_CONFIG)
229248
.setOutputAudioTranscription(TEST_AUDIO_TRANSCRIPTION_CONFIG)
249+
.setInputAudioTranscription(TEST_AUDIO_TRANSCRIPTION_CONFIG)
230250
.build();
231251
LlmAgent agentWithConfig = LlmAgent.builder().name("agentWithConfig").model(testLlm).build();
232252
InvocationContext contextWithRunConfig = createInvocationContext(agentWithConfig, runConfig);
@@ -241,6 +261,8 @@ public void processRequest_buildsLiveConnectConfigFromRunConfig_allFields() {
241261
assertThat(updatedRequest.liveConnectConfig().speechConfig()).hasValue(TEST_SPEECH_CONFIG);
242262
assertThat(updatedRequest.liveConnectConfig().outputAudioTranscription())
243263
.hasValue(TEST_AUDIO_TRANSCRIPTION_CONFIG);
264+
assertThat(updatedRequest.liveConnectConfig().inputAudioTranscription())
265+
.hasValue(TEST_AUDIO_TRANSCRIPTION_CONFIG);
244266
assertThat(result.events()).isEmpty();
245267
}
246268
}
core/src/test/java/com/google/adk/runner/InputAudioTranscriptionTest.java

Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
/*
2+
* Copyright 2025 Google LLC
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
package com.google.adk.runner;
18+
19+
import static com.google.adk.testing.TestUtils.createLlmResponse;
20+
import static com.google.adk.testing.TestUtils.createTestAgentBuilder;
21+
import static com.google.adk.testing.TestUtils.createTestLlm;
22+
import static com.google.common.truth.Truth.assertThat;
23+
24+
import com.google.adk.agents.InvocationContext;
25+
import com.google.adk.agents.LiveRequestQueue;
26+
import com.google.adk.agents.LlmAgent;
27+
import com.google.adk.agents.RunConfig;
28+
import com.google.adk.sessions.Session;
29+
import com.google.adk.testing.TestLlm;
30+
import com.google.common.collect.ImmutableList;
31+
import com.google.genai.types.AudioTranscriptionConfig;
32+
import com.google.genai.types.Content;
33+
import com.google.genai.types.Modality;
34+
import com.google.genai.types.Part;
35+
import java.lang.reflect.Method;
36+
import java.util.Optional;
37+
import org.junit.Test;
38+
import org.junit.runner.RunWith;
39+
import org.junit.runners.JUnit4;
40+
41+
@RunWith(JUnit4.class)
42+
public final class InputAudioTranscriptionTest {
43+
44+
private Content createContent(String text) {
45+
return Content.builder().parts(Part.builder().text(text).build()).build();
46+
}
47+
48+
private InvocationContext invokeNewInvocationContextForLive(
49+
Runner runner, Session session, LiveRequestQueue liveRequestQueue, RunConfig runConfig)
50+
throws Exception {
51+
Method method =
52+
Runner.class.getDeclaredMethod(
53+
"newInvocationContextForLive", Session.class, Optional.class, RunConfig.class);
54+
method.setAccessible(true);
55+
return (InvocationContext)
56+
method.invoke(runner, session, Optional.of(liveRequestQueue), runConfig);
57+
}
58+
59+
@Test
60+
public void newInvocationContextForLive_multiAgent_autoConfiguresInputAudioTranscription()
61+
throws Exception {
62+
TestLlm testLlm = createTestLlm(createLlmResponse(createContent("response")));
63+
LlmAgent subAgent = createTestAgentBuilder(testLlm).name("sub_agent").build();
64+
LlmAgent rootAgent =
65+
createTestAgentBuilder(testLlm)
66+
.name("root_agent")
67+
.subAgents(ImmutableList.of(subAgent))
68+
.build();
69+
70+
Runner runner = new InMemoryRunner(rootAgent, "test", ImmutableList.of());
71+
Session session = runner.sessionService().createSession("test", "user").blockingGet();
72+
73+
RunConfig initialConfig =
74+
RunConfig.builder()
75+
.setResponseModalities(ImmutableList.of(new Modality(Modality.Known.AUDIO)))
76+
.setStreamingMode(RunConfig.StreamingMode.BIDI)
77+
.build();
78+
79+
assertThat(initialConfig.inputAudioTranscription()).isNull();
80+
81+
LiveRequestQueue liveQueue = new LiveRequestQueue();
82+
InvocationContext context =
83+
invokeNewInvocationContextForLive(runner, session, liveQueue, initialConfig);
84+
85+
assertThat(context.runConfig().inputAudioTranscription()).isNotNull();
86+
}
87+
88+
@Test
89+
public void newInvocationContextForLive_explicitConfig_preservesUserInputAudioTranscription()
90+
throws Exception {
91+
TestLlm testLlm = createTestLlm(createLlmResponse(createContent("response")));
92+
LlmAgent subAgent = createTestAgentBuilder(testLlm).name("sub_agent").build();
93+
LlmAgent rootAgent =
94+
createTestAgentBuilder(testLlm)
95+
.name("root_agent")
96+
.subAgents(ImmutableList.of(subAgent))
97+
.build();
98+
99+
Runner runner = new InMemoryRunner(rootAgent, "test", ImmutableList.of());
100+
Session session = runner.sessionService().createSession("test", "user").blockingGet();
101+
102+
AudioTranscriptionConfig userConfig = AudioTranscriptionConfig.builder().build();
103+
RunConfig configWithUserSetting =
104+
RunConfig.builder()
105+
.setResponseModalities(ImmutableList.of(new Modality(Modality.Known.AUDIO)))
106+
.setStreamingMode(RunConfig.StreamingMode.BIDI)
107+
.setInputAudioTranscription(userConfig)
108+
.build();
109+
110+
LiveRequestQueue liveQueue = new LiveRequestQueue();
111+
InvocationContext context =
112+
invokeNewInvocationContextForLive(runner, session, liveQueue, configWithUserSetting);
113+
114+
assertThat(context.runConfig().inputAudioTranscription()).isSameInstanceAs(userConfig);
115+
}
116+
}

0 commit comments

Comments
 (0)