diff --git a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java index d9c9c7a4452..c52053ed915 100644 --- a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java +++ b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioSpeechProperties.java @@ -36,7 +36,7 @@ public class OpenAiAudioSpeechProperties extends OpenAiParentProperties { public static final String CONFIG_PREFIX = "spring.ai.openai.audio.speech"; - public static final String DEFAULT_SPEECH_MODEL = OpenAiAudioApi.TtsModel.TTS_1.getValue(); + public static final String DEFAULT_SPEECH_MODEL = OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.getValue(); private static final Float SPEED = 1.0f; diff --git a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioTranscriptionProperties.java b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioTranscriptionProperties.java index 7499ac4cfca..14c071dd08e 100644 --- a/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioTranscriptionProperties.java +++ b/auto-configurations/models/spring-ai-autoconfigure-model-openai/src/main/java/org/springframework/ai/model/openai/autoconfigure/OpenAiAudioTranscriptionProperties.java @@ -26,7 +26,7 @@ public class OpenAiAudioTranscriptionProperties extends OpenAiParentProperties { public static final String CONFIG_PREFIX = 
"spring.ai.openai.audio.transcription"; - public static final String DEFAULT_TRANSCRIPTION_MODEL = OpenAiAudioApi.WhisperModel.WHISPER_1.getValue(); + public static final String DEFAULT_TRANSCRIPTION_MODEL = OpenAiAudioApi.TranscriptionModels.WHISPER_1.getValue(); private static final Double DEFAULT_TEMPERATURE = 0.7; diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java index 2ea52622216..1ecebb18896 100644 --- a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java +++ b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioSpeechModel.java @@ -80,7 +80,7 @@ public class OpenAiAudioSpeechModel implements SpeechModel, StreamingSpeechModel public OpenAiAudioSpeechModel(OpenAiAudioApi audioApi) { this(audioApi, OpenAiAudioSpeechOptions.builder() - .model(OpenAiAudioApi.TtsModel.TTS_1.getValue()) + .model(OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.getValue()) .responseFormat(AudioResponseFormat.MP3) .voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue()) .speed(SPEED) diff --git a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java index 4c7bb105648..8638ad0db48 100644 --- a/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java +++ b/models/spring-ai-openai/src/main/java/org/springframework/ai/openai/OpenAiAudioTranscriptionModel.java @@ -63,7 +63,7 @@ public class OpenAiAudioTranscriptionModel implements ModelTTS + * TTS */ public enum TtsModel { - // @formatter:off /** - * The latest text to speech model, optimized for speed. 
+ * Text-to-speech model optimized for speed */ @JsonProperty("tts-1") TTS_1("tts-1"), /** - * The latest text to speech model, optimized for quality. + * Text-to-speech model optimized for quality. */ @JsonProperty("tts-1-hd") - TTS_1_HD("tts-1-hd"); - // @formatter:on + TTS_1_HD("tts-1-hd"), + /** + * Text-to-speech model powered by GPT-4o mini. + */ + @JsonProperty("gpt-4o-mini-tts") + GPT_4O_MINI_TTS("gpt-4o-mini-tts"); public final String value; @@ -250,6 +254,7 @@ public String getValue() { * v2-large model is currently available through our API with the whisper-1 model * name. */ + @Deprecated public enum WhisperModel { // @formatter:off @@ -269,6 +274,45 @@ public String getValue() { } + /** + * Models for the transcriptions API; replaces the deprecated {@code WhisperModel}. + * Reference: https://platform.openai.com/docs/models + */ + public enum TranscriptionModels implements ChatModelDescription { + + /** + * Speech-to-text model powered by GPT-4o ({@code gpt-4o-transcribe}). + */ + @JsonProperty("gpt-4o-transcribe") + GPT_4O_TRANSCRIBE("gpt-4o-transcribe"), + /** + * Speech-to-text model powered by GPT-4o mini ({@code gpt-4o-mini-transcribe}). + */ + @JsonProperty("gpt-4o-mini-transcribe") + GPT_4O_MINI_TRANSCRIBE("gpt-4o-mini-transcribe"), + /** + * General-purpose speech recognition model ({@code whisper-1}). + */ + @JsonProperty("whisper-1") + WHISPER_1("whisper-1"); + + public final String value; + + TranscriptionModels(String value) { + this.value = value; + } + + public String getValue() { + return this.value; + } + + @Override + public String getName() { + return this.value; + } + + } + /** * The format of the transcript and translation outputs, in one of these options: * json, text, srt, verbose_json, or vtt. Defaults to json. 
@@ -416,7 +460,7 @@ public String getValue() { */ public static class Builder { - private String model = TtsModel.TTS_1.getValue(); + private String model = TtsModel.GPT_4O_MINI_TTS.getValue(); private String input; @@ -531,7 +575,7 @@ public static class Builder { private byte[] file; - private String model = WhisperModel.WHISPER_1.getValue(); + private String model = TranscriptionModels.WHISPER_1.getValue(); private String language; @@ -624,7 +668,7 @@ public static class Builder { private byte[] file; - private String model = WhisperModel.WHISPER_1.getValue(); + private String model = TranscriptionModels.WHISPER_1.getValue(); private String prompt; diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java index 6c933dec283..8680df3c170 100644 --- a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java +++ b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioApiIT.java @@ -29,8 +29,8 @@ import org.springframework.ai.openai.api.OpenAiAudioApi.StructuredResponse; import org.springframework.ai.openai.api.OpenAiAudioApi.TranscriptionRequest; import org.springframework.ai.openai.api.OpenAiAudioApi.TranslationRequest; +import org.springframework.ai.openai.api.OpenAiAudioApi.TranscriptionModels; import org.springframework.ai.openai.api.OpenAiAudioApi.TtsModel; -import org.springframework.ai.openai.api.OpenAiAudioApi.WhisperModel; import org.springframework.util.FileCopyUtils; import static org.assertj.core.api.Assertions.assertThat; @@ -52,7 +52,7 @@ void speechTranscriptionAndTranslation() throws IOException { byte[] speech = this.audioApi .createSpeech(SpeechRequest.builder() - .model(TtsModel.TTS_1_HD.getValue()) + .model(TtsModel.GPT_4O_MINI_TTS.getValue()) .input("Hello, my name is Chris and I love Spring A.I.") 
.voice(Voice.ONYX.getValue()) .build()) @@ -64,7 +64,7 @@ void speechTranscriptionAndTranslation() throws IOException { StructuredResponse translation = this.audioApi .createTranslation( - TranslationRequest.builder().model(WhisperModel.WHISPER_1.getValue()).file(speech).build(), + TranslationRequest.builder().model(TranscriptionModels.WHISPER_1.getValue()).file(speech).build(), StructuredResponse.class) .getBody(); @@ -72,7 +72,7 @@ void speechTranscriptionAndTranslation() throws IOException { StructuredResponse transcriptionEnglish = this.audioApi .createTranscription( - TranscriptionRequest.builder().model(WhisperModel.WHISPER_1.getValue()).file(speech).build(), + TranscriptionRequest.builder().model(TranscriptionModels.WHISPER_1.getValue()).file(speech).build(), StructuredResponse.class) .getBody(); diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioModelNoOpApiKeysIT.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioModelNoOpApiKeysIT.java index 996035d09b7..e12e514d57a 100644 --- a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioModelNoOpApiKeysIT.java +++ b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/api/OpenAiAudioModelNoOpApiKeysIT.java @@ -44,7 +44,7 @@ public class OpenAiAudioModelNoOpApiKeysIT { void checkNoOpKey() { assertThatThrownBy(() -> this.audioApi .createSpeech(OpenAiAudioApi.SpeechRequest.builder() - .model(OpenAiAudioApi.TtsModel.TTS_1_HD.getValue()) + .model(OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.getValue()) .input("Hello, my name is Chris and I love Spring A.I.") .voice(OpenAiAudioApi.SpeechRequest.Voice.ONYX.getValue()) .build()) diff --git a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java index 
7f4a81d3e3c..a882b2cedcc 100644 --- a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java +++ b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelIT.java @@ -64,7 +64,7 @@ void shouldGenerateNonEmptyMp3AudioFromSpeechPrompt() { .voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue()) .speed(SPEED) .responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3) - .model(OpenAiAudioApi.TtsModel.TTS_1.value) + .model(OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.value) .build(); SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!", speechOptions); @@ -100,7 +100,7 @@ void speechRateLimitTest() { .voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue()) .speed(SPEED) .responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3) - .model(OpenAiAudioApi.TtsModel.TTS_1.value) + .model(OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.value) .build(); SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!", speechOptions); @@ -120,7 +120,7 @@ void shouldStreamNonEmptyResponsesForValidSpeechPrompts() { .voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue()) .speed(SPEED) .responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3) - .model(OpenAiAudioApi.TtsModel.TTS_1.value) + .model(OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.value) .build(); SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!", @@ -142,7 +142,7 @@ void speechVoicesTest(String voice) { .voice(voice) .speed(SPEED) .responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3) - .model(OpenAiAudioApi.TtsModel.TTS_1.value) + .model(OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.value) .build(); SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!", speechOptions); diff --git 
a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java index 9d98b5d9b58..3897d820e85 100644 --- a/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java +++ b/models/spring-ai-openai/src/test/java/org/springframework/ai/openai/audio/speech/OpenAiSpeechModelWithSpeechResponseMetadataTests.java @@ -75,7 +75,7 @@ void aiResponseContainsImageResponseMetadata() { .voice(OpenAiAudioApi.SpeechRequest.Voice.ALLOY.getValue()) .speed(SPEED) .responseFormat(OpenAiAudioApi.SpeechRequest.AudioResponseFormat.MP3) - .model(OpenAiAudioApi.TtsModel.TTS_1.value) + .model(OpenAiAudioApi.TtsModel.GPT_4O_MINI_TTS.value) .build(); SpeechPrompt speechPrompt = new SpeechPrompt("Today is a wonderful day to build something people love!",