From 1c0194fca9914651f8a5878e76beba6a6c629d75 Mon Sep 17 00:00:00 2001
From: gongzhongqiang
Date: Thu, 20 Mar 2025 20:15:58 +0800
Subject: [PATCH] feat(ollama): Add min_p parameter for improved sampling control

- Add min_p option
- Add qwq model

Signed-off-by: gongzhongqiang
---
 .../ai/ollama/api/OllamaModel.java            |  5 +++
 .../ai/ollama/api/OllamaOptions.java          | 31 +++++++++++++++++--
 .../ROOT/pages/api/chat/ollama-chat.adoc      |  1 +
 .../api/embeddings/ollama-embeddings.adoc     |  1 +
 4 files changed, 35 insertions(+), 3 deletions(-)

diff --git a/models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/api/OllamaModel.java b/models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/api/OllamaModel.java
index fd7e7048a77..7602eca2584 100644
--- a/models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/api/OllamaModel.java
+++ b/models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/api/OllamaModel.java
@@ -32,6 +32,11 @@ public enum OllamaModel implements ChatModelDescription {
 	 */
 	QWEN_2_5_7B("qwen2.5"),
 
+	/**
+	 * QwQ is the reasoning model of the Qwen series.
+	 */
+	QWQ("qwq"),
+
 	/**
 	 * Llama 2 is a collection of language models ranging from 7B to 70B parameters.
 	 */
diff --git a/models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/api/OllamaOptions.java b/models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/api/OllamaOptions.java
index c6706e337d0..df2e9524f0c 100644
--- a/models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/api/OllamaOptions.java
+++ b/models/spring-ai-ollama/src/main/java/org/springframework/ai/ollama/api/OllamaOptions.java
@@ -191,6 +191,16 @@ public class OllamaOptions implements ToolCallingChatOptions, EmbeddingOptions {
 	@JsonProperty("top_p")
 	private Double topP;
 
+	/**
+	 * Alternative to top_p, aiming to ensure a balance of quality and variety. The
+	 * parameter p represents the minimum probability for a token to be considered,
+	 * relative to the probability of the most likely token. For example, with p=0.05
+	 * and the most likely token having a probability of 0.9, logits with a value less
+	 * than 0.045 are filtered out. (Default: 0.0)
+	 */
+	@JsonProperty("min_p")
+	private Double minP;
+
 	/**
 	 * Tail free sampling is used to reduce the impact of less probable tokens
 	 * from the output. A higher value (e.g., 2.0) will reduce the impact more, while a
@@ -372,6 +382,7 @@ public static OllamaOptions fromOptions(OllamaOptions fromOptions) {
 			.numPredict(fromOptions.getNumPredict())
 			.topK(fromOptions.getTopK())
 			.topP(fromOptions.getTopP())
+			.minP(fromOptions.getMinP())
 			.tfsZ(fromOptions.getTfsZ())
 			.typicalP(fromOptions.getTypicalP())
 			.repeatLastN(fromOptions.getRepeatLastN())
@@ -567,6 +578,14 @@ public void setTopP(Double topP) {
 		this.topP = topP;
 	}
 
+	public Double getMinP() {
+		return this.minP;
+	}
+
+	public void setMinP(Double minP) {
+		this.minP = minP;
+	}
+
 	public Float getTfsZ() {
 		return this.tfsZ;
 	}
@@ -819,8 +838,9 @@ public boolean equals(Object o) {
 				&& Objects.equals(this.useMLock, that.useMLock) && Objects.equals(this.numThread, that.numThread)
 				&& Objects.equals(this.numKeep, that.numKeep) && Objects.equals(this.seed, that.seed)
 				&& Objects.equals(this.numPredict, that.numPredict) && Objects.equals(this.topK, that.topK)
-				&& Objects.equals(this.topP, that.topP) && Objects.equals(this.tfsZ, that.tfsZ)
-				&& Objects.equals(this.typicalP, that.typicalP) && Objects.equals(this.repeatLastN, that.repeatLastN)
+				&& Objects.equals(this.topP, that.topP) && Objects.equals(this.minP, that.minP)
+				&& Objects.equals(this.tfsZ, that.tfsZ) && Objects.equals(this.typicalP, that.typicalP)
+				&& Objects.equals(this.repeatLastN, that.repeatLastN)
 				&& Objects.equals(this.temperature, that.temperature)
 				&& Objects.equals(this.repeatPenalty, that.repeatPenalty)
 				&& Objects.equals(this.presencePenalty, that.presencePenalty)
@@ -838,7 +858,7 @@ public int hashCode() {
 		return Objects.hash(this.model, this.format, this.keepAlive, this.truncate, this.useNUMA, this.numCtx,
 				this.numBatch, this.numGPU, this.mainGPU, this.lowVRAM, this.f16KV, this.logitsAll, this.vocabOnly,
 				this.useMMap, this.useMLock, this.numThread, this.numKeep, this.seed, this.numPredict, this.topK,
-				this.topP, this.tfsZ, this.typicalP, this.repeatLastN, this.temperature, this.repeatPenalty,
+				this.topP, this.minP, this.tfsZ, this.typicalP, this.repeatLastN, this.temperature, this.repeatPenalty,
 				this.presencePenalty, this.frequencyPenalty, this.mirostat, this.mirostatTau, this.mirostatEta,
 				this.penalizeNewline, this.stop, this.toolCallbacks, this.toolNames, this.internalToolExecutionEnabled,
 				this.toolContext);
@@ -958,6 +978,11 @@ public Builder topP(Double topP) {
 			return this;
 		}
 
+		public Builder minP(Double minP) {
+			this.options.minP = minP;
+			return this;
+		}
+
 		public Builder tfsZ(Float tfsZ) {
 			this.options.tfsZ = tfsZ;
 			return this;
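Note for reviewers: the sampling rule the new `min_p` Javadoc describes is easy to miss in prose. The sketch below is illustrative only; it is not part of this patch, the filtering actually happens inside the Ollama server rather than in Spring AI, and the class and method names are hypothetical. It just walks through the documented p=0.05 example.

[source,java]
----
import java.util.Map;
import java.util.stream.Collectors;

// Illustrative sketch of the min_p rule: keep only tokens whose probability
// is at least minP times the probability of the most likely token.
class MinPExample {

	static Map<String, Double> minPFilter(Map<String, Double> probs, double minP) {
		double top = probs.values().stream().mapToDouble(Double::doubleValue).max().orElse(0.0);
		double cutoff = minP * top; // with minP = 0.05 and top = 0.9, cutoff = 0.045
		return probs.entrySet().stream()
				.filter(e -> e.getValue() >= cutoff)
				.collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue));
	}

	public static void main(String[] args) {
		// "zebra" (0.01 < 0.045) is filtered out; "a" (0.05 >= 0.045) survives.
		System.out.println(minPFilter(Map.of("the", 0.9, "a", 0.05, "zebra", 0.01), 0.05));
	}
}
----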
diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/chat/ollama-chat.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/chat/ollama-chat.adoc
index 7e6a0b40a9c..52fd5a2938d 100644
--- a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/chat/ollama-chat.adoc
+++ b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/chat/ollama-chat.adoc
@@ -119,6 +119,7 @@ The remaining `options` properties are based on the link:https://github.com/olla
 | spring.ai.ollama.chat.options.num-predict | Maximum number of tokens to predict when generating text. (-1 = infinite generation, -2 = fill context) | -1
 | spring.ai.ollama.chat.options.top-k | Reduces the probability of generating nonsense. A higher value (e.g., 100) will give more diverse answers, while a lower value (e.g., 10) will be more conservative. | 40
 | spring.ai.ollama.chat.options.top-p | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. | 0.9
+| spring.ai.ollama.chat.options.min-p | Alternative to top_p, aiming to ensure a balance of quality and variety. The parameter p represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with p=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out. | 0.0
 | spring.ai.ollama.chat.options.tfs-z | Tail-free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. | 1.0
 | spring.ai.ollama.chat.options.typical-p | - | 1.0
 | spring.ai.ollama.chat.options.repeat-last-n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | 64
diff --git a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/embeddings/ollama-embeddings.adoc b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/embeddings/ollama-embeddings.adoc
index ed99ad97fc1..0fd57812b71 100644
--- a/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/embeddings/ollama-embeddings.adoc
+++ b/spring-ai-docs/src/main/antora/modules/ROOT/pages/api/embeddings/ollama-embeddings.adoc
@@ -124,6 +124,7 @@ The remaining `options` properties are based on the link:https://github.com/olla
 | spring.ai.ollama.embedding.options.num-predict | Maximum number of tokens to predict when generating text. (-1 = infinite generation, -2 = fill context) | -1
 | spring.ai.ollama.embedding.options.top-k | Reduces the probability of generating nonsense. A higher value (e.g., 100) will give more diverse answers, while a lower value (e.g., 10) will be more conservative. | 40
 | spring.ai.ollama.embedding.options.top-p | Works together with top-k. A higher value (e.g., 0.95) will lead to more diverse text, while a lower value (e.g., 0.5) will generate more focused and conservative text. | 0.9
+| spring.ai.ollama.embedding.options.min-p | Alternative to top_p, aiming to ensure a balance of quality and variety. The parameter p represents the minimum probability for a token to be considered, relative to the probability of the most likely token. For example, with p=0.05 and the most likely token having a probability of 0.9, logits with a value less than 0.045 are filtered out. | 0.0
 | spring.ai.ollama.embedding.options.tfs-z | Tail-free sampling is used to reduce the impact of less probable tokens from the output. A higher value (e.g., 2.0) will reduce the impact more, while a value of 1.0 disables this setting. | 1.0
 | spring.ai.ollama.embedding.options.typical-p | - | 1.0
 | spring.ai.ollama.embedding.options.repeat-last-n | Sets how far back for the model to look back to prevent repetition. (Default: 64, 0 = disabled, -1 = num_ctx) | 64
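For completeness, here is a minimal usage sketch tying the two additions together. It is not part of the patch: `OllamaOptions.builder()`, `build()`, and a `model(...)` builder method in the same style as the `topP(...)`/`minP(...)` methods above are assumed to exist in the surrounding class, and `OllamaModel.getName()` is assumed to return the Ollama model id via `ChatModelDescription`.

[source,java]
----
import org.springframework.ai.ollama.api.OllamaModel;
import org.springframework.ai.ollama.api.OllamaOptions;

class MinPUsage {

	// Hypothetical wiring (not part of the patch) of the new QWQ constant
	// and the new minP(...) builder method added above.
	static OllamaOptions qwqOptions() {
		return OllamaOptions.builder()
				.model(OllamaModel.QWQ.getName()) // the new "qwq" model
				.topP(0.9)
				.minP(0.05) // drop tokens below 5% of the top token's probability
				.build();
	}
}
----

The same value can also be supplied through configuration via the `spring.ai.ollama.chat.options.min-p` and `spring.ai.ollama.embedding.options.min-p` properties documented above.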