
Commit 275dcbf

docs(core): add clarity to base token counting methods (#33958)
Wasn't immediately obvious that `get_num_tokens_from_messages` adds additional prefixes to represent user roles in conversation, which adds to the overall token count.

```python
from langchain_google_genai import GoogleGenerativeAI

llm = GoogleGenerativeAI(model="gemini-2.5-flash")
num_tokens = llm.get_num_tokens("Hello, world!")
print(f"Number of tokens: {num_tokens}")
# Number of tokens: 4
```

```python
from langchain.messages import HumanMessage

messages = [HumanMessage(content="Hello, world!")]
num_tokens = llm.get_num_tokens_from_messages(messages)
print(f"Number of tokens: {num_tokens}")
# Number of tokens: 6
```
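The gap between the two counts comes from the role prefix added when messages are flattened to text before counting. A minimal sketch of that effect using `langchain_core`'s `get_buffer_string` helper (the helper is real; that the base counting path flattens messages this way is the behavior the commit documents):

```python
from langchain_core.messages import HumanMessage, get_buffer_string

# The base token-counting path flattens messages into a single string,
# prefixing each message with its role before tokenizing.
messages = [HumanMessage(content="Hello, world!")]
print(get_buffer_string(messages))
# Human: Hello, world!   <- the "Human: " prefix accounts for the extra tokens
```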
1 parent 9f87b27 commit 275dcbf

File tree

  • libs/core/langchain_core/language_models/base.py

1 file changed: +13 −2 lines changed


libs/core/langchain_core/language_models/base.py

Lines changed: 13 additions & 2 deletions
```diff
@@ -299,6 +299,9 @@ def get_num_tokens(self, text: str) -> int:
 
         Useful for checking if an input fits in a model's context window.
 
+        This should be overridden by model-specific implementations to provide accurate
+        token counts via model-specific tokenizers.
+
         Args:
             text: The string input to tokenize.
 
@@ -317,9 +320,17 @@ def get_num_tokens_from_messages(
 
         Useful for checking if an input fits in a model's context window.
 
+        This should be overridden by model-specific implementations to provide accurate
+        token counts via model-specific tokenizers.
+
         !!! note
-            The base implementation of `get_num_tokens_from_messages` ignores tool
-            schemas.
+
+            * The base implementation of `get_num_tokens_from_messages` ignores tool
+              schemas.
+            * The base implementation of `get_num_tokens_from_messages` adds additional
+              prefixes to messages to represent user roles, which will add to the
+              overall token count. Model-specific implementations may choose to
+              handle this differently.
 
         Args:
             messages: The message inputs to tokenize.
```
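For illustration, a model-specific override of the kind the new docstring text recommends might delegate to a real tokenizer. This is a hypothetical sketch, not code from the commit: the standalone helper and the `cl100k_base` encoding choice are assumptions.

```python
# Hypothetical sketch of a model-specific counter; the helper name and
# the cl100k_base encoding are illustrative assumptions, not part of
# the commit above.
import tiktoken
from langchain_core.messages import BaseMessage, get_buffer_string

def count_tokens_with_tiktoken(messages: list[BaseMessage]) -> int:
    enc = tiktoken.get_encoding("cl100k_base")
    # Count the same role-prefixed text the base implementation counts,
    # but with an exact tokenizer instead of the default approximation.
    return len(enc.encode(get_buffer_string(messages)))
```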

0 commit comments