|
13 | 13 | # ignore that voyageai isn't imported |
14 | 14 | # mypy: disable-error-code="name-defined" |
15 | 15 |
|
# Token limits for VoyageAI models (used for token-aware batching).
# Maps model name -> maximum total tokens accepted per embedding request;
# batching logic splits inputs so each API call stays under its budget.
# NOTE(review): limits are vendor-published numbers — verify against the
# current VoyageAI model documentation when adding models.
VOYAGE_TOTAL_TOKEN_LIMITS = {
    "voyage-context-3": 32_000,
    "voyage-3.5-lite": 1_000_000,
    "voyage-3.5": 320_000,
    "voyage-2": 320_000,
    "voyage-3-large": 120_000,
    "voyage-code-3": 120_000,
    "voyage-large-2-instruct": 120_000,
    "voyage-finance-2": 120_000,
    "voyage-multilingual-2": 120_000,
    "voyage-law-2": 120_000,
    "voyage-large-2": 120_000,
    "voyage-3": 120_000,
    "voyage-3-lite": 120_000,
    "voyage-code-2": 120_000,
    "voyage-multimodal-3": 32_000,
    "voyage-multimodal-3.5": 32_000,
}
16 | 36 |
|
17 | 37 | class VoyageAIVectorizer(BaseVectorizer): |
18 | 38 | """The VoyageAIVectorizer class utilizes VoyageAI's API to generate |
@@ -87,6 +107,21 @@ class VoyageAIVectorizer(BaseVectorizer): |
87 | 107 | input_type="query" |
88 | 108 | ) |
89 | 109 |
|
| 110 | + # Using contextualized embeddings (voyage-context-3) |
| 111 | + context_vectorizer = VoyageAIVectorizer( |
| 112 | + model="voyage-context-3", |
| 113 | + api_config={"api_key": "your-voyageai-api-key"} |
| 114 | + ) |
| 115 | + # Context models automatically use contextualized_embed API |
| 116 | + context_embeddings = context_vectorizer.embed_many( |
| 117 | + contents=["chunk 1", "chunk 2", "chunk 3"], |
| 118 | + input_type="document" |
| 119 | + ) |
| 120 | +
|
| 121 | + # Token counting for API usage management |
| 122 | + token_counts = vectorizer.count_tokens(["text one", "text two"]) |
| 123 | + print(f"Token counts: {token_counts}") |
| 124 | +
|
90 | 125 | """ |
91 | 126 |
|
92 | 127 | model_config = ConfigDict(arbitrary_types_allowed=True) |
@@ -448,6 +483,80 @@ def _serialize_for_cache(self, content: Any) -> Union[bytes, str]: |
448 | 483 | return content.to_bytes() |
449 | 484 | return super()._serialize_for_cache(content) |
450 | 485 |
|
| 486 | + def _is_context_model(self) -> bool: |
| 487 | + """ |
| 488 | + Check if the current model is a contextualized embedding model. |
| 489 | +
|
| 490 | + Contextualized models (like voyage-context-3) use a different API |
| 491 | + endpoint and expect inputs formatted differently. |
| 492 | +
|
| 493 | + Returns: |
| 494 | + bool: True if the model is a context model, False otherwise. |
| 495 | + """ |
| 496 | + return "context" in self.model |
| 497 | + |
| 498 | + def count_tokens(self, texts: List[str]) -> List[int]: |
| 499 | + """ |
| 500 | + Count tokens for the given texts using VoyageAI's tokenization API. |
| 501 | +
|
| 502 | + This is useful for managing API usage and optimizing batching strategies. |
| 503 | +
|
| 504 | + Args: |
| 505 | + texts: List of texts to count tokens for. |
| 506 | +
|
| 507 | + Returns: |
| 508 | + List[int]: List of token counts for each text. |
| 509 | +
|
| 510 | + Raises: |
| 511 | + ValueError: If tokenization fails. |
| 512 | +
|
| 513 | + Example: |
| 514 | + >>> vectorizer = VoyageAIVectorizer(model="voyage-3.5") |
| 515 | + >>> token_counts = vectorizer.count_tokens(["Hello world", "Another text"]) |
| 516 | + >>> print(token_counts) # [2, 2] |
| 517 | + """ |
| 518 | + if not texts: |
| 519 | + return [] |
| 520 | + |
| 521 | + try: |
| 522 | + token_lists = self._client.tokenize(texts, model=self.model) |
| 523 | + return [len(token_list) for token_list in token_lists] |
| 524 | + except Exception as e: |
| 525 | + raise ValueError(f"Token counting failed: {e}") |
| 526 | + |
| 527 | + async def acount_tokens(self, texts: List[str]) -> List[int]: |
| 528 | + """ |
| 529 | + Asynchronously count tokens for the given texts using VoyageAI's tokenization API. |
| 530 | +
|
| 531 | + This is useful for managing API usage and optimizing batching strategies. |
| 532 | +
|
| 533 | + Note: The underlying VoyageAI tokenize API is synchronous, so this method |
| 534 | + provides async compatibility but doesn't offer true async performance benefits. |
| 535 | +
|
| 536 | + Args: |
| 537 | + texts: List of texts to count tokens for. |
| 538 | +
|
| 539 | + Returns: |
| 540 | + List[int]: List of token counts for each text. |
| 541 | +
|
| 542 | + Raises: |
| 543 | + ValueError: If tokenization fails. |
| 544 | +
|
| 545 | + Example: |
| 546 | + >>> vectorizer = VoyageAIVectorizer(model="voyage-3.5") |
| 547 | + >>> token_counts = await vectorizer.acount_tokens(["Hello world", "Another text"]) |
| 548 | + >>> print(token_counts) # [2, 2] |
| 549 | + """ |
| 550 | + if not texts: |
| 551 | + return [] |
| 552 | + |
| 553 | + try: |
| 554 | + # Note: VoyageAI's tokenize is synchronous even on AsyncClient |
| 555 | + token_lists = self._aclient.tokenize(texts, model=self.model) |
| 556 | + return [len(token_list) for token_list in token_lists] |
| 557 | + except Exception as e: |
| 558 | + raise ValueError(f"Token counting failed: {e}") |
| 559 | + |
    @property
    def type(self) -> str:
        """Vectorizer type identifier (constant ``"voyageai"``)."""
        return "voyageai"
0 commit comments