From f2a465f9f30dc2265ced0c2826e66a90f843cc26 Mon Sep 17 00:00:00 2001 From: Andrew Grande Date: Wed, 9 Oct 2024 13:12:00 -0700 Subject: [PATCH] Validate the embeddings size to catch silent embeddings batch failures With a larger batch size for `add_documents` (e.g. 1000), the embeddings service may silently fail and return nothing for some entries. This led to the more cryptic error: ``` [values_dict[key][i] for key in values_dict] ~~~~~~~~~~~~~~~~^^^ IndexError: list index out of range ``` Add additional size validation and a suggestion on how to remedy. --- src/langchain_google_spanner/vector_store.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/langchain_google_spanner/vector_store.py b/src/langchain_google_spanner/vector_store.py index b5084e80..4be49a05 100644 --- a/src/langchain_google_spanner/vector_store.py +++ b/src/langchain_google_spanner/vector_store.py @@ -672,6 +672,13 @@ def add_texts( embeds = self._embedding_service.embed_documents(texts_list) + if len(embeds) != number_of_records: + raise ValueError( + "Number of embeddings should equal the number of documents." + "Try reducing the batch size when adding the documents." + f" Expected: {number_of_records}, but found {len(embeds)}" + ) + if metadatas is None: metadatas = [{} for _ in texts]