Source code for langchain_milvus.utils.sparse
import warnings
from abc import ABC, abstractmethod
from typing import Any, Dict, List
from langchain_core.runnables.config import run_in_executor
[docs]
class BaseSparseEmbedding(ABC):
    """Interface for Sparse embedding models.

    You can inherit from it and implement your custom sparse embedding model.

    By default, the asynchronous methods are implemented using the synchronous
    methods; however, implementations may choose to override the asynchronous
    methods with an async native implementation for performance reasons.
    """

    @abstractmethod
    def embed_query(self, query: str) -> Dict[int, float]:
        """Embed query text.

        Args:
            query: Text to embed.

        Returns:
            Sparse embedding as an ``{index: value}`` mapping.
        """

    @abstractmethod
    def embed_documents(self, texts: List[str]) -> List[Dict[int, float]]:
        """Embed search docs.

        Args:
            texts: List of text to embed.

        Returns:
            One ``{index: value}`` mapping per input text.
        """

    async def aembed_query(self, query: str) -> Dict[int, float]:
        """Asynchronous Embed query text.

        The default implementation delegates to :meth:`embed_query` through
        ``run_in_executor`` (with ``None`` as the executor argument).

        Args:
            query: Text to embed.

        Returns:
            Embedding.
        """
        return await run_in_executor(None, self.embed_query, query)

    # ``List[str]`` (not builtin ``list[str]``) keeps this signature consistent
    # with ``embed_documents`` and the ``typing`` generics used across the file.
    async def aembed_documents(self, texts: List[str]) -> List[Dict[int, float]]:
        """Asynchronous Embed search docs.

        The default implementation delegates to :meth:`embed_documents`
        through ``run_in_executor`` (with ``None`` as the executor argument).

        Args:
            texts: List of text to embed.

        Returns:
            List of embeddings.
        """
        return await run_in_executor(None, self.embed_documents, texts)
[docs]
class BM25SparseEmbedding(BaseSparseEmbedding):
    """Sparse embedding model based on BM25.

    **Note: We recommend using the Milvus built-in BM25 function to implement
    sparse embedding in your application.
    This class is more of a reference because it requires the user to manage
    the corpus, which is not practical. The Milvus built-in function solves
    this problem and makes the BM25 sparse process easier and less frustrating
    for users.
    For more information, please refer to:
    https://milvus.io/docs/full-text-search.md#Full-Text-Search
    and
    https://github.com/milvus-io/bootcamp/blob/master/bootcamp/tutorials/integration/langchain/full_text_search_with_langchain.ipynb
    **

    This class uses the BM25 model in Milvus model to implement sparse vector
    embedding.
    This model requires pymilvus[model] to be installed.
    `pip install pymilvus[model]`
    For more information please refer to:
    https://milvus.io/docs/embed-with-bm25.md
    """

    def __init__(self, corpus: List[str], language: str = "en"):
        """Build the analyzer and fit the BM25 embedding function.

        Emits a ``DeprecationWarning`` on every instantiation.

        Args:
            corpus: Texts passed to ``BM25EmbeddingFunction.fit``.
            language: Language passed to ``build_default_analyzer``.
        """
        warnings.warn(
            "BM25SparseEmbedding class will be deprecated in the future. "
            "We recommend using the Milvus built-in BM25 function instead, "
            "which is easier to use "
            "and doesn't require manual corpus management. "
            "For more information, please refer to: "
            "https://milvus.io/docs/full-text-search.md#Full-Text-Search",
            DeprecationWarning,
            stacklevel=2,
        )
        # Imported here rather than at module level so that pymilvus[model]
        # is only needed when this class is actually instantiated.
        from pymilvus.model.sparse import BM25EmbeddingFunction  # type: ignore
        from pymilvus.model.sparse.bm25.tokenizers import (  # type: ignore
            build_default_analyzer,
        )

        self.analyzer = build_default_analyzer(language=language)
        self.bm25_ef = BM25EmbeddingFunction(self.analyzer, num_workers=1)
        self.bm25_ef.fit(corpus)

    def embed_query(self, text: str) -> Dict[int, float]:
        """Embed a single query text with the fitted BM25 function.

        NOTE: the parameter is named ``text`` here while the base class names
        it ``query``; the name is kept for backward compatibility.

        Args:
            text: Query text to embed.

        Returns:
            Mapping from non-zero index to its value.
        """
        return self._sparse_to_dict(self.bm25_ef.encode_queries([text]))

    def embed_documents(self, texts: List[str]) -> List[Dict[int, float]]:
        """Embed documents with the fitted BM25 function.

        Args:
            texts: Document texts to embed.

        Returns:
            One mapping from non-zero index to value per element obtained by
            iterating over the encoded result.
        """
        sparse_arrays = self.bm25_ef.encode_documents(texts)
        return [self._sparse_to_dict(sparse_array) for sparse_array in sparse_arrays]

    def _sparse_to_dict(self, sparse_array: Any) -> Dict[int, float]:
        """Convert a sparse array into an ``{index: value}`` dictionary.

        Keys and values are coerced to the builtin ``int`` and ``float`` types
        so the result matches the annotated return type regardless of the
        scalar types the array yields when indexed.

        Args:
            sparse_array: Object exposing ``ndim``, ``nonzero()`` and indexing.

        Returns:
            Mapping from the (column) index of each non-zero entry to its value.
        """
        if sparse_array.ndim == 1:  # for scipy>=1.15.0 , the ndim is 1
            # `idx` is a tuple with one element
            return {
                int(idx[0]): float(sparse_array[idx])
                for idx in zip(*sparse_array.nonzero())
            }
        # for scipy<1.15.0, the ndim is 2
        return {
            int(col): float(sparse_array[row, col])
            for row, col in zip(*sparse_array.nonzero())
        }