Commit f222d7c

add JinaEmbeddings class (#67)

* add JinaEmbeddings class

* fix tests dimensions

---------

Co-authored-by: Joan Fontanals Martinez <[email protected]>
1 parent d64b8f4 commit f222d7c
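
For orientation, a minimal usage sketch of the class this commit introduces. It mirrors the constructor defaults from the diff and the embed() calls exercised in the tests below; the snippet itself is illustrative and not part of the commit:

from fastembed.embedding import JinaEmbedding

# model_name defaults to "jinaai/jina-embeddings-v2-base-en" (768-dim);
# the smaller 512-dim variant is used here. Weights are fetched from
# HuggingFace Hub on first use.
model = JinaEmbedding(model_name="jinaai/jina-embeddings-v2-small-en")

docs = ["hello world", "flag embedding"]
# embed() yields one L2-normalized float32 vector per document
embeddings = list(model.embed(docs, batch_size=256))
print(len(embeddings), embeddings[0].shape)  # 2 (512,)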

File tree

2 files changed: +183 −28 lines changed

fastembed/embedding.py (+161 −10)

@@ -110,7 +110,7 @@ def __init__(
         self.tokenizer = self.load_tokenizer(self.path, max_length=max_length)
         self.model = ort.InferenceSession(str(model_path), providers=onnx_providers, sess_options=so)
 
-    def onnx_embed(self, documents: List[str]) -> np.ndarray:
+    def onnx_embed(self, documents: List[str]) -> Tuple[np.ndarray, np.ndarray]:
         encoded = self.tokenizer.encode_batch(documents)
         input_ids = np.array([e.ids for e in encoded])
         attention_mask = np.array([e.attention_mask for e in encoded])
@@ -126,9 +126,8 @@ def onnx_embed(self, documents: List[str]) -> np.ndarray:
         )
 
         model_output = self.model.run(None, onnx_input)
-        last_hidden_state = model_output[0][:, 0]
-        embeddings = normalize(last_hidden_state).astype(np.float32)
-        return embeddings
+        embeddings = model_output[0]
+        return embeddings, attention_mask
 
 
 class EmbeddingWorker(Worker):
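
With this change the pooling step leaves EmbeddingModel.onnx_embed: it now returns the model's raw last-hidden-state tensor together with the attention mask, so each Embedding subclass can pool differently. A hedged shape sketch of the new contract (dimensions illustrative, assuming a typical transformer ONNX export):

import numpy as np

# Stand-ins for the pair onnx_embed() now returns (the shapes are the point):
#   model_output[0] -> (batch, seq_len, hidden) raw token embeddings
#   attention_mask  -> (batch, seq_len), 1 for real tokens, 0 for padding
embeddings = np.zeros((2, 6, 384), dtype=np.float32)
attention_mask = np.array([[1, 1, 1, 1, 0, 0], [1, 1, 1, 0, 0, 0]])
# FlagEmbedding pools with embeddings[:, 0] (CLS token);
# JinaEmbedding mean-pools over the mask (see mean_pooling below).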
@@ -150,8 +149,8 @@ def start(cls, path: Path, model_name: str, max_length: int = 512, **kwargs: Any
 
     def process(self, items: Iterable[Tuple[int, Any]]) -> Iterable[Tuple[int, Any]]:
         for idx, batch in items:
-            embeddings = self.model.onnx_embed(batch)
-            yield idx, embeddings
+            embeddings, attn_mask = self.model.onnx_embed(batch)
+            yield idx, (embeddings, attn_mask)
 
 
 class Embedding(ABC):
@@ -226,6 +225,18 @@ def list_supported_models(cls) -> List[Dict[str, Union[str, Union[int, float]]]]
                 "description": "Multilingual model, e5-large. Recommend using this model for non-English languages",
                 "size_in_GB": 2.24
             },
+            {
+                "model": "jinaai/jina-embeddings-v2-base-en",
+                "dim": 768,
+                "description": " English embedding model supporting 8192 sequence length",
+                "size_in_GB": 0.55
+            },
+            {
+                "model": "jinaai/jina-embeddings-v2-small-en",
+                "dim": 512,
+                "description": " English embedding model supporting 8192 sequence length",
+                "size_in_GB": 0.13
+            }
         ]
 
     @classmethod
@@ -282,6 +293,24 @@ def download_file_from_gcs(cls, url: str, output_path: str, show_progress: bool
             progress_bar.close()
         return output_path
 
+    @classmethod
+    def download_files_from_huggingface(cls, repod_id: str, cache_dir: Optional[str] = None) -> str:
+        """
+        Downloads a model from HuggingFace Hub.
+        Args:
+            repod_id (str): The HF hub id (name) of the model to retrieve.
+            cache_dir (Optional[str]): The path to the cache directory.
+        Raises:
+            ValueError: If the model_name is not in the format <org>/<model> e.g. "jinaai/jina-embeddings-v2-small-en".
+        Returns:
+            Path: The path to the model directory.
+        """
+        from huggingface_hub import snapshot_download
+
+        return snapshot_download(
+            repo_id=repod_id, ignore_patterns=["model.safetensors", "pytorch_model.bin"], cache_dir=cache_dir
+        )
+
     @classmethod
     def decompress_to_cache(cls, targz_path: str, cache_dir: str):
         """
@@ -317,7 +346,7 @@ def decompress_to_cache(cls, targz_path: str, cache_dir: str):
 
         return cache_dir
 
-    def retrieve_model(self, model_name: str, cache_dir: str) -> Path:
+    def retrieve_model_gcs(self, model_name: str, cache_dir: str) -> Path:
         """
         Retrieves a model from Google Cloud Storage.
 
@@ -361,6 +390,24 @@ def retrieve_model(self, model_name: str, cache_dir: str) -> Path:
 
         return model_dir
 
+    def retrieve_model_hf(self, model_name: str, cache_dir: str) -> Path:
+        """
+        Retrieves a model from HuggingFace Hub.
+        Args:
+            model_name (str): The name of the model to retrieve.
+            cache_dir (str): The path to the cache directory.
+        Raises:
+            ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
+        Returns:
+            Path: The path to the model directory.
+        """
+
+        assert (
+            "/" in model_name
+        ), "model_name must be in the format <org>/<model> e.g. jinaai/jina-embeddings-v2-small-en"
+
+        return Path(self.download_files_from_huggingface(repod_id=model_name, cache_dir=cache_dir))
+
     def passage_embed(self, texts: Iterable[str], **kwargs) -> Iterable[np.ndarray]:
         """
         Embeds a list of text passages into a list of embeddings.
@@ -425,7 +472,7 @@ def __init__(
             cache_dir.mkdir(parents=True, exist_ok=True)
 
         self._cache_dir = cache_dir
-        self._model_dir = self.retrieve_model(model_name, cache_dir)
+        self._model_dir = self.retrieve_model_gcs(model_name, cache_dir)
         self._max_length = max_length
 
         self.model = EmbeddingModel(self._model_dir, self.model_name, max_length=max_length,
@@ -464,7 +511,8 @@ def embed(
 
         if parallel is None or is_small:
             for batch in iter_batch(documents, batch_size):
-                yield from self.model.onnx_embed(batch)
+                embeddings, _ = self.model.onnx_embed(batch)
+                yield from normalize(embeddings[:, 0]).astype(np.float32)
         else:
             start_method = "forkserver" if "forkserver" in get_all_start_methods() else "spawn"
             params = {
@@ -474,7 +522,16 @@ def embed(
             }
             pool = ParallelWorkerPool(parallel, EmbeddingWorker, start_method=start_method)
             for batch in pool.ordered_map(iter_batch(documents, batch_size), **params):
-                yield from batch
+                embeddings, _ = batch
+                yield from normalize(embeddings[:, 0]).astype(np.float32)
+
+    @classmethod
+    def list_supported_models(cls) -> List[Dict[str, Union[str, Union[int, float]]]]:
+        """
+        Lists the supported models.
+        """
+        # jina models are not supported by this class
+        return [model for model in super().list_supported_models() if not model['model'].startswith('jinaai')]
 
 
 class DefaultEmbedding(FlagEmbedding):
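
Since onnx_embed no longer normalizes, FlagEmbedding.embed reproduces the old behavior itself: CLS-token pooling (the first token's hidden state) followed by L2 normalization. A hedged numpy sketch of that step, with a local stand-in for the module's normalize() helper and toy shapes:

import numpy as np

def l2_normalize(x: np.ndarray) -> np.ndarray:
    # stand-in for normalize(): scale each row to unit L2 norm
    return x / np.linalg.norm(x, axis=1, keepdims=True)

last_hidden = np.random.rand(2, 6, 384).astype(np.float32)  # toy (batch, seq, hidden)
cls_vectors = last_hidden[:, 0]                             # embeddings[:, 0] in the diff
flag_embeddings = l2_normalize(cls_vectors).astype(np.float32)
print(np.linalg.norm(flag_embeddings, axis=1))              # ~[1. 1.]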
@@ -505,3 +562,97 @@ def embed(self, texts, batch_size: int = 256, parallel: int = None):
         # Use your OpenAI model to embed the texts
         # return self.model.embed(texts)
         raise NotImplementedError
+
+
+class JinaEmbedding(Embedding):
+    def __init__(
+        self,
+        model_name: str = "jinaai/jina-embeddings-v2-base-en",
+        max_length: int = 512,
+        cache_dir: str = None,
+        threads: int = None,
+    ):
+        """
+        Args:
+            model_name (str): The name of the model to use.
+            max_length (int, optional): The maximum number of tokens. Defaults to 512. Unknown behavior for values > 512.
+            cache_dir (str, optional): The path to the cache directory. Defaults to `local_cache` in the current directory.
+            threads (int, optional): The number of threads single onnxruntime session can use. Defaults to None.
+        Raises:
+            ValueError: If the model_name is not in the format <org>/<model> e.g. BAAI/bge-base-en.
+        """
+        self.model_name = model_name
+
+        if cache_dir is None:
+            cache_dir = Path(".").resolve() / "local_cache"
+            cache_dir.mkdir(parents=True, exist_ok=True)
+
+        self._cache_dir = cache_dir
+        self._model_dir = self.retrieve_model_hf(model_name, cache_dir)
+        self._max_length = max_length
+
+        self.model = EmbeddingModel(self._model_dir, self.model_name, max_length=max_length,
+                                    max_threads=threads)
+
+    def embed(
+        self, documents: Union[str, Iterable[str]], batch_size: int = 256, parallel: int = None
+    ) -> Iterable[np.ndarray]:
+        """
+        Encode a list of documents into list of embeddings.
+        We use mean pooling with attention so that the model can handle variable-length inputs.
+        Args:
+            documents: Iterator of documents or single document to embed
+            batch_size: Batch size for encoding -- higher values will use more memory, but be faster
+            parallel:
+                If > 1, data-parallel encoding will be used, recommended for offline encoding of large datasets.
+                If 0, use all available cores.
+                If None, don't use data-parallel processing, use default onnxruntime threading instead.
+        Returns:
+            List of embeddings, one per document
+        """
+        is_small = False
+
+        if isinstance(documents, str):
+            documents = [documents]
+            is_small = True
+
+        if isinstance(documents, list):
+            if len(documents) < batch_size:
+                is_small = True
+
+        if parallel == 0:
+            parallel = os.cpu_count()
+
+        if parallel is None or is_small:
+            for batch in iter_batch(documents, batch_size):
+                embeddings, attn_mask = self.model.onnx_embed(batch)
+                yield from normalize(self.mean_pooling(embeddings, attn_mask)).astype(np.float32)
+        else:
+            start_method = "forkserver" if "forkserver" in get_all_start_methods() else "spawn"
+            params = {
+                "path": self._model_dir,
+                "model_name": self.model_name,
+                "max_length": self._max_length,
+            }
+            pool = ParallelWorkerPool(parallel, EmbeddingWorker, start_method=start_method)
+            for batch in pool.ordered_map(iter_batch(documents, batch_size), **params):
+                embeddings, attn_mask = batch
+                yield from normalize(self.mean_pooling(embeddings, attn_mask)).astype(np.float32)
+
+    @classmethod
+    def list_supported_models(cls) -> List[Dict[str, Union[str, Union[int, float]]]]:
+        """
+        Lists the supported models.
+        """
+        # only jina models are supported by this class
+        return [model for model in Embedding.list_supported_models() if model['model'].startswith('jinaai')]
+
+    @staticmethod
+    def mean_pooling(model_output, attention_mask):
+        token_embeddings = model_output
+        input_mask_expanded = (np.expand_dims(attention_mask, axis=-1)).astype(float)
+
+        sum_embeddings = np.sum(token_embeddings * input_mask_expanded, axis=1)
+        mask_sum = np.clip(np.sum(input_mask_expanded, axis=1), a_min=1e-9, a_max=None)
+
+        return sum_embeddings / mask_sum
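
To make the docstring's "mean pooling with attention" concrete, a small hand-checkable numpy example of what mean_pooling computes (toy values, not from the commit):

import numpy as np

# One document, 4 token positions, hidden size 2; the last position is
# padding (mask = 0) and must not affect the mean.
token_embeddings = np.array([[[1.0, 2.0], [3.0, 4.0], [5.0, 6.0], [9.0, 9.0]]])
attention_mask = np.array([[1, 1, 1, 0]])

mask = np.expand_dims(attention_mask, axis=-1).astype(float)    # (1, 4, 1)
summed = np.sum(token_embeddings * mask, axis=1)                # [[ 9., 12.]]
counts = np.clip(np.sum(mask, axis=1), a_min=1e-9, a_max=None)  # [[3.]]
print(summed / counts)  # [[3. 4.]] -- mean over the 3 real tokens only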

tests/test_onnx_embeddings.py (+22 −18)

@@ -1,30 +1,32 @@
 import os
-
+import pytest
 import numpy as np
-from tqdm import tqdm
 
-from fastembed.embedding import DefaultEmbedding, Embedding
+from fastembed.embedding import DefaultEmbedding, JinaEmbedding
 
 CANONICAL_VECTOR_VALUES = {
-    "BAAI/bge-small-en": np.array([-0.0232, -0.0255, 0.0174, -0.0639, -0.0006]),
-    "BAAI/bge-small-en-v1.5": np.array([0.01522374, -0.02271799, 0.00860278, -0.07424029, 0.00386434]),
-    "BAAI/bge-small-zh-v1.5": np.array([-0.01023294, 0.07634465, 0.0691722 , -0.04458365, -0.03160762]),
-    "BAAI/bge-base-en": np.array([0.0115, 0.0372, 0.0295, 0.0121, 0.0346]),
+    "BAAI/bge-small-en": np.array([-0.0232, -0.0255, 0.0174, -0.0639, -0.0006]),
+    "BAAI/bge-small-en-v1.5": np.array([0.01522374, -0.02271799, 0.00860278, -0.07424029, 0.00386434]),
+    "BAAI/bge-small-zh-v1.5": np.array([-0.01023294, 0.07634465, 0.0691722, -0.04458365, -0.03160762]),
+    "BAAI/bge-base-en": np.array([0.0115, 0.0372, 0.0295, 0.0121, 0.0346]),
     "BAAI/bge-base-en-v1.5": np.array([0.01129394, 0.05493144, 0.02615099, 0.00328772, 0.02996045]),
-    "sentence-transformers/all-MiniLM-L6-v2": np.array([0.0259, 0.0058, 0.0114, 0.0380, -0.0233]),
-    "intfloat/multilingual-e5-large": np.array([0.0098, 0.0045, 0.0066, -0.0354, 0.0070]),
+    "sentence-transformers/all-MiniLM-L6-v2": np.array([0.0259, 0.0058, 0.0114, 0.0380, -0.0233]),
+    "intfloat/multilingual-e5-large": np.array([0.0098, 0.0045, 0.0066, -0.0354, 0.0070]),
+    "jinaai/jina-embeddings-v2-small-en": np.array([-0.0455, -0.0428, -0.0122, 0.0613, 0.0015]),
+    "jinaai/jina-embeddings-v2-base-en": np.array([-0.0332, -0.0509, 0.0287, -0.0043, -0.0077]),
 }
 
 
-def test_default_embedding():
+@pytest.mark.parametrize('embedding_class', [DefaultEmbedding, JinaEmbedding])
+def test_embedding(embedding_class):
     is_ubuntu_ci = os.getenv("IS_UBUNTU_CI")
 
-    for model_desc in Embedding.list_supported_models():
+    for model_desc in embedding_class.list_supported_models():
         if is_ubuntu_ci == "false" and model_desc["size_in_GB"] > 1:
             continue
 
         dim = model_desc["dim"]
-        model = DefaultEmbedding(model_name=model_desc["model"])
+        model = embedding_class(model_name=model_desc["model"])
 
         docs = ["hello world", "flag embedding"]
         embeddings = list(model.embed(docs))
@@ -35,18 +37,20 @@ def test_default_embedding():
         assert np.allclose(embeddings[0, :canonical_vector.shape[0]], canonical_vector, atol=1e-3), model_desc["model"]
 
 
-def test_batch_embedding():
-    model = DefaultEmbedding()
+@pytest.mark.parametrize('n_dims,embedding_class', [(384, DefaultEmbedding), (768, JinaEmbedding)])
+def test_batch_embedding(n_dims, embedding_class):
+    model = embedding_class()
 
     docs = ["hello world", "flag embedding"] * 100
     embeddings = list(model.embed(docs, batch_size=10))
     embeddings = np.stack(embeddings, axis=0)
 
-    assert embeddings.shape == (200, 384)
+    assert embeddings.shape == (200, n_dims)
 
 
-def test_parallel_processing():
-    model = DefaultEmbedding()
+@pytest.mark.parametrize('n_dims,embedding_class', [(384, DefaultEmbedding), (768, JinaEmbedding)])
+def test_parallel_processing(n_dims, embedding_class):
+    model = embedding_class()
 
     docs = ["hello world", "flag embedding"] * 100
     embeddings = list(model.embed(docs, batch_size=10, parallel=2))
@@ -58,6 +62,6 @@ def test_parallel_processing():
     embeddings_3 = list(model.embed(docs, batch_size=10, parallel=0))
     embeddings_3 = np.stack(embeddings_3, axis=0)
 
-    assert embeddings.shape == (200, 384)
+    assert embeddings.shape == (200, n_dims)
     assert np.allclose(embeddings, embeddings_2, atol=1e-3)
     assert np.allclose(embeddings, embeddings_3, atol=1e-3)
