From 6a48ace9b951baeb44fa51c2b76b479a596ad8e5 Mon Sep 17 00:00:00 2001 From: Massimiliano Pippi Date: Tue, 15 Nov 2022 09:54:55 +0100 Subject: [PATCH] BREAKING CHANGE: remove Milvus1DocumentStore along with support for Milvus < 2.x (#3552) * remove milvus1 * leftover * revert deprecation process --- .github/workflows/tests.yml | 4 +- README.md | 8 +- conftest.py | 4 +- haystack/document_stores/__init__.py | 5 +- .../document_stores/{milvus2.py => milvus.py} | 16 +- haystack/document_stores/milvus1.py | 698 ------------------ haystack/utils/doc_store.py | 37 - pyproject.toml | 7 - test/conftest.py | 40 +- test/document_stores/test_document_store.py | 38 +- test/document_stores/test_sql_based.py | 26 +- test/nodes/test_retriever.py | 10 +- test/pipelines/test_eval.py | 8 +- test/pipelines/test_standard_pipelines.py | 8 +- 14 files changed, 64 insertions(+), 845 deletions(-) rename haystack/document_stores/{milvus2.py => milvus.py} (97%) delete mode 100644 haystack/document_stores/milvus1.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 07fbb124d0..db6ed614ee 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -437,7 +437,7 @@ jobs: env: TOKENIZERS_PARALLELISM: 'false' run: | - pytest ${{ env.PYTEST_PARAMS }} -m "not elasticsearch and not faiss and not milvus and not milvus1 and not weaviate and not pinecone and not integration" test/${{ matrix.folder }} --document_store_type=memory + pytest ${{ env.PYTEST_PARAMS }} -m "not elasticsearch and not faiss and not milvus and not weaviate and not pinecone and not integration" test/${{ matrix.folder }} --document_store_type=memory - uses: act10ns/slack@v1 with: @@ -483,7 +483,7 @@ jobs: env: TOKENIZERS_PARALLELISM: 'false' run: | - pytest ${{ env.PYTEST_PARAMS }} -m "not elasticsearch and not faiss and not milvus and not milvus1 and not weaviate and not pinecone and not integration" ${{ env.SUITES_EXCLUDED_FROM_WINDOWS }} test/${{ matrix.folder }} --document_store_type=memory + pytest ${{ env.PYTEST_PARAMS }} -m "not elasticsearch and not faiss and not milvus and not weaviate and not pinecone and not integration" ${{ env.SUITES_EXCLUDED_FROM_WINDOWS }} test/${{ matrix.folder }} --document_store_type=memory - uses: act10ns/slack@v1 with: diff --git a/README.md b/README.md index 8f8575ae1d..c9f44d6420 100644 --- a/README.md +++ b/README.md @@ -58,13 +58,13 @@ Haystack is built in a modular fashion so that you can combine the best technolo | | | | --------------------------------------------------------------------------------------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | -| :ledger: [Docs](https://docs.haystack.deepset.ai) | Components, Pipeline Nodes, Guides, API Reference | +| :ledger: [Docs](https://docs.haystack.deepset.ai) | Components, Pipeline Nodes, Guides, API Reference | | :floppy_disk: [Installation](https://github.com/deepset-ai/haystack#floppy_disk-installation) | How to install Haystack | | :mortar_board: [Tutorials](https://github.com/deepset-ai/haystack#mortar_board-tutorials) | See what Haystack can do with our Notebooks & Scripts | | :beginner: [Quick Demo](https://github.com/deepset-ai/haystack#beginner-quick-demo) | Deploy a Haystack application with Docker Compose and a REST API | | :vulcan_salute: 
[Community](https://github.com/deepset-ai/haystack#vulcan_salute-community) | [Discord](https://haystack.deepset.ai/community/join), [Twitter](https://twitter.com/deepset_ai), [Stack Overflow](https://stackoverflow.com/questions/tagged/haystack), [GitHub Discussions](https://github.com/deepset-ai/haystack/discussions) | | :heart: [Contributing](https://github.com/deepset-ai/haystack#heart-contributing) | We welcome all contributions! | -| :bar_chart: [Benchmarks](https://haystack.deepset.ai/benchmarks/) | Speed & Accuracy of Retriever, Readers and DocumentStores | +| :bar_chart: [Benchmarks](https://haystack.deepset.ai/benchmarks/) | Speed & Accuracy of Retriever, Readers and DocumentStores | | :telescope: [Roadmap](https://haystack.deepset.ai/overview/roadmap) | Public roadmap of Haystack | | :newspaper: [Blog](https://medium.com/deepset-ai) | Read our articles on Medium | | :phone: [Jobs](https://www.deepset.ai/jobs) | We're hiring! Have a look at our open positions | @@ -96,8 +96,8 @@ pip install -e '.[all]' ## or 'all-gpu' for the GPU-enabled dependencies ``` If you cannot upgrade `pip` to version 21.3 or higher, you will need to replace: -- `'.[all]'` with `'.[sql,only-faiss,only-milvus1,weaviate,graphdb,crawler,preprocessing,ocr,onnx,ray,dev]'` -- `'.[all-gpu]'` with `'.[sql,only-faiss-gpu,only-milvus1,weaviate,graphdb,crawler,preprocessing,ocr,onnx-gpu,ray,dev]'` +- `'.[all]'` with `'.[sql,only-faiss,only-milvus,weaviate,graphdb,crawler,preprocessing,ocr,onnx,ray,dev]'` +- `'.[all-gpu]'` with `'.[sql,only-faiss-gpu,only-milvus,weaviate,graphdb,crawler,preprocessing,ocr,onnx-gpu,ray,dev]'` For an complete list of the dependency groups available, have a look at the `haystack/pyproject.toml` file. diff --git a/conftest.py b/conftest.py index 8bc7705521..5b0b95df45 100644 --- a/conftest.py +++ b/conftest.py @@ -1,8 +1,6 @@ def pytest_addoption(parser): parser.addoption( - "--document_store_type", - action="store", - default="elasticsearch, faiss, sql, memory, milvus1, milvus, weaviate, pinecone", + "--document_store_type", action="store", default="elasticsearch, faiss, sql, memory, milvus, weaviate, pinecone" ) parser.addoption( "--mock-dc", action="store_true", default=True, help="Mock HTTP requests to dC while running tests" diff --git a/haystack/document_stores/__init__.py b/haystack/document_stores/__init__.py index 6ed752c961..a31a863bdd 100644 --- a/haystack/document_stores/__init__.py +++ b/haystack/document_stores/__init__.py @@ -20,10 +20,7 @@ SQLDocumentStore = safe_import("haystack.document_stores.sql", "SQLDocumentStore", "sql") FAISSDocumentStore = safe_import("haystack.document_stores.faiss", "FAISSDocumentStore", "faiss") PineconeDocumentStore = safe_import("haystack.document_stores.pinecone", "PineconeDocumentStore", "pinecone") -if os.getenv("MILVUS1_ENABLED"): - MilvusDocumentStore = safe_import("haystack.document_stores.milvus1", "Milvus1DocumentStore", "milvus1") -else: - MilvusDocumentStore = safe_import("haystack.document_stores.milvus2", "Milvus2DocumentStore", "milvus") +MilvusDocumentStore = safe_import("haystack.document_stores.milvus", "MilvusDocumentStore", "milvus") WeaviateDocumentStore = safe_import("haystack.document_stores.weaviate", "WeaviateDocumentStore", "weaviate") GraphDBKnowledgeGraph = safe_import("haystack.document_stores.graphdb", "GraphDBKnowledgeGraph", "graphdb") InMemoryKnowledgeGraph = safe_import( diff --git a/haystack/document_stores/milvus2.py b/haystack/document_stores/milvus.py similarity index 97% rename from 
haystack/document_stores/milvus2.py rename to haystack/document_stores/milvus.py index d0f1a70ef0..05d723be92 100644 --- a/haystack/document_stores/milvus2.py +++ b/haystack/document_stores/milvus.py @@ -24,7 +24,7 @@ logger = logging.getLogger(__name__) -class Milvus2DocumentStore(SQLDocumentStore): +class MilvusDocumentStore(SQLDocumentStore): """ Limitations: Milvus 2.0 so far doesn't support the deletion of documents (https://github.com/milvus-io/milvus/issues/7130). @@ -251,7 +251,7 @@ def write_documents( :return: """ if headers: - raise NotImplementedError("Milvus2DocumentStore does not support headers.") + raise NotImplementedError("MilvusDocumentStore does not support headers.") index = index or self.index index_param = index_param or self.index_param @@ -412,7 +412,7 @@ def query_by_embedding( :return: """ if headers: - raise NotImplementedError("Milvus2DocumentStore does not support headers.") + raise NotImplementedError("MilvusDocumentStore does not support headers.") index = index or self.index has_collection = utility.has_collection(collection_name=index) @@ -468,7 +468,7 @@ def delete_documents( :return: None """ if headers: - raise NotImplementedError("Milvus2DocumentStore does not support headers.") + raise NotImplementedError("MilvusDocumentStore does not support headers.") if ids: self._delete_vector_ids_from_milvus(ids=ids, index=index) @@ -529,7 +529,7 @@ def get_all_documents_generator( :param batch_size: When working with large number of documents, batching can help reduce memory footprint. """ if headers: - raise NotImplementedError("Milvus2DocumentStore does not support headers.") + raise NotImplementedError("MilvusDocumentStore does not support headers.") index = index or self.index documents = super().get_all_documents_generator(index=index, filters=filters, batch_size=batch_size) @@ -560,7 +560,7 @@ def get_all_documents( :param batch_size: When working with large number of documents, batching can help reduce memory footprint. """ if headers: - raise NotImplementedError("Milvus2DocumentStore does not support headers.") + raise NotImplementedError("MilvusDocumentStore does not support headers.") index = index or self.index result = self.get_all_documents_generator( @@ -580,7 +580,7 @@ def get_document_by_id( DocumentStore's default index (self.index) will be used. """ if headers: - raise NotImplementedError("Milvus2DocumentStore does not support headers.") + raise NotImplementedError("MilvusDocumentStore does not support headers.") documents = self.get_documents_by_id([id], index) document = documents[0] if documents else None @@ -602,7 +602,7 @@ def get_documents_by_id( :param batch_size: When working with large number of documents, batching can help reduce memory footprint. 
""" if headers: - raise NotImplementedError("Milvus2DocumentStore does not support headers.") + raise NotImplementedError("MilvusDocumentStore does not support headers.") index = index or self.index documents = super().get_documents_by_id(ids=ids, index=index, batch_size=batch_size) diff --git a/haystack/document_stores/milvus1.py b/haystack/document_stores/milvus1.py deleted file mode 100644 index f91fe3dea3..0000000000 --- a/haystack/document_stores/milvus1.py +++ /dev/null @@ -1,698 +0,0 @@ -from typing import Any, Dict, Generator, List, Optional, Union - -import logging -import warnings -import numpy as np -from tqdm import tqdm - -try: - from milvus import IndexType, MetricType, Milvus, Status - from haystack.document_stores.sql import SQLDocumentStore -except (ImportError, ModuleNotFoundError) as ie: - from haystack.utils.import_utils import _optional_component_not_installed - - _optional_component_not_installed(__name__, "milvus", ie) - -from haystack.schema import Document -from haystack.document_stores.base import get_batches_from_generator -from haystack.nodes.retriever import DenseRetriever - - -logger = logging.getLogger(__name__) - - -class Milvus1DocumentStore(SQLDocumentStore): - """ - Milvus (https://milvus.io/) is a highly reliable, scalable Document Store specialized on storing and processing vectors. - Therefore, it is particularly suited for Haystack users that work with dense retrieval methods (like DPR). - In contrast to FAISS, Milvus ... - - runs as a separate service (e.g. a Docker container) and can scale easily in a distributed environment - - allows dynamic data management (i.e. you can insert/delete vectors without recreating the whole index) - - encapsulates multiple ANN libraries (FAISS, ANNOY ...) - - This class uses Milvus for all vector related storage, processing and querying. - The meta-data (e.g. for filtering) and the document text are however stored in a separate SQL Database as Milvus - does not allow these data types (yet). - - Usage: - 1. Start a Milvus server (see https://milvus.io/docs/v1.0.0/install_milvus.md) - 2. Run pip install farm-haystack[milvus1] - 3. Init a MilvusDocumentStore in Haystack - """ - - def __init__( - self, - sql_url: str = "sqlite:///", - milvus_url: str = "tcp://localhost:19530", - connection_pool: str = "SingletonThread", - index: str = "document", - vector_dim: Optional[int] = None, - embedding_dim: int = 768, - index_file_size: int = 1024, - similarity: str = "dot_product", - index_type: IndexType = IndexType.FLAT, - index_param: Optional[Dict[str, Any]] = None, - search_param: Optional[Dict[str, Any]] = None, - return_embedding: bool = False, - embedding_field: str = "embedding", - progress_bar: bool = True, - duplicate_documents: str = "overwrite", - isolation_level: Optional[str] = None, - ): - """ - **WARNING:** Milvus1DocumentStore is deprecated and will be removed in a future version. Please switch to Milvus2 - or consider using another DocumentStore. - - :param sql_url: SQL connection URL for storing document texts and metadata. It defaults to a local, file based SQLite DB. For large scale - deployment, Postgres is recommended. If using MySQL then same server can also be used for - Milvus metadata. For more details see https://milvus.io/docs/v1.0.0/data_manage.md. - :param milvus_url: Milvus server connection URL for storing and processing vectors. - Protocol, host and port will automatically be inferred from the URL. 
- See https://milvus.io/docs/v1.0.0/install_milvus.md for instructions to start a Milvus instance. - :param connection_pool: Connection pool type to connect with Milvus server. Default: "SingletonThread". - :param index: Index name for text, embedding and metadata (in Milvus terms, this is the "collection name"). - :param vector_dim: Deprecated. Use embedding_dim instead. - :param embedding_dim: The embedding vector size. Default: 768. - :param index_file_size: Specifies the size of each segment file that is stored by Milvus and its default value is 1024 MB. - When the size of newly inserted vectors reaches the specified volume, Milvus packs these vectors into a new segment. - Milvus creates one index file for each segment. When conducting a vector search, Milvus searches all index files one by one. - As a rule of thumb, we would see a 30% ~ 50% increase in the search performance after changing the value of index_file_size from 1024 to 2048. - Note that an overly large index_file_size value may cause failure to load a segment into the memory or graphics memory. - (From https://milvus.io/docs/v1.0.0/performance_faq.md#How-can-I-get-the-best-performance-from-Milvus-through-setting-index_file_size) - :param similarity: The similarity function used to compare document vectors. 'dot_product' is the default and recommended for DPR embeddings. - 'cosine' is recommended for Sentence Transformers. - :param index_type: Type of approximate nearest neighbour (ANN) index used. The choice here determines your tradeoff between speed and accuracy. - Some popular options: - - FLAT (default): Exact method, slow - - IVF_FLAT, inverted file based heuristic, fast - - HSNW: Graph based, fast - - ANNOY: Tree based, fast - See: https://milvus.io/docs/v1.0.0/index.md - :param index_param: Configuration parameters for the chose index_type needed at indexing time. - For example: {"nlist": 16384} as the number of cluster units to create for index_type IVF_FLAT. - See https://milvus.io/docs/v1.0.0/index.md - :param search_param: Configuration parameters for the chose index_type needed at query time - For example: {"nprobe": 10} as the number of cluster units to query for index_type IVF_FLAT. - See https://milvus.io/docs/v1.0.0/index.md - :param return_embedding: To return document embedding. - :param embedding_field: Name of field containing an embedding vector. - :param progress_bar: Whether to show a tqdm progress bar or not. - Can be helpful to disable in production deployments to keep the logs clean. - :param duplicate_documents: Handle duplicates document based on parameter options. - Parameter options : ( 'skip','overwrite','fail') - skip: Ignore the duplicates documents - overwrite: Update any existing documents with the same ID when adding documents. - fail: an error is raised if the document ID of the document being added already - exists. - :param isolation_level: see SQLAlchemy's `isolation_level` parameter for `create_engine()` (https://docs.sqlalchemy.org/en/14/core/engines.html#sqlalchemy.create_engine.params.isolation_level) - """ - deprecation_message = ( - "Milvus1DocumentStore is deprecated and will be removed in a future version. " - "Please consider switching to Milvus2 or to another DocumentStore." 
- ) - warnings.warn(message=deprecation_message, category=FutureWarning, stacklevel=3) - - super().__init__( - url=sql_url, index=index, duplicate_documents=duplicate_documents, isolation_level=isolation_level - ) - - self.milvus_server = Milvus(uri=milvus_url, pool=connection_pool) - - if vector_dim is not None: - warnings.warn( - message="The 'vector_dim' parameter is deprecated, use 'embedding_dim' instead.", - category=DeprecationWarning, - stacklevel=2, - ) - self.embedding_dim = vector_dim - else: - self.embedding_dim = embedding_dim - - self.index_file_size = index_file_size - - if similarity in ("dot_product", "cosine"): - self.metric_type = MetricType.IP - self.similarity = similarity - elif similarity == "l2": - self.metric_type = MetricType.L2 - self.similarity = similarity - else: - raise ValueError( - "The Milvus document store can currently only support dot_product, cosine and L2 similarity. " - 'Please set similarity="dot_product", "cosine" or "l2"' - ) - - self.index_type = index_type - self.index_param = index_param or {"nlist": 16384} - self.search_param = search_param or {"nprobe": 10} - self.index = index - self._create_collection_and_index_if_not_exist(self.index) - self.return_embedding = return_embedding - self.embedding_field = embedding_field - self.progress_bar = progress_bar - - def __del__(self): - return self.milvus_server.close() - - def _create_collection_and_index_if_not_exist( - self, index: Optional[str] = None, index_param: Optional[Dict[str, Any]] = None - ): - index = index or self.index - index_param = index_param or self.index_param - - status, ok = self.milvus_server.has_collection(collection_name=index) - if not ok: - collection_param = { - "collection_name": index, - "dimension": self.embedding_dim, - "index_file_size": self.index_file_size, - "metric_type": self.metric_type, - } - - status = self.milvus_server.create_collection(collection_param) - if status.code != Status.SUCCESS: - raise RuntimeError(f"Collection creation on Milvus server failed: {status}") - - status = self.milvus_server.create_index(index, self.index_type, index_param) - if status.code != Status.SUCCESS: - raise RuntimeError(f"Index creation on Milvus server failed: {status}") - - def _create_document_field_map(self) -> Dict: - return {self.index: self.embedding_field} - - def write_documents( - self, - documents: Union[List[dict], List[Document]], - index: Optional[str] = None, - batch_size: int = 10_000, - duplicate_documents: Optional[str] = None, - headers: Optional[Dict[str, str]] = None, - index_param: Optional[Dict[str, Any]] = None, - ): - """ - Add new documents to the DocumentStore. - - :param documents: List of `Dicts` or List of `Documents`. If they already contain the embeddings, we'll index - them right away in Milvus. If not, you can later call update_embeddings() to create & index them. - :param index: (SQL) index name for storing the docs and metadata - :param batch_size: When working with large number of documents, batching can help reduce memory footprint. - :param duplicate_documents: Handle duplicates document based on parameter options. - Parameter options : ( 'skip','overwrite','fail') - skip: Ignore the duplicates documents - overwrite: Update any existing documents with the same ID when adding documents. - fail: an error is raised if the document ID of the document being added already - exists. 
- :raises DuplicateDocumentError: Exception trigger on duplicate document - :return: None - """ - if headers: - raise NotImplementedError("MilvusDocumentStore does not support headers.") - - index = index or self.index - index_param = index_param or self.index_param - duplicate_documents = duplicate_documents or self.duplicate_documents - assert ( - duplicate_documents in self.duplicate_documents_options - ), f"duplicate_documents parameter must be {', '.join(self.duplicate_documents_options)}" - self._create_collection_and_index_if_not_exist(index) - field_map = self._create_document_field_map() - - if len(documents) == 0: - logger.warning("Calling DocumentStore.write_documents() with empty list") - return - - document_objects = [Document.from_dict(d, field_map=field_map) if isinstance(d, dict) else d for d in documents] - document_objects = self._handle_duplicate_documents( - documents=document_objects, index=index, duplicate_documents=duplicate_documents - ) - add_vectors = False if document_objects[0].embedding is None else True - - batched_documents = get_batches_from_generator(document_objects, batch_size) - with tqdm(total=len(document_objects), disable=not self.progress_bar) as progress_bar: - for document_batch in batched_documents: - vector_ids = [] - if add_vectors: - doc_ids = [] - embeddings = [] - for doc in document_batch: - doc_ids.append(doc.id) - if isinstance(doc.embedding, np.ndarray): - if self.similarity == "cosine": - self.normalize_embedding(doc.embedding) - embeddings.append(doc.embedding.tolist()) - elif isinstance(doc.embedding, list): - if self.similarity == "cosine": - # temp conversion to ndarray - np_embedding = np.array(doc.embedding, dtype="float32") - self.normalize_embedding(np_embedding) - embeddings.append(np_embedding.tolist()) - else: - embeddings.append(doc.embedding) - else: - raise AttributeError( - f"Format of supplied document embedding {type(doc.embedding)} is not " - f"supported. Please use list or numpy.ndarray" - ) - - if duplicate_documents == "overwrite": - existing_docs = super().get_documents_by_id(ids=doc_ids, index=index, batch_size=batch_size) - self._delete_vector_ids_from_milvus(documents=existing_docs, index=index) - - status, vector_ids = self.milvus_server.insert(collection_name=index, records=embeddings) - if status.code != Status.SUCCESS: - raise RuntimeError(f"Vector embedding insertion failed: {status}") - - docs_to_write_in_sql = [] - for idx, doc in enumerate(document_batch): - meta = doc.meta - if add_vectors: - meta["vector_id"] = vector_ids[idx] - docs_to_write_in_sql.append(doc) - - super().write_documents(docs_to_write_in_sql, index=index, duplicate_documents=duplicate_documents) - progress_bar.update(batch_size) - progress_bar.close() - - self.milvus_server.flush([index]) - if duplicate_documents == "overwrite": - self.milvus_server.compact(collection_name=index) - - # Milvus index creating should happen after the creation of the collection and after the insertion - # of documents for maximum efficiency. 
- # See (https://github.com/milvus-io/milvus/discussions/4939#discussioncomment-809303) - status = self.milvus_server.create_index(index, self.index_type, index_param) - if status.code != Status.SUCCESS: - raise RuntimeError(f"Index creation on Milvus server failed: {status}") - - def update_embeddings( - self, - retriever: DenseRetriever, - index: Optional[str] = None, - batch_size: int = 10_000, - update_existing_embeddings: bool = True, - filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore - ): - """ - Updates the embeddings in the the document store using the encoding model specified in the retriever. - This can be useful if want to add or change the embeddings for your documents (e.g. after changing the retriever config). - - :param retriever: Retriever to use to get embeddings for text - :param index: (SQL) index name for storing the docs and metadata - :param batch_size: When working with large number of documents, batching can help reduce memory footprint. - :param update_existing_embeddings: Whether to update existing embeddings of the documents. If set to False, - only documents without embeddings are processed. This mode can be used for - incremental updating of embeddings, wherein, only newly indexed documents - get processed. - :param filters: Optional filters to narrow down the documents for which embeddings are to be updated. - Example: {"name": ["some", "more"], "category": ["only_one"]} - :return: None - """ - index = index or self.index - self._create_collection_and_index_if_not_exist(index) - - result = self._query( - index=index, - vector_ids=None, - batch_size=batch_size, - filters=filters, - only_documents_without_embedding=not update_existing_embeddings, - ) - - document_count = len(result) - if document_count == 0: - logger.warning("Calling DocumentStore.update_embeddings() on an empty index") - return - - logger.info("Updating embeddings for %s docs...", document_count) - - batched_documents = get_batches_from_generator(result, batch_size) - with tqdm( - total=document_count, disable=not self.progress_bar, position=0, unit=" docs", desc="Updating Embedding" - ) as progress_bar: - for document_batch in batched_documents: - self._delete_vector_ids_from_milvus(documents=document_batch, index=index) - - embeddings = retriever.embed_documents(document_batch) - self._validate_embeddings_shape( - embeddings=embeddings, num_documents=len(document_batch), embedding_dim=self.embedding_dim - ) - - if self.similarity == "cosine": - self.normalize_embedding(embeddings) - - status, vector_ids = self.milvus_server.insert(collection_name=index, records=embeddings.tolist()) - if status.code != Status.SUCCESS: - raise RuntimeError(f"Vector embedding insertion failed: {status}") - - vector_id_map = {} - for vector_id, doc in zip(vector_ids, document_batch): - vector_id_map[doc.id] = vector_id - - self.update_vector_ids(vector_id_map, index=index) - progress_bar.set_description_str("Documents Processed") - progress_bar.update(batch_size) - - self.milvus_server.flush([index]) - self.milvus_server.compact(collection_name=index) - - def query_by_embedding( - self, - query_emb: np.ndarray, - filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore - top_k: int = 10, - index: Optional[str] = None, - return_embedding: Optional[bool] = None, - headers: Optional[Dict[str, str]] = None, - scale_score: bool = True, - ) -> List[Document]: - """ - Find the document that is most similar 
to the provided `query_emb` by using a vector similarity metric. - - :param query_emb: Embedding of the query (e.g. gathered from DPR) - :param filters: Optional filters to narrow down the search space. - Example: {"name": ["some", "more"], "category": ["only_one"]} - :param top_k: How many documents to return - :param index: (SQL) index name for storing the docs and metadata - :param return_embedding: To return document embedding - :param scale_score: Whether to scale the similarity score to the unit interval (range of [0,1]). - If true (default) similarity scores (e.g. cosine or dot_product) which naturally have a different value range will be scaled to a range of [0,1], where 1 means extremely relevant. - Otherwise raw similarity scores (e.g. cosine or dot_product) will be used. - :return: list of Documents that are the most similar to `query_emb` - """ - if headers: - raise NotImplementedError("MilvusDocumentStore does not support headers.") - - if filters: - logger.warning("Query filters are not implemented for the MilvusDocumentStore.") - - index = index or self.index - status, ok = self.milvus_server.has_collection(collection_name=index) - if status.code != Status.SUCCESS: - raise RuntimeError(f"Milvus has collection check failed: {status}") - if not ok: - raise Exception("No index exists. Use 'update_embeddings()` to create an index.") - - if return_embedding is None: - return_embedding = self.return_embedding - index = index or self.index - - if self.similarity == "cosine": - self.normalize_embedding(query_emb) - - query_emb = query_emb.reshape(1, -1).astype(np.float32) - - status, search_result = self.milvus_server.search( - collection_name=index, query_records=query_emb, top_k=top_k, params=self.search_param - ) - if status.code != Status.SUCCESS: - raise RuntimeError(f"Vector embedding search failed: {status}") - - vector_ids_for_query = [] - scores_for_vector_ids: Dict[str, float] = {} - for vector_id_list, distance_list in zip(search_result.id_array, search_result.distance_array): - for vector_id, distance in zip(vector_id_list, distance_list): - vector_ids_for_query.append(str(vector_id)) - scores_for_vector_ids[str(vector_id)] = distance - - documents = self.get_documents_by_vector_ids(vector_ids_for_query, index=index) - - if return_embedding: - self._populate_embeddings_to_docs(index=index, docs=documents) - - for doc in documents: - score = scores_for_vector_ids[doc.meta["vector_id"]] - if scale_score: - score = self.scale_to_unit_interval(score, self.similarity) - doc.score = score - - return documents - - def delete_all_documents( - self, - index: Optional[str] = None, - filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore - headers: Optional[Dict[str, str]] = None, - ): - """ - Delete all documents (from SQL AND Milvus). - :param index: (SQL) index name for storing the docs and metadata - :param filters: Optional filters to narrow down the search space. - Example: {"name": ["some", "more"], "category": ["only_one"]} - :return: None - """ - if headers: - raise NotImplementedError("MilvusDocumentStore does not support headers.") - - logger.warning( - """DEPRECATION WARNINGS: - 1. 
delete_all_documents() method is deprecated, please use delete_documents method - For more details, please refer to the issue: https://github.com/deepset-ai/haystack/issues/1045 - """ - ) - self.delete_documents(index, None, filters) - - def delete_documents( - self, - index: Optional[str] = None, - ids: Optional[List[str]] = None, - filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore - headers: Optional[Dict[str, str]] = None, - ): - """ - Delete documents in an index. All documents are deleted if no filters are passed. - - :param index: Index name to delete the document from. If None, the - DocumentStore's default index (self.index) will be used. - :param ids: Optional list of IDs to narrow down the documents to be deleted. - :param filters: Optional filters to narrow down the documents to be deleted. - Example filters: {"name": ["some", "more"], "category": ["only_one"]}. - If filters are provided along with a list of IDs, this method deletes the - intersection of the two query results (documents that match the filters and - have their ID in the list). - :return: None - """ - if headers: - raise NotImplementedError("MilvusDocumentStore does not support headers.") - - index = index or self.index - status, ok = self.milvus_server.has_collection(collection_name=index) - if status.code != Status.SUCCESS: - raise RuntimeError(f"Milvus has collection check failed: {status}") - if ok: - if not filters and not ids: - status = self.milvus_server.drop_collection(collection_name=index) - if status.code != Status.SUCCESS: - raise RuntimeError(f"Milvus drop collection failed: {status}") - else: - affected_docs = super().get_all_documents(filters=filters, index=index) - if ids: - affected_docs = [doc for doc in affected_docs if doc.id in ids] - self._delete_vector_ids_from_milvus(documents=affected_docs, index=index) - - self.milvus_server.flush([index]) - self.milvus_server.compact(collection_name=index) - - # Delete from SQL at the end to allow the above .get_all_documents() to work properly - super().delete_documents(index=index, ids=ids, filters=filters) - - def delete_index(self, index: str): - """ - Delete an existing index. The index including all data will be removed. - - :param index: The name of the index to delete. - :return: None - """ - if index == self.index: - logger.warning( - f"Deletion of default index '{index}' detected. " - f"If you plan to use this index again, please reinstantiate '{self.__class__.__name__}' in order to avoid side-effects." - ) - self.milvus_server.drop_collection(index) - super().delete_index(index) - - def get_all_documents_generator( - self, - index: Optional[str] = None, - filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore - return_embedding: Optional[bool] = None, - batch_size: int = 10_000, - headers: Optional[Dict[str, str]] = None, - ) -> Generator[Document, None, None]: - """ - Get all documents from the document store. Under-the-hood, documents are fetched in batches from the - document store and yielded as individual documents. This method can be used to iteratively process - a large number of documents without having to load all documents in memory. - - :param index: Name of the index to get the documents from. If None, the - DocumentStore's default index (self.index) will be used. - :param filters: Optional filters to narrow down the documents to return. 
- Example: {"name": ["some", "more"], "category": ["only_one"]} - :param return_embedding: Whether to return the document embeddings. - :param batch_size: When working with large number of documents, batching can help reduce memory footprint. - """ - if headers: - raise NotImplementedError("MilvusDocumentStore does not support headers.") - - index = index or self.index - documents = super().get_all_documents_generator(index=index, filters=filters, batch_size=batch_size) - if return_embedding is None: - return_embedding = self.return_embedding - - for doc in documents: - if return_embedding: - self._populate_embeddings_to_docs(index=index, docs=[doc]) - yield doc - - def get_all_documents( - self, - index: Optional[str] = None, - filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore - return_embedding: Optional[bool] = None, - batch_size: int = 10_000, - headers: Optional[Dict[str, str]] = None, - ) -> List[Document]: - """ - Get documents from the document store (optionally using filter criteria). - - :param index: Name of the index to get the documents from. If None, the - DocumentStore's default index (self.index) will be used. - :param filters: Optional filters to narrow down the documents to return. - Example: {"name": ["some", "more"], "category": ["only_one"]} - :param return_embedding: Whether to return the document embeddings. - :param batch_size: When working with large number of documents, batching can help reduce memory footprint. - """ - if headers: - raise NotImplementedError("MilvusDocumentStore does not support headers.") - - index = index or self.index - result = self.get_all_documents_generator( - index=index, filters=filters, return_embedding=return_embedding, batch_size=batch_size - ) - documents = list(result) - return documents - - def get_document_by_id( - self, id: str, index: Optional[str] = None, headers: Optional[Dict[str, str]] = None - ) -> Optional[Document]: - """ - Fetch a document by specifying its text id string - - :param id: ID of the document - :param index: Name of the index to get the documents from. If None, the - DocumentStore's default index (self.index) will be used. - """ - if headers: - raise NotImplementedError("MilvusDocumentStore does not support headers.") - - documents = self.get_documents_by_id([id], index) - document = documents[0] if documents else None - return document - - def get_documents_by_id( - self, - ids: List[str], - index: Optional[str] = None, - batch_size: int = 10_000, - headers: Optional[Dict[str, str]] = None, - ) -> List[Document]: - """ - Fetch multiple documents by specifying their IDs (strings) - - :param ids: List of IDs of the documents - :param index: Name of the index to get the documents from. If None, the - DocumentStore's default index (self.index) will be used. 
- :param batch_size: is currently not used - """ - if headers: - raise NotImplementedError("MilvusDocumentStore does not support headers.") - - index = index or self.index - documents = super().get_documents_by_id(ids=ids, index=index, batch_size=batch_size) - if self.return_embedding: - self._populate_embeddings_to_docs(index=index, docs=documents) - - return documents - - def _populate_embeddings_to_docs(self, docs: List[Document], index: Optional[str] = None): - index = index or self.index - docs_with_vector_ids = [] - for doc in docs: - if doc.meta and doc.meta.get("vector_id") is not None: - docs_with_vector_ids.append(doc) - - if len(docs_with_vector_ids) == 0: - return - - ids = [int(doc.meta.get("vector_id")) for doc in docs_with_vector_ids] # type: ignore - status, vector_embeddings = self.milvus_server.get_entity_by_id(collection_name=index, ids=ids) - if status.code != Status.SUCCESS: - raise RuntimeError(f"Getting vector embedding by id failed: {status}") - - for embedding, doc in zip(vector_embeddings, docs_with_vector_ids): - doc.embedding = np.array(embedding, dtype="float32") - - def _delete_vector_ids_from_milvus(self, documents: List[Document], index: Optional[str] = None): - index = index or self.index - existing_vector_ids = [] - for doc in documents: - if "vector_id" in doc.meta: - existing_vector_ids.append(int(doc.meta["vector_id"])) - if len(existing_vector_ids) > 0: - status = self.milvus_server.delete_entity_by_id(collection_name=index, id_array=existing_vector_ids) - if status.code != Status.SUCCESS: - raise RuntimeError(f"Existing vector ids deletion failed: {status}") - - def get_all_vectors(self, index: Optional[str] = None) -> List[np.ndarray]: - """ - Helper function to dump all vectors stored in Milvus server. - - :param index: Name of the index to get the documents from. If None, the - DocumentStore's default index (self.index) will be used. - :return: List[np.array]: List of vectors. - """ - index = index or self.index - status, collection_info = self.milvus_server.get_collection_stats(collection_name=index) - if not status.OK(): - logger.info("Failed fetch stats from store ...") - return list() - - logger.debug("collection_info = %s", collection_info) - - ids = list() - partition_list = collection_info["partitions"] - for partition in partition_list: - segment_list = partition["segments"] - for segment in segment_list: - segment_name = segment["name"] - status, id_list = self.milvus_server.list_id_in_segment( - collection_name=index, segment_name=segment_name - ) - logger.debug("%s: segment %s has %s vectors ...", status, segment_name, len(id_list)) - ids.extend(id_list) - - if len(ids) == 0: - logger.info("No documents in the store ...") - return list() - - status, vectors = self.milvus_server.get_entity_by_id(collection_name=index, ids=ids) - if not status.OK(): - logger.info("Failed fetch document for ids %s from store ...", ids) - return list() - - return vectors - - def get_embedding_count( - self, - index: Optional[str] = None, - filters: Optional[Dict[str, Any]] = None, # TODO: Adapt type once we allow extended filters in MilvusDocStore - ) -> int: - """ - Return the count of embeddings in the document store. 
- """ - if filters: - raise Exception("filters are not supported for get_embedding_count in MilvusDocumentStore.") - index = index or self.index - _, embedding_count = self.milvus_server.count_entities(index) - if embedding_count is None: - embedding_count = 0 - return embedding_count diff --git a/haystack/utils/doc_store.py b/haystack/utils/doc_store.py index 020e49670f..ccdb715542 100644 --- a/haystack/utils/doc_store.py +++ b/haystack/utils/doc_store.py @@ -11,7 +11,6 @@ logger = logging.getLogger(__name__) ELASTICSEARCH_CONTAINER_NAME = "elasticsearch" OPENSEARCH_CONTAINER_NAME = "opensearch" -MILVUS1_CONTAINER_NAME = "milvus1" WEAVIATE_CONTAINER_NAME = "weaviate" @@ -103,10 +102,6 @@ def stop_elasticsearch(delete_container=False): stop_container(ELASTICSEARCH_CONTAINER_NAME, delete_container) -def stop_milvus(delete_container=False): - stop_container(MILVUS1_CONTAINER_NAME, delete_container) - - def stop_weaviate(delete_container=False): stop_container(WEAVIATE_CONTAINER_NAME, delete_container) @@ -117,8 +112,6 @@ def stop_service(document_store, delete_container=False): stop_opensearch(delete_container) elif "ElasticsearchDocumentStore" in ds_class: stop_elasticsearch(delete_container) - elif "MilvusDocumentStore" in ds_class: - stop_milvus(delete_container) elif "WeaviateDocumentStore" in ds_class: stop_weaviate(delete_container) else: @@ -149,33 +142,3 @@ def launch_milvus(sleep=15, delete_existing=False): ) else: time.sleep(sleep) - - -def launch_milvus1(sleep=15): - """ - Start a Milvus (version <2.0.0) server via Docker - """ - - logger.debug("Starting Milvus ...") - logger.warning( - "Automatic Milvus config creation not yet implemented. " - "If you are starting Milvus using launch_milvus(), " - "make sure you have a properly populated milvus/conf folder. " - "See (https://milvus.io/docs/v1.0.0/milvus_docker-cpu.md) for more details." - ) - status = subprocess.run( - [ - f"docker start {MILVUS1_CONTAINER_NAME} > /dev/null 2>&1 || docker run -d --name {MILVUS1_CONTAINER_NAME} \ - -p 19530:19530 \ - -p 19121:19121 \ - milvusdb/milvus:1.1.0-cpu-d050721-5e559c" - ], - shell=True, - ) - if status.returncode: - logger.warning( - "Tried to start Milvus through Docker but this failed. " - "It is likely that there is already an existing Milvus instance running. 
" - ) - else: - time.sleep(sleep) diff --git a/pyproject.toml b/pyproject.toml index 04a9f6d0b9..f735d368c6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -111,12 +111,6 @@ only-faiss-gpu = [ faiss-gpu = [ "farm-haystack[sql,only-faiss-gpu]", ] -only-milvus1 = [ - "pymilvus<2.0.0", # Refer milvus version support matrix at https://github.com/milvus-io/pymilvus#install-pymilvus -] -milvus1 = [ - "farm-haystack[sql,only-milvus1]", -] only-milvus = [ "pymilvus>=2.0.0,<3", # Refer milvus version support matrix at https://github.com/milvus-io/pymilvus#install-pymilvus ] @@ -346,7 +340,6 @@ markers = [ "pinecone: requires Pinecone credentials", "faiss: uses FAISS", "milvus: requires a Milvus 2 setup", - "milvus1: requires a Milvus 1 container", "opensearch", "document_store", ] diff --git a/test/conftest.py b/test/conftest.py index a00edc490c..b6fcfc3c21 100644 --- a/test/conftest.py +++ b/test/conftest.py @@ -73,13 +73,6 @@ except (ImportError, ModuleNotFoundError) as ie: _optional_component_not_installed("test", "test", ie) -try: - from milvus import Milvus - - milvus1 = True -except ImportError: - milvus1 = False - from .mocks import pinecone as pinecone_mock @@ -141,7 +134,7 @@ def pytest_collection_modifyitems(config, items): "ocr": [pytest.mark.ocr, pytest.mark.integration], "elasticsearch": [pytest.mark.elasticsearch], "faiss": [pytest.mark.faiss], - "milvus": [pytest.mark.milvus, pytest.mark.milvus1], + "milvus": [pytest.mark.milvus], "weaviate": [pytest.mark.weaviate], "pinecone": [pytest.mark.pinecone], # FIXME GraphDB can't be treated as a regular docstore, it fails most of their tests @@ -173,14 +166,6 @@ def pytest_collection_modifyitems(config, items): ) item.add_marker(skip_docstore) - if "milvus1" == required_doc_store and not milvus1: - skip_milvus1 = pytest.mark.skip(reason="Skipping Tests for 'milvus1', as Milvus2 seems to be installed.") - item.add_marker(skip_milvus1) - - elif "milvus" == required_doc_store and milvus1: - skip_milvus = pytest.mark.skip(reason="Skipping Tests for 'milvus', as Milvus1 seems to be installed.") - item.add_marker(skip_milvus) - def infer_required_doc_store(item, keywords): # assumption: a test runs only with one document_store @@ -189,7 +174,7 @@ def infer_required_doc_store(item, keywords): # 2. if the test name contains the docstore name, we use that # 3. 
use an arbitrary one by calling set.pop() required_doc_store = None - all_doc_stores = {"elasticsearch", "faiss", "sql", "memory", "milvus1", "milvus", "weaviate", "pinecone"} + all_doc_stores = {"elasticsearch", "faiss", "sql", "memory", "milvus", "weaviate", "pinecone"} docstore_markers = set(keywords).intersection(all_doc_stores) if len(docstore_markers) > 1: # if parameterized infer the docstore from the parameter @@ -822,7 +807,7 @@ def mock_pinecone(monkeypatch): monkeypatch.setattr(f"pinecone.{cname}", class_, raising=False) -@pytest.fixture(params=["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate", "pinecone"]) +@pytest.fixture(params=["elasticsearch", "faiss", "memory", "milvus", "weaviate", "pinecone"]) def document_store_with_docs(request, docs, tmp_path, monkeypatch): if request.param == "pinecone": mock_pinecone(monkeypatch) @@ -849,7 +834,7 @@ def document_store(request, tmp_path, monkeypatch: pytest.MonkeyPatch): document_store.delete_index(document_store.index) -@pytest.fixture(params=["memory", "faiss", "milvus1", "milvus", "elasticsearch", "pinecone"]) +@pytest.fixture(params=["memory", "faiss", "milvus", "elasticsearch", "pinecone"]) def document_store_dot_product(request, tmp_path, monkeypatch): if request.param == "pinecone": mock_pinecone(monkeypatch) @@ -865,7 +850,7 @@ def document_store_dot_product(request, tmp_path, monkeypatch): document_store.delete_index(document_store.index) -@pytest.fixture(params=["memory", "faiss", "milvus1", "milvus", "elasticsearch", "pinecone", "weaviate"]) +@pytest.fixture(params=["memory", "faiss", "milvus", "elasticsearch", "pinecone", "weaviate"]) def document_store_dot_product_with_docs(request, docs, tmp_path, monkeypatch): if request.param == "pinecone": mock_pinecone(monkeypatch) @@ -882,7 +867,7 @@ def document_store_dot_product_with_docs(request, docs, tmp_path, monkeypatch): document_store.delete_index(document_store.index) -@pytest.fixture(params=["elasticsearch", "faiss", "memory", "milvus1", "pinecone"]) +@pytest.fixture(params=["elasticsearch", "faiss", "memory", "milvus", "pinecone"]) def document_store_dot_product_small(request, tmp_path, monkeypatch): if request.param == "pinecone": mock_pinecone(monkeypatch) @@ -898,7 +883,7 @@ def document_store_dot_product_small(request, tmp_path, monkeypatch): document_store.delete_index(document_store.index) -@pytest.fixture(params=["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate", "pinecone"]) +@pytest.fixture(params=["elasticsearch", "faiss", "memory", "milvus", "weaviate", "pinecone"]) def document_store_small(request, tmp_path, monkeypatch): if request.param == "pinecone": mock_pinecone(monkeypatch) @@ -998,17 +983,6 @@ def get_document_store( isolation_level="AUTOCOMMIT", ) - elif document_store_type == "milvus1": - document_store = MilvusDocumentStore( - embedding_dim=embedding_dim, - sql_url=get_sql_url(tmp_path), - return_embedding=True, - embedding_field=embedding_field, - index=index, - similarity=similarity, - isolation_level="AUTOCOMMIT", - ) - elif document_store_type == "milvus": document_store = MilvusDocumentStore( embedding_dim=embedding_dim, diff --git a/test/document_stores/test_document_store.py b/test/document_stores/test_document_store.py index 8b2045899c..2b51665783 100644 --- a/test/document_stores/test_document_store.py +++ b/test/document_stores/test_document_store.py @@ -72,7 +72,7 @@ def test_write_with_duplicate_doc_ids(document_store: BaseDocumentStore): @pytest.mark.parametrize( - "document_store", 
["elasticsearch", "faiss", "memory", "milvus1", "weaviate", "pinecone"], indirect=True + "document_store", ["elasticsearch", "faiss", "memory", "milvus", "weaviate", "pinecone"], indirect=True ) def test_write_with_duplicate_doc_ids_custom_index(document_store: BaseDocumentStore): duplicate_documents = [ @@ -392,9 +392,7 @@ def test_write_document_index(document_store: BaseDocumentStore): assert len(document_store.get_all_documents()) == 0 -@pytest.mark.parametrize( - "document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True -) +@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus", "weaviate"], indirect=True) def test_document_with_embeddings(document_store: BaseDocumentStore): documents = [ {"content": "text1", "id": "1", "embedding": np.random.rand(768).astype(np.float32)}, @@ -414,9 +412,7 @@ def test_document_with_embeddings(document_store: BaseDocumentStore): assert isinstance(documents_with_embedding[0].embedding, (list, np.ndarray)) -@pytest.mark.parametrize( - "document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate"], indirect=True -) +@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus", "weaviate"], indirect=True) @pytest.mark.parametrize("retriever", ["embedding"], indirect=True) def test_update_embeddings(document_store, retriever): documents = [] @@ -635,7 +631,7 @@ def test_delete_documents_by_id_with_filters(document_store_with_docs): # exclude weaviate because it does not support storing labels -@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "pinecone"], indirect=True) +@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus", "pinecone"], indirect=True) def test_labels(document_store: BaseDocumentStore): label = Label( query="question1", @@ -751,7 +747,7 @@ def test_labels_with_long_texts(document_store: BaseDocumentStore): # exclude weaviate because it does not support storing labels -@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "pinecone"], indirect=True) +@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus", "pinecone"], indirect=True) def test_multilabel(document_store: BaseDocumentStore): labels = [ Label( @@ -852,7 +848,7 @@ def test_multilabel(document_store: BaseDocumentStore): # exclude weaviate because it does not support storing labels -@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1", "pinecone"], indirect=True) +@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus", "pinecone"], indirect=True) def test_multilabel_no_answer(document_store: BaseDocumentStore): labels = [ Label( @@ -1088,7 +1084,7 @@ def test_multilabel_meta_aggregations(document_store: BaseDocumentStore): @pytest.mark.parametrize( - "document_store", ["elasticsearch", "faiss", "milvus1", "weaviate", "pinecone", "memory"], indirect=True + "document_store", ["elasticsearch", "faiss", "milvus", "weaviate", "pinecone", "memory"], indirect=True ) def test_update_meta(document_store: BaseDocumentStore): documents = [ @@ -1148,7 +1144,7 @@ def test_get_meta_values_by_key(document_store: BaseDocumentStore): @pytest.mark.parametrize( - "document_store_with_docs", ["memory", "faiss", "milvus1", "weaviate", "elasticsearch"], indirect=True + "document_store_with_docs", ["memory", "faiss", "milvus", "weaviate", "elasticsearch"], 
indirect=True ) @pytest.mark.embedding_dim(384) def test_similarity_score_sentence_transformers(document_store_with_docs): @@ -1172,7 +1168,7 @@ def test_similarity_score_sentence_transformers(document_store_with_docs): @pytest.mark.parametrize( - "document_store_with_docs", ["memory", "faiss", "milvus1", "weaviate", "elasticsearch"], indirect=True + "document_store_with_docs", ["memory", "faiss", "milvus", "weaviate", "elasticsearch"], indirect=True ) @pytest.mark.embedding_dim(384) def test_similarity_score(document_store_with_docs): @@ -1191,7 +1187,7 @@ def test_similarity_score(document_store_with_docs): @pytest.mark.parametrize( - "document_store_with_docs", ["memory", "faiss", "milvus1", "weaviate", "elasticsearch"], indirect=True + "document_store_with_docs", ["memory", "faiss", "milvus", "weaviate", "elasticsearch"], indirect=True ) @pytest.mark.embedding_dim(384) def test_similarity_score_without_scaling(document_store_with_docs): @@ -1212,7 +1208,7 @@ def test_similarity_score_without_scaling(document_store_with_docs): @pytest.mark.parametrize( - "document_store_dot_product_with_docs", ["memory", "faiss", "milvus1", "elasticsearch", "weaviate"], indirect=True + "document_store_dot_product_with_docs", ["memory", "faiss", "milvus", "elasticsearch", "weaviate"], indirect=True ) @pytest.mark.embedding_dim(384) def test_similarity_score_dot_product(document_store_dot_product_with_docs): @@ -1231,7 +1227,7 @@ def test_similarity_score_dot_product(document_store_dot_product_with_docs): @pytest.mark.parametrize( - "document_store_dot_product_with_docs", ["memory", "faiss", "milvus1", "elasticsearch", "weaviate"], indirect=True + "document_store_dot_product_with_docs", ["memory", "faiss", "milvus", "elasticsearch", "weaviate"], indirect=True ) @pytest.mark.embedding_dim(384) def test_similarity_score_dot_product_without_scaling(document_store_dot_product_with_docs): @@ -1312,7 +1308,7 @@ def test_elasticsearch_brownfield_support(document_store_with_docs): @pytest.mark.parametrize( - "document_store", ["faiss", "milvus1", "milvus", "weaviate", "opensearch", "elasticsearch", "memory"], indirect=True + "document_store", ["faiss", "milvus", "weaviate", "opensearch", "elasticsearch", "memory"], indirect=True ) def test_cosine_similarity(document_store: BaseDocumentStore): # below we will write documents to the store and then query it to see if vectors were normalized or not @@ -1337,7 +1333,6 @@ def test_cosine_similarity(document_store: BaseDocumentStore): # embeddings of document stores which only support dot product out of the box must be normalized if ( isinstance(document_store, (FAISSDocumentStore, MilvusDocumentStore, WeaviateDocumentStore)) - or type(document_store).name == "Milvus1DocumentStore" or isinstance(document_store, OpenSearchDocumentStore) and document_store.knn_engine == "faiss" ): @@ -1354,7 +1349,7 @@ def test_cosine_similarity(document_store: BaseDocumentStore): @pytest.mark.parametrize( - "document_store", ["faiss", "milvus1", "milvus", "weaviate", "opensearch", "elasticsearch", "memory"], indirect=True + "document_store", ["faiss", "milvus", "weaviate", "opensearch", "elasticsearch", "memory"], indirect=True ) def test_update_embeddings_cosine_similarity(document_store: BaseDocumentStore): # below we will write documents to the store and then query it to see if vectors were normalized @@ -1396,7 +1391,6 @@ def embed_documents(self, docs): # embeddings of document stores which only support dot product out of the box must be normalized if ( isinstance(document_store, 
(FAISSDocumentStore, MilvusDocumentStore, WeaviateDocumentStore))
-        or type(document_store).__name__ == "Milvus1DocumentStore"
         or isinstance(document_store, OpenSearchDocumentStore)
         and document_store.knn_engine == "faiss"
     ):
@@ -1413,9 +1407,7 @@ def embed_documents(self, docs):
 
 
 @pytest.mark.parametrize(
-    "document_store_small",
-    ["faiss", "milvus1", "milvus", "weaviate", "memory", "elasticsearch", "opensearch"],
-    indirect=True,
+    "document_store_small", ["faiss", "milvus", "weaviate", "memory", "elasticsearch", "opensearch"], indirect=True
 )
 def test_cosine_sanity_check(document_store_small):
     VEC_1 = np.array([0.1, 0.2, 0.3], dtype="float32")
diff --git a/test/document_stores/test_sql_based.py b/test/document_stores/test_sql_based.py
index 6cf73f931e..45c857ae6e 100644
--- a/test/document_stores/test_sql_based.py
+++ b/test/document_stores/test_sql_based.py
@@ -45,7 +45,7 @@
 
 @pytest.mark.integration
 @pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
-@pytest.mark.parametrize("document_store", ["faiss", "milvus1", "milvus"], indirect=True)
+@pytest.mark.parametrize("document_store", ["faiss", "milvus"], indirect=True)
 @pytest.mark.parametrize("batch_size", [4, 6])
 def test_update_docs(document_store, retriever, batch_size):
     # initial write
@@ -68,7 +68,7 @@ def test_update_docs(document_store, retriever, batch_size):
 
 @pytest.mark.integration
 @pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
-@pytest.mark.parametrize("document_store", ["milvus1", "milvus", "faiss"], indirect=True)
+@pytest.mark.parametrize("document_store", ["milvus", "faiss"], indirect=True)
 def test_update_existing_docs(document_store, retriever):
     document_store.duplicate_documents = "overwrite"
     old_document = Document(content="text_1")
@@ -96,7 +96,7 @@
 
 @pytest.mark.integration
 @pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
-@pytest.mark.parametrize("document_store", ["faiss", "milvus1", "milvus"], indirect=True)
+@pytest.mark.parametrize("document_store", ["faiss", "milvus"], indirect=True)
 def test_update_with_empty_store(document_store, retriever):
     # Call update with empty doc store
     document_store.update_embeddings(retriever=retriever)
@@ -111,7 +111,7 @@
 
 @pytest.mark.integration
 @pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
-@pytest.mark.parametrize("document_store", ["faiss", "milvus1", "milvus"], indirect=True)
+@pytest.mark.parametrize("document_store", ["faiss", "milvus"], indirect=True)
 def test_finding(document_store, retriever):
     document_store.write_documents(DOCUMENTS)
     pipe = DocumentSearchPipeline(retriever=retriever)
@@ -123,7 +123,7 @@
 
 @pytest.mark.integration
 @pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
-@pytest.mark.parametrize("document_store", ["faiss", "milvus1", "milvus"], indirect=True)
+@pytest.mark.parametrize("document_store", ["faiss", "milvus"], indirect=True)
 def test_delete_docs_with_filters_multivalue(document_store, retriever):
     document_store.write_documents(DOCUMENTS)
     document_store.update_embeddings(retriever=retriever, batch_size=4)
@@ -139,7 +139,7 @@
 
 @pytest.mark.integration
 @pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
-@pytest.mark.parametrize("document_store", ["faiss", "milvus1", "milvus"], indirect=True)
+@pytest.mark.parametrize("document_store", ["faiss", "milvus"], indirect=True)
 def test_delete_docs_with_filters(document_store, retriever):
     document_store.write_documents(DOCUMENTS)
     document_store.update_embeddings(retriever=retriever, batch_size=4)
@@ -155,7 +155,7 @@
 
 @pytest.mark.integration
 @pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
-@pytest.mark.parametrize("document_store", ["faiss", "milvus1", "milvus"], indirect=True)
+@pytest.mark.parametrize("document_store", ["faiss", "milvus"], indirect=True)
 def test_delete_docs_with_many_filters(document_store, retriever):
     document_store.write_documents(DOCUMENTS)
     document_store.update_embeddings(retriever=retriever, batch_size=4)
@@ -171,7 +171,7 @@
 
 @pytest.mark.integration
 @pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
-@pytest.mark.parametrize("document_store", ["faiss", "milvus1", "milvus"], indirect=True)
+@pytest.mark.parametrize("document_store", ["faiss", "milvus"], indirect=True)
 def test_delete_docs_by_id(document_store, retriever):
     document_store.write_documents(DOCUMENTS)
     document_store.update_embeddings(retriever=retriever, batch_size=4)
@@ -191,7 +191,7 @@
 
 @pytest.mark.integration
 @pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
-@pytest.mark.parametrize("document_store", ["faiss", "milvus1", "milvus"], indirect=True)
+@pytest.mark.parametrize("document_store", ["faiss", "milvus"], indirect=True)
 def test_delete_docs_by_id_with_filters(document_store, retriever):
     document_store.write_documents(DOCUMENTS)
     document_store.update_embeddings(retriever=retriever, batch_size=4)
@@ -217,7 +217,7 @@
 
 @pytest.mark.integration
 @pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
-@pytest.mark.parametrize("document_store", ["faiss", "milvus1", "milvus"], indirect=True)
+@pytest.mark.parametrize("document_store", ["faiss", "milvus"], indirect=True)
 def test_get_docs_with_filters_one_value(document_store, retriever):
     document_store.write_documents(DOCUMENTS)
     document_store.update_embeddings(retriever=retriever, batch_size=4)
@@ -231,7 +231,7 @@
 
 @pytest.mark.integration
 @pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
-@pytest.mark.parametrize("document_store", ["faiss", "milvus1", "milvus"], indirect=True)
+@pytest.mark.parametrize("document_store", ["faiss", "milvus"], indirect=True)
 def test_get_docs_with_filters_many_values(document_store, retriever):
     document_store.write_documents(DOCUMENTS)
     document_store.update_embeddings(retriever=retriever, batch_size=4)
@@ -245,7 +245,7 @@
 
 @pytest.mark.integration
 @pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
-@pytest.mark.parametrize("document_store", ["faiss", "milvus1", "milvus"], indirect=True)
+@pytest.mark.parametrize("document_store", ["faiss", "milvus"], indirect=True)
 def test_get_docs_with_many_filters(document_store, retriever):
     document_store.write_documents(DOCUMENTS)
     document_store.update_embeddings(retriever=retriever, batch_size=4)
@@ -261,7 +261,7 @@
 
 @pytest.mark.integration
 @pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
-@pytest.mark.parametrize("document_store", ["faiss", "milvus1", "milvus"], indirect=True)
+@pytest.mark.parametrize("document_store", ["faiss", "milvus"], indirect=True)
 def test_pipeline(document_store, retriever):
     documents = [
         {"name": "name_1", "content": "text_1", "embedding": np.random.rand(768).astype(np.float32)},
diff --git a/test/nodes/test_retriever.py b/test/nodes/test_retriever.py
index 822dae21b7..94919482af 100644
--- a/test/nodes/test_retriever.py
+++ b/test/nodes/test_retriever.py
@@ -36,15 +36,15 @@
         ("mdr", "elasticsearch"),
         ("mdr", "faiss"),
         ("mdr", "memory"),
-        ("mdr", "milvus1"),
+        ("mdr", "milvus"),
         ("dpr", "elasticsearch"),
         ("dpr", "faiss"),
         ("dpr", "memory"),
-        ("dpr", "milvus1"),
+        ("dpr", "milvus"),
         ("embedding", "elasticsearch"),
         ("embedding", "faiss"),
         ("embedding", "memory"),
-        ("embedding", "milvus1"),
+        ("embedding", "milvus"),
         ("elasticsearch", "elasticsearch"),
         ("es_filter_only", "elasticsearch"),
         ("tfidf", "memory"),
@@ -215,7 +215,7 @@ def test_elasticsearch_custom_query():
 
 @pytest.mark.integration
 @pytest.mark.parametrize(
-    "document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate", "pinecone"], indirect=True
+    "document_store", ["elasticsearch", "faiss", "memory", "milvus", "weaviate", "pinecone"], indirect=True
 )
 @pytest.mark.parametrize("retriever", ["dpr"], indirect=True)
 def test_dpr_embedding(document_store: BaseDocumentStore, retriever, docs_with_ids):
@@ -239,7 +239,7 @@ def test_dpr_embedding(document_store: BaseDocumentStore, retriever, docs_with_i
 
 @pytest.mark.integration
 @pytest.mark.parametrize(
-    "document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate", "pinecone"], indirect=True
+    "document_store", ["elasticsearch", "faiss", "memory", "milvus", "weaviate", "pinecone"], indirect=True
 )
 @pytest.mark.parametrize("retriever", ["retribert"], indirect=True)
 @pytest.mark.embedding_dim(128)
diff --git a/test/pipelines/test_eval.py b/test/pipelines/test_eval.py
index 037c165cef..9ff14dc50c 100644
--- a/test/pipelines/test_eval.py
+++ b/test/pipelines/test_eval.py
@@ -82,7 +82,7 @@ def test_summarizer_calculate_metrics(document_store_with_docs: ElasticsearchDoc
     assert metrics["Summarizer"]["ndcg"] == pytest.approx(0.9461, 1e-4)
 
 
-@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1"], indirect=True)
+@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus"], indirect=True)
 @pytest.mark.parametrize("batch_size", [None, 20])
 def test_add_eval_data(document_store, batch_size):
     # add eval data (SQUAD format)
@@ -129,7 +129,7 @@ def test_add_eval_data(document_store, batch_size):
         assert doc.content[start:end] == "France"
 
 
-@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1"], indirect=True)
+@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus"], indirect=True)
 @pytest.mark.parametrize("reader", ["farm"], indirect=True)
 @pytest.mark.parametrize("use_confidence_scores", [True, False])
 def test_eval_reader(reader, document_store, use_confidence_scores):
@@ -222,7 +222,7 @@ def test_eval_pipeline(document_store, reader, retriever):
     assert eval_reader.top_k_em == eval_reader_vanila.top_k_em
 
 
-@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1"], indirect=True)
+@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus"], indirect=True)
 def test_eval_data_split_word(document_store):
     # splitting by word
     preprocessor = PreProcessor(
@@ -247,7 +247,7 @@ def test_eval_data_split_word(document_store):
     assert len(set(labels[0].document_ids)) == 2
 
 
-@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus1"], indirect=True)
+@pytest.mark.parametrize("document_store", ["elasticsearch", "faiss", "memory", "milvus"], indirect=True)
 def test_eval_data_split_passage(document_store):
     # splitting by passage
     preprocessor = PreProcessor(
diff --git a/test/pipelines/test_standard_pipelines.py b/test/pipelines/test_standard_pipelines.py
index 409633ac69..880980afe2 100644
--- a/test/pipelines/test_standard_pipelines.py
+++ b/test/pipelines/test_standard_pipelines.py
@@ -23,7 +23,7 @@
 
 @pytest.mark.parametrize(
     "retriever,document_store",
-    [("embedding", "memory"), ("embedding", "faiss"), ("embedding", "milvus1"), ("embedding", "elasticsearch")],
+    [("embedding", "memory"), ("embedding", "faiss"), ("embedding", "milvus"), ("embedding", "elasticsearch")],
     indirect=True,
 )
 def test_faq_pipeline(retriever, document_store):
@@ -76,7 +76,7 @@ def test_faq_pipeline_batch(retriever, document_store):
 
 @pytest.mark.parametrize("retriever", ["embedding"], indirect=True)
 @pytest.mark.parametrize(
-    "document_store", ["elasticsearch", "faiss", "memory", "milvus1", "milvus", "weaviate", "pinecone"], indirect=True
+    "document_store", ["elasticsearch", "faiss", "memory", "milvus", "weaviate", "pinecone"], indirect=True
 )
 def test_document_search_pipeline(retriever, document_store):
     documents = [
@@ -169,7 +169,7 @@ def test_documentsearch_document_store_authentication(retriever_with_docs, docum
 
 @pytest.mark.parametrize(
     "retriever,document_store",
-    [("embedding", "faiss"), ("embedding", "milvus1"), ("embedding", "elasticsearch")],
+    [("embedding", "faiss"), ("embedding", "milvus"), ("embedding", "elasticsearch")],
     indirect=True,
 )
 def test_most_similar_documents_pipeline(retriever, document_store):
@@ -201,7 +201,7 @@
 
 
 @pytest.mark.parametrize(
-    "retriever,document_store", [("embedding", "milvus1"), ("embedding", "elasticsearch")], indirect=True
+    "retriever,document_store", [("embedding", "milvus"), ("embedding", "elasticsearch")], indirect=True
 )
 def test_most_similar_documents_pipeline_with_filters(retriever, document_store):
     documents = [