diff --git a/.vscode/markdown.code-snippets b/.vscode/markdown.code-snippets index d9b5255ec..96fcdccea 100644 --- a/.vscode/markdown.code-snippets +++ b/.vscode/markdown.code-snippets @@ -143,94 +143,6 @@ "description": "Adds the tabs section for code examples" }, - "Quiz variable": { - "prefix": "docusaurus-academy-quiz-def", - "scope": "markdown", - "body": [ - "", - "", - "import Quiz from '/src/components/Academy/quiz.js'", - "const ${varName} = [{", - " questionText: '${questionText}',", - " answerOptions: [", - " {", - " answerText: '${answerOne}',", - " isCorrect: false,", - " feedback: '${feedbackOne}',", - " },", - " {", - " answerText: '${answerTwo}',", - " isCorrect: false,", - " feedback: '${feedbackTwo}',", - " },", - " {", - " answerText: '${answerThree}',", - " isCorrect: false,", - " feedback: '${feedbackThree}',", - " },", - " ]", - "}];" - ] - }, - "Academy-Top-lvl-head": { - "prefix": "docusaurus-academy-heading", - "scope": "markdown", - "body": [ - "##   Top level head" - ] - }, - "Academy-Theory-subheading": { - "prefix": "docusaurus-academy-theory-subheading", - "scope": "markdown", - "body": [ - "###   Theory subhead" - ] - }, - "Academy-Practical-subheading": { - "prefix": "docusaurus-academy-practical-subheading", - "scope": "markdown", - "body": [ - "###   Practical subhead" - ] - }, - "Academy-exercise-admonition": { - "prefix": "docusaurus-academy-exercise-admonition", - "scope": "markdown", - "body": [ - ":::note Exercise", - "Try out the above query again, with these changes.", - "- xxx", - "- xxx", - ":::" - ] - }, - "Academy-Python-GQL-code": { - "prefix": "docusaurus-academy-python-tabs-code", - "scope": "markdown", - "body": [ - "import Tabs from '@theme/Tabs';", - "import TabItem from '@theme/TabItem';", - "", - "", - "", - "", - "```python", - "", - "```", - "", - "", - "", - "", - "```graphql", - "", - "```", - "", - "", - "" - ], - "description": "Adds the tabs section for Academy python/graphql code examples" - }, - 
"Related-Pages": { "prefix": "docusaurus-admonition-related-pages-info", "scope": "markdown", diff --git a/docs/academy/_customization/index.md b/docs/academy/_customization/index.md deleted file mode 100644 index 5d870fa4c..000000000 --- a/docs/academy/_customization/index.md +++ /dev/null @@ -1,20 +0,0 @@ ---- -title: 2. Customizing Weaviate -sidebar_position: 200 ---- - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; -import CourseUnits from '/src/components/Academy/courseUnits.jsx'; -import { courseData } from '/src/components/Academy/courseData.js' - -## Course overview - -TBC - -## Learning objectives - - - -## Units - - \ No newline at end of file diff --git a/docs/academy/_snippets/academy.clients.graphql.raw.mdx b/docs/academy/_snippets/academy.clients.graphql.raw.mdx deleted file mode 100644 index 1d7be890d..000000000 --- a/docs/academy/_snippets/academy.clients.graphql.raw.mdx +++ /dev/null @@ -1,43 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -import weaviate - -client = weaviate.Client("https://WEAVIATE_INSTANCE_URL") # Replace WEAVIATE_INSTANCE_URL with your instance URL - -query = ''' -{ - Get { - WikiArticle { - title - wiki_summary - } - } -} -''' - -result = client.query.raw(query) - -print(result) -``` - - - - -```go -TBC -``` - - - - -```java -TBC -``` - - - diff --git a/docs/academy/_snippets/example_schema.mdx b/docs/academy/_snippets/example_schema.mdx deleted file mode 100644 index 26dd0f880..000000000 --- a/docs/academy/_snippets/example_schema.mdx +++ /dev/null @@ -1,83 +0,0 @@ -
- - Example schema - -
- -```json -{ - "classes": [ - { - "class": "Category", - "description": "A Jeopardy! category", - ... - "properties": [ - { - "dataType": [ - "text" - ], - "description": "The title of the category", - "name": "title", - "tokenization": "word" - } - ], - ... - "vectorizer": "text2vec-openai" - }, - { - "class": "Question", - ... - "properties": [ - { - "dataType": [ - "text" - ], - "description": "Question asked to the contestant", - ... - "name": "question", - "tokenization": "word" - }, - { - "dataType": [ - "text" - ], - "description": "Answer provided by the contestant", - ... - "name": "answer", - "tokenization": "word" - }, - { - "dataType": [ - "int" - ], - "description": "Points that the question was worth", - ... - "name": "points" - }, - { - "dataType": [ - "text" - ], - "description": "Jeopardy round", - ... - "name": "round" - }, - { - "dataType": [ - "Category" - ], - "description": "The category of the question", - ... - "name": "hasCategory" - } - ... - ], - ... - "vectorizer": "text2vec-openai" - } - ] -} -``` - -
-
\ No newline at end of file diff --git a/docs/academy/_snippets/preview.mdx b/docs/academy/_snippets/preview.mdx deleted file mode 100644 index f7dcdb5bc..000000000 --- a/docs/academy/_snippets/preview.mdx +++ /dev/null @@ -1,5 +0,0 @@ -:::note Preview unit -This is a preview version of this unit. -So some sections are not yet complete - such as videos and quiz questions. -Please check back later for the full version, and in the meantime, feel free to provide any feedback through the comments below. -::: \ No newline at end of file diff --git a/docs/academy/deployment/_category_.json b/docs/academy/deployment/_category_.json deleted file mode 100644 index ae3b74b9a..000000000 --- a/docs/academy/deployment/_category_.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "label": "Deployment", - "position": 50 -} \ No newline at end of file diff --git a/docs/academy/deployment/k8s/10_kubernetes_basics.mdx b/docs/academy/deployment/k8s/10_kubernetes_basics.mdx deleted file mode 100644 index 4fd96a922..000000000 --- a/docs/academy/deployment/k8s/10_kubernetes_basics.mdx +++ /dev/null @@ -1,122 +0,0 @@ ---- -title: Create a Kubernetes cluster ---- - -**Kubernetes** is an open-source *container orchestration* platform. It means that you can use Kubernetes to deploy, scale, and manage [containerized](https://www.docker.com/resources/what-container/) applications. - -In production, Kubernetes is used to manage a cluster of nodes, such as those located on cloud providers like AWS, Google Cloud, or Azure, or on your own on-premises infrastructure. - -Here, you learn how to do this using **Minikube**, a handy tool that allows you to run a Kubernetes cluster on your local machine for development and testing purposes. - -:::info What if I have a production Kubernetes cluster? -If you are looking to deploy Weaviate on a production Kubernetes cluster, the steps are similar. You will just need to replace Minikube with your production Kubernetes cluster. 
- -As there are many variations in Kubernetes deployments, we recommend you follow the deployment instructions provided by your cloud provider or Kubernetes distribution for this purpose. -::: - -## Prerequisites - -For this tutorial, you will need the following tools: - -- **Minikube**: a tool that runs a Kubernetes cluster on your local machine. It is used to test Kubernetes deployments locally. -- **kubectl**: the Kubernetes command-line tool. It is used to interact with the Kubernetes cluster. -- **Docker**: a platform that allows you to develop, ship, and run applications in containers. We install Docker as the Minikube [*driver*](https://minikube.sigs.k8s.io/docs/drivers/) to virtualize the Kubernetes cluster on your local machine. Your Minikube may use a different driver, but Docker is the most common one. -- **Helm**: a package manager for Kubernetes. It is used to install and manage applications on Kubernetes. - -### Installation - -If you don't have any of these tools installed, you can follow the installation instructions for each, by following the links below: - -- [Minikube](https://minikube.sigs.k8s.io/docs/start/) -- [kubectl](https://kubernetes.io/docs/tasks/tools/#kubectl) -- [Docker](https://docs.docker.com/get-docker/) -- [helm](https://helm.sh/docs/intro/install/) - -Once you've done so, confirm installation by running the following commands: - -```bash -minikube version -kubectl version --client -docker --version -helm version -``` - -You should see an output like: - -```bash -minikube version: v1.32.0 -commit: 8220a6eb95f0a4d75f7f2d7b14cef975f050512d -Client Version: v1.28.2 -Kustomize Version: v5.0.4-0.20230601165947-6ce0bf390ce3 -Docker version 24.0.7, build afdd53b -version.BuildInfo{Version:"v3.12.2", GitCommit:"1e210a2c8cc5117d1055bfaa5d40f51bbc2e345e", GitTreeState:"clean", GoVersion:"go1.20.6"} -``` - -If you see similar output, you are ready to proceed. - -Note that the versions may differ from the ones shown above. 
Please make sure you have the latest versions installed. - -## Run Minikube - -You are now ready to start Minikube. Run the following command: - -```bash -minikube start -``` - -Then, you might see an output like this: - -```bash -😄 minikube v1.32.0 on Darwin 14.4.1 (arm64) -✨ Automatically selected the docker driver. Other choices: vmware, ssh -📌 Using Docker Desktop driver with root privileges -👍 Starting control plane node minikube in cluster minikube -🚜 Pulling base image ... -💾 Downloading Kubernetes v1.28.3 preload ... - > preloaded-images-k8s-v18-v1...: 341.16 MiB / 341.16 MiB 100.00% 42.00 M - > gcr.io/k8s-minikube/kicbase...: 410.56 MiB / 410.58 MiB 100.00% 42.28 M -🔥 Creating docker container (CPUs=2, Memory=8100MB) ... -🐳 Preparing Kubernetes v1.28.3 on Docker 24.0.7 ... - ▪ Generating certificates and keys ... - ▪ Booting up control plane ... - ▪ Configuring RBAC rules ... -🔗 Configuring bridge CNI (Container Networking Interface) ... -🔎 Verifying Kubernetes components... - ▪ Using image gcr.io/k8s-minikube/storage-provisioner:v5 -🌟 Enabled addons: storage-provisioner, default-storageclass -🏄 Done! kubectl is now configured to use "minikube" cluster and "default" namespace by default -``` - -Great! You have started a Kubernetes cluster using Minikube. Note the last comment that says `kubectl is now configured to use "minikube" cluster and "default" namespace by default`. So, you can now use `kubectl` to interact with the Kubernetes cluster. - -### Verify the cluster - -If you run the following command: - -```bash -kubectl get pods -A -``` - -This will show you the pods running in the cluster. 
You should see something like this: - -```bash -NAMESPACE NAME READY STATUS RESTARTS AGE -kube-system coredns-5dd5756b68-qhfch 1/1 Running 0 26s -kube-system etcd-minikube 1/1 Running 0 42s -kube-system kube-apiserver-minikube 1/1 Running 0 40s -kube-system kube-controller-manager-minikube 1/1 Running 0 42s -kube-system kube-proxy-xwdgf 1/1 Running 0 26s -kube-system kube-scheduler-minikube 1/1 Running 0 40s -kube-system storage-provisioner 1/1 Running 0 39s -``` - -If your output is similar to the one above, then congratulations! You have successfully spun up a Kubernetes cluster on your local machine using Minikube. - -Next, you will learn how to deploy Weaviate to the Kubernetes cluster using Helm. - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/deployment/k8s/30_setup_weaviate.mdx b/docs/academy/deployment/k8s/30_setup_weaviate.mdx deleted file mode 100644 index 08ff99b89..000000000 --- a/docs/academy/deployment/k8s/30_setup_weaviate.mdx +++ /dev/null @@ -1,149 +0,0 @@ ---- -title: Run Weaviate on Kubernetes ---- - -Now that you have a Kubernetes cluster, you can deploy Weaviate on it. This section will show you how to deploy Weaviate on Kubernetes using Helm. - -## Helm chart - -As mentioned earlier, **Helm** is a package manager for Kubernetes. It uses a packaging format called **charts**. - -A **Helm chart** is a collection of files that describe a set of Kubernetes resources. It is the equivalent of a **package** in other package managers, such as pip or npm. - -Weaviate provides a [Helm chart](https://github.com/weaviate/weaviate-helm) that you can use to deploy Weaviate on Kubernetes. So, we will use this Helm chart to deploy Weaviate on your Kubernetes cluster. - -## Deployment - -First, add the [Weaviate Helm repository](https://weaviate.github.io/weaviate-helm) to your Helm installation. This will make the Weaviate Helm chart available to you. 
- -```bash -helm repo add weaviate https://weaviate.github.io/weaviate-helm -``` - -:::tip Tip: Update the Helm repositories - -If you have previously added the `weaviate` Helm repository, run this to update it to the latest version. - -```bash -helm repo update weaviate -``` - -You should periodically [update the Helm repositories](https://helm.sh/docs/helm/helm_repo_update/) to ensure that you have the latest information about available charts. - -::: - -Next, generate a configuration file (`values.yaml`) for the Weaviate Helm chart. - -```bash -helm show values weaviate/weaviate > values.yaml -``` - -This command fetches the default values for the Weaviate Helm chart and saves them to a file named `values.yaml`. You can now edit this file to customize the deployment configuration. - -There are many settings you can configure here. It may be beneficial to explore them in the `values.yaml` file, and review the relevant settings through the in-line comments, or in the [official Weaviate documentation](/weaviate/index.mdx) if you are curious. - -For now, let's configure a couple of important settings, before deploying Weaviate. - -## Configuration - -Before we go further, let's update the configuration file to: - -- Enable the gRPC service -- Enable Cohere integrations - -Open the `values.yaml` file in a text editor, and update the following sections: - -#### Enable the gRPC service - -:::info Default settings -From helm chart version `17.0.0`, the gRPC service is enabled by default. If using older versions, you must enable it to use gRPC. -::: - -Check that the service's `enabled` field is set to `true` and the `type` field is set to `LoadBalancer`. This will expose the gRPC service, which will allow you to access it from outside the Kubernetes cluster so you can make of the [gRPC API](https://weaviate.io/blog/grpc-performance-improvements). 
- -```yaml -grpcService: - enabled: true # ⬅️ Make sure this is set to true - name: weaviate-grpc - ports: - - name: grpc - protocol: TCP - port: 50051 - type: LoadBalancer # ⬅️ Set this to LoadBalancer (from NodePort) for this example -``` - -#### Enable Cohere integrations - -```yaml - text2vec-cohere: - - enabled: true # ⬅️ Make sure this is set to true - - # ... settings not shown ... - generative-cohere: - - enabled: true # ⬅️ Make sure this is set to true -``` - -Save the file after making these changes. You are now ready to deploy Weaviate on your Kubernetes cluster. - -## Run Weaviate - -Make sure your Kubernetes cluster is up and running (e.g. with `minikube start`), and you have configured `kubectl` to access it. - -Let's first create a namespace for Weaviate: - -```bash -kubectl create namespace weaviate -``` - -This will let us deploy Weaviate in a [separate namespace](https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/#when-to-use-multiple-namespaces). This is not mandatory, but we will do it here as it is good practice to allow better organization of resources. - -Then, run the following command: - -```bash -helm upgrade --install \ - "weaviate" \ - weaviate/weaviate \ - --namespace "weaviate" \ - --values ./values.yaml -``` - -This command will deploy Weaviate in the `weaviate` namespace of your Kubernetes cluster using the configuration specified in the `values.yaml` file. - -Now, if you run this: - -```bash -kubectl get pods -n weaviate -``` - -You should see the Weaviate pods running in the `weaviate` namespace. - -Note that it may take a little bit of time for the pods to start up. You can check the status of the pods by running the `kubectl get pods -n weaviate` command multiple times. 
- -```bash -❯ kubectl get pods -n weaviate -NAME READY STATUS RESTARTS AGE -weaviate-0 0/1 Pending 0 15s - -❯ kubectl get pods -n weaviate -NAME READY STATUS RESTARTS AGE -weaviate-0 1/1 Running 0 59s -``` - -Note how here, the `weaviate-0` pod went from `Pending` to `Running`. - -Congratulations! You have successfully deployed Weaviate on your local Kubernetes cluster. Next, let's confirm some basic interactions with Weaviate. - -:::tip Upgrading to `1.25` or higher - -To upgrade to `1.25` or higher from a pre-`1.25` version, you must delete the deployed `StatefulSet`, update the helm chart to version `17.0.0` or higher, and re-deploy Weaviate. - -See the [1.25 migration guide for Kubernetes](/deploy/migration/weaviate-1-25.md) for more details. -::: - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/deployment/k8s/50_access_weaviate.mdx b/docs/academy/deployment/k8s/50_access_weaviate.mdx deleted file mode 100644 index f9bcb1797..000000000 --- a/docs/academy/deployment/k8s/50_access_weaviate.mdx +++ /dev/null @@ -1,178 +0,0 @@ ---- -title: Access & Configure Weaviate ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!./_snippets/connect.py'; - -We have now spun up a Weaviate instance in our Kubernetes cluster. So what's next? In this section, we will look at how to access the Weaviate service, and how to configure it to suit your needs. - -## Access Weaviate - -Although our Weaviate service is happily running, it is not yet accessible from the outside world. This is because we have not exposed the service to the outside world. Let's do that now. - -### Expose the services - -Run the following command: - -```bash -minikube tunnel -``` - -You will recall that we configured the `weaviate` service as a `LoadBalancer` type in our Helm chart. 
So, when we run `minikube tunnel`, it will expose the service to the outside world - or at least, to our local machine. - -You will see a message like: - -```bash -✅ Tunnel successfully started - -📌 NOTE: Please do not close this terminal as this process must stay alive for the tunnel to be accessible ... - -❗ The service/ingress weaviate requires privileged ports to be exposed: [80] -🔑 sudo permission will be asked for it. -🏃 Starting tunnel for service weaviate. -🏃 Starting tunnel for service weaviate-grpc. -``` - -At this point you will be asked for your password. Enter it and the tunnel will be established. Note that closing the terminal or stopping the process will close the tunnel, making the services inaccessible again. - -:::info About `minikube tunnel` -[`minikube tunnel`](https://minikube.sigs.k8s.io/docs/handbook/accessing/#using-minikube-tunnel) creates a route between your local machine and the Minikube cluster. This allows services within your Minikube cluster that are exposed as LoadBalancer to be accessible on your local machine for development. - -We suggest you only run the tunnel command when you need to access the service from your local machine. When you are done, you can stop the tunnel by pressing `Ctrl+C`. -::: - -### Confirm access - -Now, if you run: - -```bash -kubectl get svc weaviate -n weaviate -``` - -You will see the external IP address of the service. For example: - -```bash -NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE -weaviate LoadBalancer 10.110.44.231 127.0.0.1 80:31230/TCP 61m -``` - -Navigate to `http://:80/v1` in your browser (typically `http://127.0.0.1:80/v1`). You should see the Weaviate REST root endpoint, with links to the various endpoints available in Weaviate. - -Now, you might also recall that we've opened up the gRPC service in our Kubernetes configuration. This service is available on port 50051. 
You can confirm this by running: - -```bash -kubectl get svc weaviate-grpc -n weaviate -``` - -Which will show: - -```bash -NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE -weaviate-grpc LoadBalancer 10.109.237.21 127.0.0.1 50051:32150/TCP 90m -``` - -
- - Another way to confirm access to the gRPC service - - -If you have netcat installed, you can also try: - -```bash -nc -zv 127.0.0.1 50051 -``` - -Which will show: - -```bash -Connection to 127.0.0.1 port 50051 [tcp/*] succeeded! -``` - -Note that not all systems have `nc` installed by default. It's okay if you don't have it - the `kubectl get svc` command output is sufficient to confirm access to the gRPC service. - -
- -### Connecting to your cluster - -Depending on your external IP address, you may need to use different strategies to connect to your cluster. - -If the external IP for both services is `127.0.0.1`, connect to Weaviate with: - - - -But the `weaviate` and `weaviate-grpc` services have different external IP addresses, you can connect to Weaviate with: - - - -Where parameters are `WEAVIATE_SVC_EXTERNAL_IP` and `GRPC_SVC_EXTERNAL_IP` are your external IP addresses for the `weaviate` and `weaviate-grpc` services respectively. - -## Configure Weaviate - -One of the best things about Kubernetes is that you can easily configure your services. Weaviate is no exception. You can configure Weaviate by updating the `values.yaml` file in the `weaviate` directory. - -For example, you can enable additional modules such as `text2vec-openai` and `generative-openai` modules by setting them to `true`: - -```yaml - text2vec-openai: - - enabled: true # ⬅️ Set to true - - # ... other settings not shown ... - - generative-openai: - - enabled: true # ⬅️ Set to true -``` - -Or we can set resource limits for the Weaviate pods. Let's set them to utilize 30-50% of a CPU, and 150-300Mi of memory: - -:::note Where to set resource limits -The `values.yaml` file contains multiple instances of `requests` and `limits` for different services, such as for local vectorization models. Make sure to set the `requests` and `limits` for the scale replicas of Weaviate towards the top of the file with no indentation. -::: - -```yaml -# Scale replicas of Weaviate. ... -requests: - cpu: '300m' - memory: '150Mi' -limits: - cpu: '500m' - memory: '300Mi' -``` - -To apply these changes, save the `values.yaml` file and run: - -```bash -helm upgrade --install \ - "weaviate" \ - weaviate/weaviate \ - --namespace "weaviate" \ - --values ./values.yaml -``` - -You will note that this is the same command we used to deploy Weaviate. This command will simply update the Weaviate deployment with the new configuration. 
- -There are a whole host of other configurations you can set in the `values.yaml` file, such as modifying authentication, authorization, backups, monitoring, resource allocation and so on. Please refer to the in-line documentation in the `values.yaml` file, and the Weaviate documentation for more information. - -Before we go, however, let's take a look at expanding our Weaviate deployment to include more nodes. This can help us to scale our Weaviate deployment to handle more traffic or growth, or to provide redundancy in case of node failure. - -We'll take a look at both in the next section. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/deployment/k8s/70_multi_node.mdx b/docs/academy/deployment/k8s/70_multi_node.mdx deleted file mode 100644 index 9c36088fb..000000000 --- a/docs/academy/deployment/k8s/70_multi_node.mdx +++ /dev/null @@ -1,173 +0,0 @@ ---- -title: Multi-node setup ---- - -Recall that we have deployed Weaviate on a single node in our Kubernetes cluster with Minikube. Now, let's scale it up to a multi-node setup. - -To do this, we will need a Kubernetes cluster with multiple nodes. Then, we can configure Weaviate to make use of these nodes. - -## Add nodes to your Weaviate cluster - -We'll stop the current Weaviate deployment and then deploy a new one with [multiple nodes with Minikube](https://minikube.sigs.k8s.io/docs/tutorials/multi_node/). - -:::note Minikube limitations -Keep in mind that this runs multiple containers on the same host device for learning. In a production environment, you would typically have a managed Kubernetes cluster with multiple, isolated, physical or virtual nodes. -::: - -### Stop the current Weaviate deployment - -First, stop the tunnel by pressing `Ctrl+C` at the terminal where you ran `minikube tunnel`. 
- -Then, stop Minikube: - -```bash -minikube stop -``` - -Since the minikube cluster exists, we will have to delete it before we can start a new one with multiple nodes: - -:::note Alternatively... -You can also add nodes to an existing Minikube cluster. But we'll delete the current one and start a new one for simplicity. -::: - -```bash -minikube delete -``` - -### Start a multi-node Kubernetes cluster - -To start a multi-node Kubernetes cluster with Minikube, run: - -```bash -minikube start --nodes -``` - -Replace `` with the number of nodes you want in your cluster. In our case, let's spin up a 6-node cluster: - -```bash -minikube start --nodes 6 -``` - -Once that's finished, you can check the status of your nodes by running: - -```bash -kubectl get nodes -``` - -And you should see output like: - -```bash -NAME STATUS ROLES AGE VERSION -minikube Ready control-plane 78s v1.28.3 -minikube-m02 Ready 60s v1.28.3 -minikube-m03 Ready 50s v1.28.3 -minikube-m04 Ready 39s v1.28.3 -minikube-m05 Ready 29s v1.28.3 -minikube-m06 Ready 18s v1.28.3 -``` - -Now let's update the Weaviate deployment to use these nodes. To do this, we'll update the `replicas` field in the `values.yaml` file to match the number of nodes in the cluster. Along with our reduced resource requests and limits, this section should look as follows: - -```yaml -replicas: 6 -updateStrategy: - type: RollingUpdate -resources: {} -requests: - cpu: '300m' - memory: '150Mi' -limits: - cpu: '500m' - memory: '300Mi' -``` - -If we restart weaviate by running: - -```bash -helm upgrade --install \ - "weaviate" \ - weaviate/weaviate \ - --namespace "weaviate" \ - --values ./values.yaml -``` - -Then, you should see Weaviate pods come up on each of the nodes in your cluster. 
You can check this by running: - -```bash -kubectl get pods -n weaviate -``` - -And you should see output like: - -```bash -NAME READY STATUS RESTARTS AGE -weaviate-0 1/1 Running 0 31s -weaviate-1 0/1 PodInitializing 0 9s -``` - -As pods are created one by one. - -Eventually, you will see something like: - -```bash -NAME READY STATUS RESTARTS AGE -weaviate-0 1/1 Running 0 113s -weaviate-1 1/1 Running 0 91s -weaviate-2 1/1 Running 0 79s -weaviate-3 1/1 Running 0 57s -weaviate-4 1/1 Running 0 35s -weaviate-5 1/1 Running 0 13s -``` - -Open a new terminal and run `minikube tunnel` to expose the Weaviate service to your local machine as before. - -## Utilizing a multi-node setup - -There are two main ways to utilize a multi-node setup, namely **replication** and **horizontal scaling**. Let's take a look at each of these. - -### Replication - -Replication is the process of creating multiple copies of the same data across multiple nodes. - -This is useful for ensuring data availability and fault tolerance. In a multi-node setup, you can replicate your Weaviate data across multiple nodes to ensure that if one node fails, the data is still available on other nodes. A replicated setup can also help distribute the load across multiple nodes to improve performance, and allow zero-downtime upgrades and maintenance. - -Weaviate can handle replication by setting a replication factor in the database collection definition. This tells Weaviate how many copies of the data to keep across the nodes in the cluster. - -The code examples show how to configure replication. Keep in mind that the specified port may be different for you (e.g. `80`) than what is shown in the code snippet. - -import SchemaReplication from '/_includes/code/schema.things.create.replication.mdx'; - - - -### Database sharding - -On the other hand, sharding can be used to horizontally scale Weaviate. 
Sharding is the process of splitting a database into smaller parts, called shards, and distributing these shards across multiple nodes. - -So a database that holds 2 million records do not need to store all 2 million records on a single node. Instead, it can split the records into smaller parts and store them on different nodes. This allows the database to scale horizontally by adding more nodes to the cluster. - -### Sharding with replication - -You can use both sharding and replication together to horizontally scale and ensure fault tolerance in your Weaviate setup. - -In our example with 6 nodes, we could have a replication factor of 3 and shard the data across 2 nodes each. This way, we have 3 copies of the data spread across 6 nodes, ensuring fault tolerance and high availability. - -For a production setup, this is a common approach to ensure that your Weaviate setup can handle high loads and remain available even if a node fails. Additionally, you can add more nodes to the cluster as needed to scale your Weaviate setup horizontally, ensuring that it can handle more data and more requests. - -## Clean-up - -That's it for this guide. You've successfully deployed Weaviate on a multi-node Kubernetes cluster and learned about replication and sharding. - -If you would like, you can access Weaviate as described before, and work with it. We include [further resources on the next page](./90_next_steps.mdx) for you to explore. 
- -When you are finished, you can stop the tunnel by pressing `Ctrl+C` in the terminal where you ran `minikube tunnel`, Then, stop Minikube and delete the cluster: - -```bash -minikube stop -minikube delete -``` - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/deployment/k8s/90_next_steps.mdx b/docs/academy/deployment/k8s/90_next_steps.mdx deleted file mode 100644 index aaa4afa33..000000000 --- a/docs/academy/deployment/k8s/90_next_steps.mdx +++ /dev/null @@ -1,48 +0,0 @@ ---- -title: Next steps ---- - -Congratulations. You have successfully deployed Weaviate on your Kubernetes cluster, and scaled it to a multi-node setup. 🎉 - -You can now explore the following resources to learn more about Weaviate and how to use it: - -We have the following beginner courses available for Python: -- [101T Work with: Text data](../../py/starter_text_data/index.md) -- [101V Work with: Your own vectors](../../py/starter_custom_vectors/index.md) -- [101T Work with: Multimodal data](../../py/starter_multimodal_data/index.md) - -And for TypeScript/JavaScript: -- [Introduction to Weaviate with TypeScript](/academy/js/starter_text_data) - -What you could explore, then, is how to adapt these examples to a multi-node case with replication and/or sharding. - -### Related documentation - -If you are interested in the architecture behind replication and sharding, please check out the following resources: -- [Replication](/weaviate/concepts/replication-architecture/index.md) -- [Horizontal scaling](/weaviate/concepts/cluster.md) - -Then, check out the section on [resource planning](/weaviate/concepts/resources.md) to estimate how much resources each node of a particular size might be able to handle, and how to plan your cluster accordingly. - -If you have replication configured, you can apply [tunable consistency](/deploy/configuration/replication.md) concepts to read/write operations as well as queries. 
- -### Further reading - -- How-to guides - - - The [How-to: Manage collections](/weaviate/manage-collections/index.mdx) and [How-to: Manage objects](/weaviate/manage-objects/index.mdx) guides show how to perform data operations (i.e. create, read, update, delete collections and objects within them).. - - [How-to: search](/weaviate/search/index.mdx): Code examples for all types of search operations. - - [How-to: configure Weaviate](/weaviate/configuration/index.mdx): Guides for configuring Weaviate, such as [PQ](/weaviate/configuration/compression/pq-compression.md) and [BQ](/weaviate/configuration/compression/bq-compression.md) compression, [backups](/deploy/configuration/backups.md) and [replication](/deploy/configuration/replication.md). -- [Concepts guides](/weaviate/concepts/index.md): Guides for understanding Weaviate's architecture and concepts. -- [API reference](/weaviate/api/index.mdx): Detailed information about Weaviate's APIs. - -import CTASocials from '../../py/_snippets/cta_socials.mdx'; - - - -See you soon! 
👋 - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/deployment/k8s/_snippets/connect.py b/docs/academy/deployment/k8s/_snippets/connect.py deleted file mode 100644 index 5dcf82870..000000000 --- a/docs/academy/deployment/k8s/_snippets/connect.py +++ /dev/null @@ -1,28 +0,0 @@ -# START BasicConnect -import weaviate - -client = weaviate.connect_to_local( - port=80, # The default REST port is 8080 - # grpc_port=50051 # Not needed, as the default gRPC port is 50051 -) -# END BasicConnect - -client.close() - -WEAVIATE_SVC_EXTERNAL_IP = "127.0.0.1" -GRPC_SVC_EXTERNAL_IP = "127.0.0.1" - -# START CustomConnect -import weaviate - -client = weaviate.connect_to_custom( - http_host=WEAVIATE_SVC_EXTERNAL_IP, # The external IP of the weaviate service - http_port=80, # The default REST port is 8080 - http_secure=False, # Whether to use https (secure) for the HTTP API connection - grpc_host=GRPC_SVC_EXTERNAL_IP, # The external IP of the weaviate-grpc service - grpc_port=50051, # The default gRPC port is 50051 - grpc_secure=False # Set to True if the gRPC connection is secure -) -# END CustomConnect - -client.close() diff --git a/docs/academy/deployment/k8s/index.md b/docs/academy/deployment/k8s/index.md deleted file mode 100644 index 4c023749a..000000000 --- a/docs/academy/deployment/k8s/index.md +++ /dev/null @@ -1,22 +0,0 @@ ---- -title: "Run Weaviate on Kubernetes" -description: "Learn how to deploy Weaviate with Kubernetes, covering setup, management, and scalability tips." -sidebar_position: 100 ---- - - - - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; -import CourseUnits from '/src/components/Academy/courseUnits.jsx'; -import { courseData } from '/src/components/Academy/courseData.js' - -## Unit overview - -In this short, project-based unit, you will learn how to deploy Weaviate on Kubernetes, a popular container orchestration platform. 
- -You will learn how to set up a local Kubernetes cluster and deploy a Weaviate instance on Kubernetes. You will also learn how to configure the Weaviate instance on Kubernetes. - -## Learning objectives - - diff --git a/docs/academy/index.mdx b/docs/academy/index.mdx deleted file mode 100644 index d031156a0..000000000 --- a/docs/academy/index.mdx +++ /dev/null @@ -1,20 +0,0 @@ ---- -title: Academy - Home -sidebar_position: 0 ---- - -import Courses from '/src/components/Academy/courses.jsx'; -import Units from '/src/components/Academy/units.jsx'; -import { courseData } from '/src/components/Academy/courseData.js' - -We've built these courses to help you build amazing things with Weaviate, faster. - ------ -## Courses ------ - - - -import CustomScriptLoader from '/src/components/scriptSwitch'; - - diff --git a/docs/academy/js/10_set_up_typescript.mdx b/docs/academy/js/10_set_up_typescript.mdx deleted file mode 100644 index 961bfe25b..000000000 --- a/docs/academy/js/10_set_up_typescript.mdx +++ /dev/null @@ -1,76 +0,0 @@ ---- -title: Set up Javascript/TypeScript for Weaviate -sidebar_position: 10 ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import WeaviateTypescriptImgUrl from './img/js-ts-weaviate.png'; - -Follow this short guide to make sure that you are set up to use Weaviate with the Typescript/Javascript client. - - -Image alt - -## Install Node.js - -### Is Node.js installed? - -Open a terminal window (e.g. bash, zsh, Windows PowerShell, Windows Terminal), and run: - -```shell -node --version # or node -v -``` - -If you have Node.js installed, you should see a response like `v22.3.0`. The minimum version of Node.js supported by the Weaviate Typescript/Javascript library is `v18`. - - -### Install Node.js - -To install, follow the instructions for your system on [nodejs.org](https://nodejs.org/en/download/package-manager). 
-
-Once you have Node.js installed, check the version again to confirm that you have a recommended version installed.
-
-:::tip Advanced option: `nvm`
-Another good way to install Node.js is to install `nvm`. This will allow you to manage multiple versions of Node.js on your system. You can find instructions on how to install `nvm` [here](https://github.com/nvm-sh/nvm?tab=readme-ov-file#installing-and-updating).
-:::
-
-
-### (Optional) Set up Typescript
-
-To install, follow the instructions for your system on [typescriptlang.org](https://www.typescriptlang.org/download/). Once installed, you can find instructions on how to configure Typescript to work with the Weaviate client documented [here](../../weaviate/client-libraries/typescript/index.mdx).
-
-Now Typescript is ready to use with the Weaviate client.
-
-## Install the Weaviate client
-
-Now, you can install the [Weaviate client library](../../weaviate/client-libraries/index.mdx), which will make it much easier to interact with Weaviate using Typescript.
-
-In a new folder for your project, install the Weaviate client with:
-
-```shell
-npm install weaviate-client
-```
-
-### Confirm the installation
-
-To confirm that the Weaviate client is installed, run the following in your terminal:
-
-```shell
-npm view weaviate-client
-```
-
-You should see an output like:
-
-```text
-3.1.4
-```
-
-Congratulations, you are now set up to use Weaviate with the Weaviate TypeScript/JavaScript client library!
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/_category_.json b/docs/academy/js/_category_.json deleted file mode 100644 index 8e708216c..000000000 --- a/docs/academy/js/_category_.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "label": "JavaScript/TypeScript", - "position": 20 -} \ No newline at end of file diff --git a/docs/academy/js/_snippets/cta_socials.mdx b/docs/academy/js/_snippets/cta_socials.mdx deleted file mode 100644 index 76c75bbb0..000000000 --- a/docs/academy/js/_snippets/cta_socials.mdx +++ /dev/null @@ -1,3 +0,0 @@ -## Stay in touch! - -We are constantly improving our documentation, so please keep an eye out for new resources and updates, by signing up for our [newsletter](https://newsletter.weaviate.io/) or following us on social media ([Twitter](https://x.com/weaviate_io), [LinkedIn](https://www.linkedin.com/company/weaviate-io/)). \ No newline at end of file diff --git a/docs/academy/js/_snippets/intro_next_steps.mdx b/docs/academy/js/_snippets/intro_next_steps.mdx deleted file mode 100644 index dd7ab6c1e..000000000 --- a/docs/academy/js/_snippets/intro_next_steps.mdx +++ /dev/null @@ -1,32 +0,0 @@ -Congratulations! You have completed this introductory course on Weaviate. - -Now that you have completed this course, you may be interested in exploring our documentation or the Academy for more advanced courses. - -Some of our more popular resources include: - -### Documentation - -- How-to guides - - The [How-to: Manage collections](/weaviate/manage-collections/index.mdx) and [How-to: Manage objects](/weaviate/manage-objects/index.mdx) guides show how to perform data operations (i.e. create, read, update, delete collections and objects within them).. - - [How-to: search](/weaviate/search/index.mdx): Code examples for all types of search operations. 
-  - [How-to: configure Weaviate](/weaviate/configuration/index.mdx): Guides for configuring Weaviate, such as [PQ](/weaviate/configuration/compression/pq-compression.md) and [BQ](/weaviate/configuration/compression/bq-compression.md) compression, [backups](/deploy/configuration/backups.md) and [replication](/deploy/configuration/replication.md).
-- [Concepts guides](/weaviate/concepts/index.md): Guides for understanding Weaviate's architecture and concepts.
-- [API reference](/weaviate/api/index.mdx): Detailed information about Weaviate's APIs.
-
-### Academy
-
-- [Named vectors](/academy/py/named_vectors): Learn how to use named vectors to flexibly represent data in Weaviate.
-- [Which search is right for me?](/academy/py/standalone/which_search): Learn about the different types of searches in Weaviate and when to use them.
-- [Chunking](/academy/py/standalone/chunking): Learn how to use chunking to optimize your search for longer documents.
-
-import CTASocials from './cta_socials.mdx';
-
-
-
-See you soon! 👋
-
-## Questions and feedback
-
-import DocsFeedback from '/_includes/docs-feedback.mdx';
-
-
diff --git a/docs/academy/js/_snippets/intro_next_steps_js.mdx b/docs/academy/js/_snippets/intro_next_steps_js.mdx
deleted file mode 100644
index 59f134651..000000000
--- a/docs/academy/js/_snippets/intro_next_steps_js.mdx
+++ /dev/null
@@ -1,32 +0,0 @@
-Congratulations! You have completed this introductory course on Weaviate.
-
-Now that you have completed this course, you may be interested in exploring our documentation or the Academy for more advanced courses.
-
-Some of our more popular resources include:
-
-### Documentation
-
-- How-to guides
-  - The [How-to: Manage collections](/weaviate/manage-collections/index.mdx) and [How-to: Manage objects](/weaviate/manage-objects/index.mdx) guides show how to perform data operations (i.e. create, read, update, delete collections and objects within them).
- - [How-to: search](/weaviate/search/index.mdx): Code examples for all types of search operations. - - [How-to: configure Weaviate](/weaviate/configuration/index.mdx): Guides for configuring Weaviate, such as [PQ](/weaviate/configuration/compression/pq-compression.md) and [BQ](/weaviate/configuration/compression/bq-compression.md) compression, [backups](/deploy/configuration/backups.md) and [replication](/deploy/configuration/replication.md). -- [Concepts guides](/weaviate/concepts/index.md): Guides for understanding Weaviate's architecture and concepts. -- [API reference](/weaviate/api/index.mdx): Detailed information about Weaviate's APIs. - -### Academy - -- [Working with Text](/academy/js/starter_text_data): Learn how to use work with text data in Weaviate. -- [Which search is right for me?](/academy/js/standalone/which-search): Learn about the different types of searches in Weaviate and when to use them. - - -import CTASocials from './cta_socials.mdx'; - - - -See you soon! 👋 - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/img/js-ts-weaviate.png b/docs/academy/js/img/js-ts-weaviate.png deleted file mode 100644 index 164f3d8e5..000000000 Binary files a/docs/academy/js/img/js-ts-weaviate.png and /dev/null differ diff --git a/docs/academy/js/standalone/_want-stack/_snippets/index.ts b/docs/academy/js/standalone/_want-stack/_snippets/index.ts deleted file mode 100644 index e69de29bb..000000000 diff --git a/docs/academy/js/standalone/_want-stack/index.md b/docs/academy/js/standalone/_want-stack/index.md deleted file mode 100644 index 35bc08112..000000000 --- a/docs/academy/js/standalone/_want-stack/index.md +++ /dev/null @@ -1,62 +0,0 @@ ---- -title: What is the WANT stack -sidebar_position: 10 ---- - -## What is the WANT - -### What is WANT - -Weaviate, AI, Next.js and Typescript. - -The want stack is collection of technologies that simplify the process of build AI Native Applications. 
- - -#### Weaviate - -Weaviate is a vector database, it gives you these benefits -- vector search -- dynamic generative model selection (you don't get stuck) - -#### AI - -- this ties heavily into everything else, AI is at the center of AI native, leveraging the flexibility of weaviate and teh vastness of its integrations, you can bring AI to your application from embeddings, to generative models to rerankers or recommenders. Without any tie in. The goal is to give you the most flexibility without the burden on choosing - -#### Next.js (or Nuxt.js) - -This brings modern web development, - - -#### Typescript - -Typescript.. add content - - - -### Building WANT applications - -- Resources -- Starter templates -- Learning - - -[Embed Youtube Course] - -A vector search is also versatile. It can be used to search multiple data modalities (e.g. text, images, audio, etc.), and across multiple languages. - - -## Learning objectives - -import LearningGoalsExp from '/src/components/Academy/learningGoalsExp.mdx'; - - - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/standalone/client-server/10_why-client-server.mdx b/docs/academy/js/standalone/client-server/10_why-client-server.mdx deleted file mode 100644 index d87953f8e..000000000 --- a/docs/academy/js/standalone/client-server/10_why-client-server.mdx +++ /dev/null @@ -1,37 +0,0 @@ ---- -title: What is the client-server application ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import WeaviateTypescriptImgUrl from '/docs/academy/js/standalone/client-server/_img/architecture.jpg'; - - - -### What is a client-server application - -A client-server application is one that runs on a client device while accessing information from a remote server. 
-
-In the context of building applications with Weaviate, this means having a server as an intermediary to handle all interactions with your database as opposed to interacting with your database directly from your client application.
-
-Image alt
-
-This section aims to guide you through how to do that as you build applications with the Weaviate Typescript v3 client: [weaviate-client](https://www.npmjs.com/package/weaviate-client).
-
-
-
-### Using the weaviate-client in a client-server application
-
-The v3 client uses gRPC to connect to your Weaviate instance. The client supports Node.js server-based development. It does not support browser-based web client development.
-
-Install the client by following [these instructions](../../../../weaviate/client-libraries/typescript/index.mdx#installation). A big benefit of using the new v3 client is the introduction of the gRPC protocol — a faster, more reliable platform to handle interactions with the database at scale. Unfortunately, gRPC does not support browser-based client development.
-
-Besides the requirements of running the [weaviate-client](../../../../weaviate/client-libraries/typescript/index.mdx), the client-server architecture is reliably more secure than interactions directly from the client.
-
-Having a client-server approach means you can optimize your use of Weaviate by implementing load balancing, user and session management, middleware and various other optimizations.
-
-In the next sections we'll look at how to build client-server applications with:
-- [Using backend web frameworks](./20_building-client-server.mdx) -- [Using fullstack web frameworks](./30_fullstack.mdx) - diff --git a/docs/academy/js/standalone/client-server/20_building-client-server.mdx b/docs/academy/js/standalone/client-server/20_building-client-server.mdx deleted file mode 100644 index 3e6917ed1..000000000 --- a/docs/academy/js/standalone/client-server/20_building-client-server.mdx +++ /dev/null @@ -1,133 +0,0 @@ ---- -title: Using Backend Web frameworks ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!./_snippets/20_backend.js'; -import ClientCode from '!!raw-loader!./_snippets/index.html'; -import WeaviateTypescriptImgUrl from '/docs/academy/js/standalone/client-server/_img/backend.jpg'; - - - -This approach involves having two separate tools. One to build your server application; ideally a backend framework and another to build your client application. For this example, we will be using [Express.js](https://expressjs.com/en/starter/hello-world.html) to build a backend server, and [Thunder Client](https://www.thunderclient.com/) to act as a client and make API calls to our backend server. - -Image alt - - -## Building a server - -The server will have a single route that accepts a `searchTerm` as a query parameter. - -### 1. Initialize a Node.js application - -We will use Express to build our server, in a new directory, run the following command to initialize a new project with Node.js - -```bash -npm init -``` -### 2. Install project dependencies - -With our project initialized, install `dotenv` to manage environment variables, `express` to build our server and the `weaviate-client` to manage communication with our Weaviate database. - -```bash -npm install express dotenv weaviate-client -``` - - -### 3. 
Setup your Weaviate database - -We'll start by creating a free sandbox account on [Weaviate Cloud](https://console.weaviate.cloud/). Follow [this guide](/cloud/manage-clusters/connect) if you have trouble setting up a sandbox project. - - -You will need your Weaviate cluster URL and API key. If you don't already have one, create a new Cohere [API key](https://dashboard.cohere.com/api-keys), we use Cohere as our [embedding model](../using-ml-models/10_embedding.mdx). When done, add all three to your `.env` file. - - - - - - -#### 3.5 Add data to Weaviate - -Follow our recipe on [loading data](https://github.com/weaviate/recipes-ts/blob/main/similarity-search/cohere/load.ts) into Weaviate to import data to your Weaviate database. - -### 4. Connecting to Weaviate - -In `config/weaviate.js`, paste the following code. - - - - - -The code above helps us create a connection to our Weaviate instance hosted on Weaviate Cloud. - -### 5. Create a Search route - -In your project root, create a file called `app.js` and paste the following code in it. - - - - - -With this we can run searches on the `/search` route. We use `nearText()` to run our semantic search. - -#### 6. Run your server -In your terminal run the following command to start your server. - -```bash -node app.js -``` - -Your server should be running on `localhost:3005`. - - - -## Building a client Application - -With our server built, we can now make a call from a client application. We'll create a basic client application with HTML and JavaScript. - -Alternatively, in [Thunder Client](https://www.thunderclient.com/), you can make a call to `http://localhost:3005/search?searchTerm=countries in asia` to query your server. - - -### 1. Create a client application - -In your root folder, create a file called `index.html` and paste the following code in it. - - - - - - -### 2. 
Run your client application
-
-In your root folder, run the following in your terminal
-
-```bash
-npx http-server
-```
-
-
-This client application makes a call to the express server you built, displaying results.
-
-
diff --git a/docs/academy/js/standalone/client-server/30_fullstack.mdx b/docs/academy/js/standalone/client-server/30_fullstack.mdx
deleted file mode 100644
index ef99b4785..000000000
--- a/docs/academy/js/standalone/client-server/30_fullstack.mdx
+++ /dev/null
@@ -1,135 +0,0 @@
----
-title: Using Fullstack Web frameworks
----
-
-import Tabs from '@theme/Tabs';
-import TabItem from '@theme/TabItem';
-import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock';
-import WeaviateTypescriptImgUrl from '/docs/academy/js/standalone/client-server/_img/fullstack.jpg';
-import TSCode from '!!raw-loader!./_snippets/20_backend.js';
-
-This approach involves having a single tool to build both your server application and client application. In modern web development terms, such a tool is called a fullstack web framework. For this example, we will be using [Next.js](https://nextjs.org/).
-
-Image alt
-
-## Building with Next.js
-
-### 1. Create a Next.js application
-
-To create a new application with Next.js, run the following command in your terminal.
-
-```bash
-npx create-next-app --ts --app
-```
-### 2. Install project dependencies
-
-With our project initialized, install the `weaviate-client` to manage communication with our Weaviate database.
-
-```bash
-npm install weaviate-client
-```
-
-### 3. Setup your Weaviate database
-
-We'll start by creating a free sandbox account on [Weaviate Cloud](https://console.weaviate.cloud/). Follow [this guide](/cloud/manage-clusters/connect) if you have trouble setting up a sandbox project.
-
-You will need your Weaviate cluster URL and API key.
If you don't already have one, create a new Cohere [API key](https://dashboard.cohere.com/api-keys), we use Cohere as our [embedding model](../using-ml-models/10_embedding.mdx). When done, add all three to your `.env` file. - - - - - -#### 3.5 Add data to Weaviate - -Follow our recipe on [loading data](https://github.com/weaviate/recipes-ts/blob/main/similarity-search/cohere/load.ts) into Weaviate to import data to your Weaviate database. - -### 4. Initialize Weaviate - -Create a file in `utils/weaviate.ts` and paste the following code in it. The code helps us create a connection to our Weaviate instance hosted on Weaviate Cloud. - - -```ts -import weaviate from "weaviate-client"; - -const client = await weaviate.connectToWeaviateCloud(process.env.WEAVIATE_URL as string,{ - authCredentials: new weaviate.ApiKey(process.env.WEAVIATE_API_KEY as string), - headers: { - 'X-Cohere-Api-Key': process.env.COHERE_API_KEY as string - } - }, -); -``` - -### 4. Create a Search Server Action - -Next, in `./utils/action.ts`, paste the following code. With this we can run semantic searches with `nearText()` by calling the function `vectorSearch()` in other parts of our application. - - -```ts -"use server"; - -import { connectToDB } from './weaviate.ts' - -export async function vectorSearch(searchTerm: string) { - - const myCollection = client.collections.use('MyCollectionName'); - - const response = await myCollection.query.nearText(searchTerm, { - limit: 8, - returnMetadata: ['distance'], - }) - - return response - } -``` - -### 5. Fetch data from your server in your client application. -In the `./app` folder, paste the following code in `page.tsx`. We run a search on our client and display the results on the webpage. - -```tsx - -import { vectorSearch } from '@/utils/action.ts'; - -export default async function Home() { - const search = "water sports i can win a medal in" - const data = await vectorSearch(search); - - return ( - - -

- { data } -

- - - ) - -} - -``` -#### 6. Run your Fullstack App - -In your terminal, run the following command to start your application. - - -```bash -npm run dev -``` - -Your application should be running on `localhost:3000`. - -## Other frameworks - - -Although only detailing Next.js in guide, you can build with Weaviate using a number of fullstack frameworks including but not limited to [Nuxt](https://nuxt.com/), [Solid](https://www.solidjs.com/) and [Angular](https://angular.dev/) - -We have a list of [starter](https://github.com/topics/weaviate-starter) applications you can play around with as well. - - - - diff --git a/docs/academy/js/standalone/client-server/_img/architecture.jpg b/docs/academy/js/standalone/client-server/_img/architecture.jpg deleted file mode 100644 index 9a8ddb7cf..000000000 Binary files a/docs/academy/js/standalone/client-server/_img/architecture.jpg and /dev/null differ diff --git a/docs/academy/js/standalone/client-server/_img/backend.jpg b/docs/academy/js/standalone/client-server/_img/backend.jpg deleted file mode 100644 index c5fe962ce..000000000 Binary files a/docs/academy/js/standalone/client-server/_img/backend.jpg and /dev/null differ diff --git a/docs/academy/js/standalone/client-server/_img/fullstack.jpg b/docs/academy/js/standalone/client-server/_img/fullstack.jpg deleted file mode 100644 index 417f0d518..000000000 Binary files a/docs/academy/js/standalone/client-server/_img/fullstack.jpg and /dev/null differ diff --git a/docs/academy/js/standalone/client-server/_snippets/20_backend.js b/docs/academy/js/standalone/client-server/_snippets/20_backend.js deleted file mode 100644 index fdac27ebb..000000000 --- a/docs/academy/js/standalone/client-server/_snippets/20_backend.js +++ /dev/null @@ -1,62 +0,0 @@ -// START weaviate.js -import weaviate from 'weaviate-client' -import 'dotenv/config'; - -export const connectToDB = async () => { - try { - const client = await weaviate.connectToWeaviateCloud(process.env.WEAVIATE_URL,{ - authCredentials: 
new weaviate.ApiKey(process.env.WEAVIATE_API_KEY), - headers: { - 'X-Cohere-Api-Key': process.env.COHERE_API_KEY || '', - } - } - ) - console.log(`We are connected! ${await client.isReady()}`); - return client - } catch (error) { - console.error(`Error: ${error.message}`); - process.exit(1); - } -}; - -// END weaviate.js -const dotEnv = ``` -// START .env -COHERE_API_KEY= -WEAVIATE_URL= -WEAVIATE_API_KEY= -// END .env -``` - -// START app.js -import express from 'express'; -import { connectToDB } from './config/weaviate.js'; - -const app = express(); -const port = 3005 - -const client = await connectToDB(); - -app.get('/', async function(req, res, next) { - var searchTerm = req.query.searchTerm; - - const wikipedia = client.collections.use("Wikipedia") - - try { - const response = await wikipedia.query.nearText(searchTerm, { - limit: 3 - }) - - res.send(response.objects) - } catch (error) { - console.error(`Error: ${error.message}`); - } - }) - -app.listen(port, () => { - console.log(`App listening on port ${port}`) -}) - - - -// END app.js diff --git a/docs/academy/js/standalone/client-server/_snippets/30_fullstack.js b/docs/academy/js/standalone/client-server/_snippets/30_fullstack.js deleted file mode 100644 index 2458d472a..000000000 --- a/docs/academy/js/standalone/client-server/_snippets/30_fullstack.js +++ /dev/null @@ -1,67 +0,0 @@ -// START weaviate.js - -import weaviate from 'weaviate-client' -import 'dotenv/config'; - -export const connectToDB = async () => { - try { - const client = await weaviate.connectToWeaviateCloud(process.env.WEAVIATE_URL,{ - authCredentials: new weaviate.ApiKey(process.env.WEAVIATE_API_KEY), - headers: { - 'X-Cohere-Api-Key': process.env.COHERE_API_KEY || '', - } - } - ) - console.log(`We are connected! 
${await client.isReady()}`); - return client - } catch (error) { - console.error(`Error: ${error.message}`); - process.exit(1); - } -}; - -// END weaviate.js - -// .env -` -COHERE_API_KEY= -WEAVIATE_URL= -WEAVIATE_API_KEY= -` -// END .env - - -// START app.js -import express from 'express'; -import { connectToDB } from './config/weaviate.js'; - -const app = express(); -const port = 3005 - -const client = await connectToDB(); - - - -app.get('/', async function(req, res, next) { - var searchTerm = req.query.searchTerm; - - const wikipedia = client.collections.use("Wikipedia") - - try { - const response = await wikipedia.query.nearText(searchTerm, { - limit: 5 - }) - - res.send(response.objects) - } catch (error) { - console.error(`Error: ${error.message}`); - } - }) - -app.listen(port, () => { - console.log(`App listening on port ${port}`) -}) - - - -// END app.js diff --git a/docs/academy/js/standalone/client-server/_snippets/index.html b/docs/academy/js/standalone/client-server/_snippets/index.html deleted file mode 100644 index 0819a2dfe..000000000 --- a/docs/academy/js/standalone/client-server/_snippets/index.html +++ /dev/null @@ -1,83 +0,0 @@ - - - - - Weaviate Search Client - - - -
-

Semantic Search Results

- -
Loading...
-
-
-
- - - - - \ No newline at end of file diff --git a/docs/academy/js/standalone/client-server/index.md b/docs/academy/js/standalone/client-server/index.md deleted file mode 100644 index 55bc75cc1..000000000 --- a/docs/academy/js/standalone/client-server/index.md +++ /dev/null @@ -1,44 +0,0 @@ ---- -title: Building client-server Applications ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; - - -## Overview - -When building web applications in JavaScript with Weaviate using the [weaviate-client](https://www.npmjs.com/package/weaviate-client), it is recommended that you employ the client-server architecture. - -This may vary depending what tools you are using to build your web application. - -Fullstack frameworks like Next.js have support for server side development and API creation to communicate with Weaviate. This would happen via REST calls or for Next.js specifically, Server functions. This approach means coupling your client and server applications. - -Backend web frameworks like Express let you create an API to communicate with Weaviate. This API can be consumed via REST calls from your client application. This approach means completely decoupling your client and server applications. - - -### Prerequisites - -- A Node.js environment with `weaviate-client` installed. -- Familiarity with Weaviate's search capabilities. -- Some experience building Modern Web Applications with JavaScript. -- Intermediate coding proficiency (e.g. JavaScript). 
- -## Learning objectives - -import LearningGoalsExp from '/src/components/Academy/learningGoalsExp.mdx'; - - - - - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - \ No newline at end of file diff --git a/docs/academy/js/standalone/index.md b/docs/academy/js/standalone/index.md deleted file mode 100644 index 13c1a747d..000000000 --- a/docs/academy/js/standalone/index.md +++ /dev/null @@ -1,16 +0,0 @@ ---- -title: Short units -sidebar_position: 900 ---- - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; -import CourseUnits from '/src/components/Academy/courseUnits.jsx'; -import { courseData } from '/src/components/Academy/courseData.js' - -## Overview - -This section hosts short, standalone units that you can read independently of any other topics. - -## Units - - diff --git a/docs/academy/js/standalone/using-ml-models/10_embedding.mdx b/docs/academy/js/standalone/using-ml-models/10_embedding.mdx deleted file mode 100644 index e580b44cc..000000000 --- a/docs/academy/js/standalone/using-ml-models/10_embedding.mdx +++ /dev/null @@ -1,140 +0,0 @@ ---- -title: Using Embedding models in Weaviate ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!./_snippets/10_embedding.ts'; -import EmbeddingModelImage from '/docs/academy/js/standalone/using-ml-models/_img/embedding-models.jpg'; -import Unimodal from '/docs/academy/js/standalone/using-ml-models/_img/unimodal.jpg'; -import Multimodal from '/docs/academy/js/standalone/using-ml-models/_img/multimodal.jpg'; - - -## What are Embedding Models - -Embedding models are machine learning models trained to represent information as an array of numbers, frequently referred to as vector embeddings. 
Vectors or vector embeddings are numeric representations of data that represent certain properties or features. This representation can be used to efficiently search through objects in a vector space. - -Image alt - - -## When to use Embedding Models - -Embeddings are the worker horses behind modern search and Retrieval-Augmented Generation (RAG) applications. They are great for.. - -- **Search:** Results of searches are ranked by the distance from an input query vector. -- **Classification:** Items are classified by what category their vector representation is closest to. -- **Recommendations:** Items with similar vector representations are recommended to users. - - -## Applications of Embedding Models - -Embedding models, like most machine learning models are typically limited to one or more modalities. - -We use modality to describe the type of input or output that a machine learning model can process or interact with to run. Typically, embedding modals fall into two buckets, uni-modal or multimodal. - - -- **Uni-modal Embeddings**: These embeddings represents a single modality in a multi-dimensional vector space. Examples of these are [embed-multilingual-v3.0](https://cohere.com/blog/introducing-embed-v3) a text embedding model by Cohere or [marengo 2.7](https://www.twelvelabs.io/blog/introducing-marengo-2-7) a video embedding models by Twelve Labs. - -Image alt - - -- **Multimodal Embeddings**: These embeddings represent multiple modalities in a multi-dimensional space. Allowing cross modal retrieval and clustering. [CLIP](https://openai.com/index/clip/) is a popular multimodal model that can create embeddings of text, audio and video data. - -Image alt - - - -## Using Embedding Models in Weaviate - -Weaviate takes most of the complexity of generating and managing embeddings away! Weaviate is configured to support many different vectorizer models and vectorizer service providers. It also gives you the option of providing your own vectors. 
- -In Weaviate, vector embeddings power hybrid and semantic search. - -Lets walk through the process to configure embedding models in Weaviate and make a semantic search. We'll start by creating a free sandbox account on [Weaviate Cloud](https://console.weaviate.cloud/). Follow [this guide](/cloud/manage-clusters/connect) if you have trouble setting up a sandbox project. - -### Step 1: Connect to a Weaviate instance - - - - - -Initialize your connection with Weaviate and add relevant environment variables necessary to access third party embedding models. - -### Step 2: Define a Collection and Embedding Model - - - - - -When creating a collection in Weaviate, we define what embedding model we want to use. In this example we use a text embedding model by Cohere to create vector embeddings our data. This is **embed-multilingual-v3.0** when we use the `text2vecCohere()` module. - - -### Step 3: Importing data - - - - - -Once our collection is created, we import data. It is at import time where we interact with our embedding model. The Weaviate vectorizer sends objects to the embedding model we define during collection creation. At the end of this, we have both our data objects and their corresponding vector representations stored in our Vector Database. Now we can run semantic search queries. - - -### Step 4: Running a Semantic Search - - - - - - -Here we make a query and set `return` as true so we can see the objects' vectors in our response. Read more about [search here](/weaviate/concepts/search). - - - - -## Bonus Learning - -### Vector Representations -Vector representations are the fundamental output of embedding models. They translate complex data (text, images, etc.) into fixed-length arrays of numbers that capture semantic meaning. - -- **Dimensionality**: Typically ranges from 384 to 1536 dimensions, depending on the model. A larger dimensionality usually means more accuracy but also a higher memory footprint for generated vectors. 
- -- **Format**: Vectors are typically floating point numbers, usually normalized to a specific range. - -### Distance metrics - -[Distance metrics](/weaviate/config-refs/distances) quantify the similarity between vector embeddings. Weaviate uses **[cosine similarity](https://weaviate.io/blog/distance-metrics-in-vector-search#cosine-similarity)** as the default distance metric for semantic similarity. - - - - - -Embedding models vary when it comes to performance and ability, [read through this article](https://weaviate.io/blog/how-to-choose-an-embedding-model) so you have an idea of what to think about decide between various models. - diff --git a/docs/academy/js/standalone/using-ml-models/20_generative.mdx b/docs/academy/js/standalone/using-ml-models/20_generative.mdx deleted file mode 100644 index 33f84237a..000000000 --- a/docs/academy/js/standalone/using-ml-models/20_generative.mdx +++ /dev/null @@ -1,148 +0,0 @@ ---- -title: Using Generative Models in Weaviate ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!./_snippets/20_generative.ts'; -import WeaviateTypescriptImgUrl from '/docs/academy/js/standalone/using-ml-models/_img/generative.jpg'; -import Unimodal from '/docs/academy/js/standalone/using-ml-models/_img/unimodal-gen.jpg'; -import Multimodal from '/docs/academy/js/standalone/using-ml-models/_img/multimodal-gen.jpg'; - - -## What are Generative Models - -Generative models are machine learning models that when prompted, can generate original data guided by instructions in the prompt i.e. text, images, and other forms. This original data is derived from data it was trained on but does not mimic it like for like. - -Image alt - -Generative Models encompass so many types of models, we will specifically focus on large language models (LLMs). 
- -## When to use Generative Models - -Generative models are stars in the limelight of retrieval augmented generation (RAG) and agentic workflows. They are great for... - -- **Translation:** Models can perform zero-shot translate text from one language to another with extremely high accuracy. -- **Code Generation:** Models can take high-level instructions and turn them into functional custom code. -- **Image Generation:** Models can consistently generate high quality images from text instructions in a prompt. - - -## Applications of Generative Models - - -Large Language Models (LLMs), like [Claude](https://www.anthropic.com/claude) family by Anthropic or [Gemini](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/inference) by Google are specialized types of generative models focused on text data. These models, like most machine learning models are typically limited to one or more modalities. - -We use modality to describe the type of input or output that a machine learning model can process or interact with to run. Typically, generative modals fall into two buckets, uni-modal or multimodal. - - -- **Uni-modal Generation:** In the context on LLMs, uni-modal generation defines a models ability to generate content and receive instructions in a single modality, this modality is usually text. - -Image alt - - - -- **Multimodal Generation:** In the context on LLMs, multimodal generation defines a models ability to generate and receive instructions in multiple modalities. This can range from text input to generation or even image input to audio generation. - -Image alt - - -### Using Generative Models in Weaviate - -Weaviate is configured to support many generative models and generative model providers. You can even plug in your own generative model too depending on where in the Weaviate workflow you need generative capabilities. - - -In Weaviate, generative models power RAG (generative search). 
Lets walk through what its like to use generative models in Weaviate. We'll start by creating a free sandbox account on [Weaviate Cloud](https://console.weaviate.cloud/). Follow [this guide](/cloud/manage-clusters/connect) if you have trouble setting up a sandbox project. - -## Step 1: Connect to a Weaviate instance - - - - - -Initialize your connection with Weaviate and add relevant environment variables necessary to access third party generative models. - -## Step 2: Define a Collection and Generative Model - - - - - -When creating a collection in Weaviate, we define what generative model we want to use. In this example we use a text generation model by Cohere to generate new data. This is **command-r** by default. - -## Step 3: Importing data - - - - - -Once our collection is created, we import data. It is at import time where we interact with our embedding model. The Weaviate vectorizer sends objects to the embedding model we define during collection creation. At the end of this, we have both our data objects and their corresponding vector representations stored in our Vector Database. Now we can run semantic search queries and with a generative model defined, RAG! - -## Step 4: Making a Single Task Generative Search - - - - - -Here we use a `singlePrompt` to make `n` requests to the language model where `n` is the number of responses we get from our semantic search. We use `limit` to strictly define the number of responses we get. We can place responses from each response into our prompt with this format `{ answer }` i.e we want the answer property from our search response to be translated to French. - -## Step 5: Making a grouped Generative Search - - - - - - -Here we use the `groupedTask` prompt format to group all the response from our search and send them alongside our prompt as context for what ever we are requesting. 
You can see with `groupedProperties` we only pass the answer property from all the results we get as context to the large language model, giving us control of what information will inform the models output. - - - -## Bonus Learning -### Prompt engineering & Output control -Prompt engineering is the science of refining inputs or "prompts" to AI models to achieve desired or more effective outputs. It involves.. - -- **Clear Instructions:** Being specific and explicit in your requests helps the AI understand exactly what you need. Instead of "analyze this," try "provide a detailed analysis of the key themes and supporting evidence." - - -### Context windows -The context window represents how much information an AI model can "see" and process at once. Think of it as the model's working memory for each conversation. - -- **Token Limits:** Context windows are measured in tokens (roughly 3/4 of a word in English). Different models have different limits - from a few thousand to hundreds of thousands of tokens. - -Both managing context windows and prompt engineering are a great way to begin refining your RAG implementation. - -Generative models vary when it comes to performance and ability. [Browse our integrations page](/weaviate/model-providers) to have a better idea of what options you can use in Weaviate. 
- diff --git a/docs/academy/js/standalone/using-ml-models/_img/embedding-models.jpg b/docs/academy/js/standalone/using-ml-models/_img/embedding-models.jpg deleted file mode 100644 index d81264c35..000000000 Binary files a/docs/academy/js/standalone/using-ml-models/_img/embedding-models.jpg and /dev/null differ diff --git a/docs/academy/js/standalone/using-ml-models/_img/generative.jpg b/docs/academy/js/standalone/using-ml-models/_img/generative.jpg deleted file mode 100644 index d3696882e..000000000 Binary files a/docs/academy/js/standalone/using-ml-models/_img/generative.jpg and /dev/null differ diff --git a/docs/academy/js/standalone/using-ml-models/_img/multimodal-gen.jpg b/docs/academy/js/standalone/using-ml-models/_img/multimodal-gen.jpg deleted file mode 100644 index f23f1917d..000000000 Binary files a/docs/academy/js/standalone/using-ml-models/_img/multimodal-gen.jpg and /dev/null differ diff --git a/docs/academy/js/standalone/using-ml-models/_img/multimodal.jpg b/docs/academy/js/standalone/using-ml-models/_img/multimodal.jpg deleted file mode 100644 index c9c0af6b2..000000000 Binary files a/docs/academy/js/standalone/using-ml-models/_img/multimodal.jpg and /dev/null differ diff --git a/docs/academy/js/standalone/using-ml-models/_img/unimodal-gen.jpg b/docs/academy/js/standalone/using-ml-models/_img/unimodal-gen.jpg deleted file mode 100644 index 11626ef27..000000000 Binary files a/docs/academy/js/standalone/using-ml-models/_img/unimodal-gen.jpg and /dev/null differ diff --git a/docs/academy/js/standalone/using-ml-models/_img/unimodal.jpg b/docs/academy/js/standalone/using-ml-models/_img/unimodal.jpg deleted file mode 100644 index 195c9dc49..000000000 Binary files a/docs/academy/js/standalone/using-ml-models/_img/unimodal.jpg and /dev/null differ diff --git a/docs/academy/js/standalone/using-ml-models/_snippets/10_embedding.ts b/docs/academy/js/standalone/using-ml-models/_snippets/10_embedding.ts deleted file mode 100644 index eb6bd425a..000000000 --- 
a/docs/academy/js/standalone/using-ml-models/_snippets/10_embedding.ts +++ /dev/null @@ -1,71 +0,0 @@ -// START Connect -import weaviate, { WeaviateClient, configure } from 'weaviate-client' - -// END Connect -import 'dotenv/config' - -// START Connect -const weaviateURL = process.env.WEAVIATE_URL as string -const weaviateKey = process.env.WEAVIATE_API_KEY as string -const cohereKey = process.env.COHERE_API_KEY as string - -// Connect to your Weaviate instance -const client: WeaviateClient = await weaviate.connectToWeaviateCloud(weaviateURL,{ - authCredentials: new weaviate.ApiKey(weaviateKey), - headers: { - 'X-Cohere-Api-Key': cohereKey, // Replace with your inference API key - } - } -) -// END Connect - // Delete the "JeopardyQuestion" collection if it exists - await client.collections.delete('JeopardyQuestion'); - - if (await client.collections.exists('JeopardyQuestion') == false) { - - // Create a collection with both a vectorizer and generative model -// START Collection -await client.collections.create({ - name: 'JeopardyQuestion', - properties: [ - { name: 'Category', dataType: configure.dataType.TEXT }, - { name: 'Question', dataType: configure.dataType.TEXT }, - { name: 'Answer', dataType: configure.dataType.TEXT} - ], - // Define your Cohere vectorizer and generative model - vectorizers: weaviate.configure.vectors.text2VecCohere({ - sourceProperties: ["Question", "Answer"] - }), -}); -// END Collection - - } - - try { -// START Importing -let jeopardyCollection = client.collections.use('JeopardyQuestion'); -// Download data to import into the "JeopardyQuestion" collection -const url = 'https://raw.githubusercontent.com/weaviate/weaviate-examples/main/jeopardy_small_dataset/jeopardy_tiny.json' -const response = await fetch(url); -const jeopardyQuestions = await response.json(); - -// Bulk insert downloaded data into the "JeopardyQuestion" collection -await jeopardyCollection.data.insertMany(jeopardyQuestions.data) -// END Importing - console.log('Data 
Imported'); - } catch (e) { - console.error(e); - } -// START Search -const jeopardyCollection = client.collections.use('JeopardyQuestion'); - -const searchResults = await jeopardyCollection.query.nearText(['question about animals'], { - limit: 3, - returnMetadata: ['distance'], // Return the distance of results from the query vector - includeVector: false // Change to true to include objects' vectors in your response -}) - -console.log("Near Text objects for:", JSON.stringify(searchResults, null, 2)); -// END Search - - diff --git a/docs/academy/js/standalone/using-ml-models/_snippets/20_generative.ts b/docs/academy/js/standalone/using-ml-models/_snippets/20_generative.ts deleted file mode 100644 index ee593a7ac..000000000 --- a/docs/academy/js/standalone/using-ml-models/_snippets/20_generative.ts +++ /dev/null @@ -1,81 +0,0 @@ -// START Connect -import weaviate, { WeaviateClient, configure } from 'weaviate-client' - -// END Connect - -import 'dotenv/config' - -// START Connect -const weaviateURL = process.env.WEAVIATE_URL as string -const weaviateKey = process.env.WEAVIATE_API_KEY as string -const cohereKey = process.env.COHERE_API_KEY as string - -// Connect to your Weaviate instance -const client: WeaviateClient = await weaviate.connectToWeaviateCloud(weaviateURL,{ - authCredentials: new weaviate.ApiKey(weaviateKey), - headers: { - 'X-Cohere-Api-Key': cohereKey, // Replace with your inference API key - } - } -) -// END Connect - - // Delete the "JeopardyQuestion" collection if it exists - await client.collections.delete('JeopardyQuestion'); - - if (await client.collections.exists('JeopardyQuestion') == false) { -// START Collection -await client.collections.create({ - name: 'JeopardyQuestion', - properties: [ - { name: 'Category', dataType: configure.dataType.TEXT }, - { name: 'Question', dataType: configure.dataType.TEXT }, - { name: 'Answer', dataType: configure.dataType.TEXT} - ], - // Define your Cohere vectorizer and generative model - vectorizers: 
weaviate.configure.vectors.text2VecCohere(), - // highlight-start - generative: weaviate.configure.generative.cohere() - // highlight-start -}); -// END Collection - -} - - try { -// START Importing -let jeopardyCollection = client.collections.use('JeopardyQuestion'); - -// Download data to import into the "JeopardyQuestion" collection -const url = 'https://raw.githubusercontent.com/weaviate/weaviate-examples/main/jeopardy_small_dataset/jeopardy_tiny.json' -const response = await fetch(url); -const jeopardyQuestions = await response.json(); - -// Bulk insert downloaded data into the "JeopardyQuestion" collection -await jeopardyCollection.data.insertMany(jeopardyQuestions.data) - -console.log('Data Imported'); -// END Importing - } catch (e) { - console.error(e); - } - - const myCollection = client.collections.use('JeopardyQuestion'); -// START SingleGenerative -const genResult = await myCollection.generate.nearText("african elephant in savanna", { - singlePrompt: "translate {answer} into french for me", -}) - -for (const item of genResult.objects) { - console.log("Single generated concept:", item.generated); -} -// END SingleGenerative - -// START GroupedGenerative -const groupedGenResult = await myCollection.generate.nearText("african elephant in savanna", { - groupedTask: "Summarize all the results received into a single informational paragraph?", - groupedProperties: ["answer"] -}) - -console.log("Grouped generated concept:", groupedGenResult.generated); -// END GroupedGenerative diff --git a/docs/academy/js/standalone/using-ml-models/index.md b/docs/academy/js/standalone/using-ml-models/index.md deleted file mode 100644 index 53ffb6ada..000000000 --- a/docs/academy/js/standalone/using-ml-models/index.md +++ /dev/null @@ -1,44 +0,0 @@ ---- -title: Using Machine Learning Models in Weaviate -sidebar_position: 10 ---- - -# Using Machine Learning Models in Weaviate - -## Overview - -Weaviate leverages two fundamental types of machine learning models to power 
AI-native applications: - -1. **Embedding Models** - Transform data into high-dimensional vector representations -2. **Generative Models** - Create new content based on input prompts and context - -This guide will help you understand how these models can be set up in Weaviate, briefly covering the functioning of these models on a high level as well. - -We will look at how to use the search enabled by the two types of embedding models supported in Weaviate; **Text embeddings** and **Multimodal embeddings**. - -This guide will also explore practical applications ranging from semantic search to agentic RAG applications. - - - -### Prerequisites - -- A Node.js environment with `weaviate-client` installed -- Basic understanding of Weaviate's search functionality -- Intermediate JavaScript programming skills -- You must have completed the [quickstart](/weaviate/quickstart) - -## Learning objectives - -import LearningGoalsExp from '/src/components/Academy/learningGoalsExp.mdx'; - - - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/standalone/which-search/05_review.mdx b/docs/academy/js/standalone/which-search/05_review.mdx deleted file mode 100644 index d79753aae..000000000 --- a/docs/academy/js/standalone/which-search/05_review.mdx +++ /dev/null @@ -1,118 +0,0 @@ ---- -title: Review of search types ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!./_snippets/05_review.ts'; - -## Overview - -Weaviate offers three primary search types - namely **vector**, **keyword**, and **hybrid** searches. Let's briefly recap what they are, and how they work. 
- -### Code examples - -These code examples are runnable, with the [`v3` Weaviate Typescript client](../../../../weaviate/client-libraries/typescript/index.mdx). Connect to the pre-configured demo instance of Weaviate with the following code, and try the examples below. - - - - - - - - - -## Vector search - -A vector search finds objects with the most similar vectors to the query vector. - -Because each vector is a numerical representation of the underlying object, a vector similarity can be thought of as a similarity in meaning. Therefore a vector search is also called "semantic search". - -In Weaviate, you can search for objects with similar vectors in any of the following ways: - -With a source medium (e.g. text or image): - - - - - - - - - -With a vector: - - - - - - - - - -With an existing Weaviate object: - - - - - - - - - -## Keyword search - -A keyword search finds objects whose keywords (i.e. tokens) are the most relevant to the keywords (i.e. tokens) of the query. The relevance is determined by the [BM25F algorithm](https://en.wikipedia.org/wiki/Okapi_BM25). - -Intuitively, the BM25F algorithm determines "relevance" by considering how often a keyword appears in each field of the object, relative to how commonly the keyword appears in the entire dataset. - - - - - - - - - -## Hybrid search - -A hybrid search combines the results of a vector search and a keyword search. This is done by performing both searches, and them combining the two search results with a "fusion" algorithm. 
- - - - - - - - diff --git a/docs/academy/js/standalone/which-search/10_strengths.mdx b/docs/academy/js/standalone/which-search/10_strengths.mdx deleted file mode 100644 index cfc7803c8..000000000 --- a/docs/academy/js/standalone/which-search/10_strengths.mdx +++ /dev/null @@ -1,203 +0,0 @@ ---- -title: Strengths of each search type ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!./_snippets/10_strengths.ts'; - - -## Overview - -These different search types are offered because they each have different characteristics, and therefore different strengths. - -Let's explore the relative strengths of each search type. - -## Strengths of vector search - -### Robustness - -A vector is a numerical representation of the underlying object's meaning. As a result, a vector search is robust to any changes that don't affect the meaning of the object. - -More concretely, a vector of "cat", for example, will be similar to a vector of "kitten", "feline", and "pet", even though their spellings are very different. - -See this in action below, where we search for "cat" and "kitten" using vector search. - - - - - - - - - -You see that the results for "cat" and "kitten" are very similar. - -In other words, the vectors for "cat" and "kitten" are similar in meaning, because the model can "understand" meaning. - - - -Similarly, a vector of "cat" is similar to the vector of "cat" with a spelling mistake, such as "caat", or "catt". - - - - - - - - - -Here, the results are basically identical. - - - -This robustness is a key strength of vector search, as it means that the searcher does not need to know the exact words used in the dataset. This is particularly useful when the concepts being searched for are not well-defined, or when the searcher is not familiar with the dataset. - -### Versatility - -A vector search is also versatile. 
It can be used to search multiple data modalities (e.g. text, images, audio, etc.), and across multiple languages. - -This is because the vector is a numerical representation of the underlying object's meaning, and therefore the same vector can be used to represent the same meaning, regardless of the data modality or language. - -In fact, some models are capable of search across multiple data modalities, or multiple languages (or both!). This is made possible by using a model that can convert the data into comparable vectors, regardless of the data modality or language. - -![Vectorization across multiple modalities](./_img/multimodal_example.png) - -## Strengths of keyword search - -### Exactitude - -Keyword search is ideal for situations where locating precise matches are required. This is particularly useful in scenarios where there exist exact terms to search for, such as academic research, searches through domain-specific data or technical troubleshooting. - -The ability to return results that precisely match the search terms ensures that users receive the most relevant information for their specific queries. - -More concretely, take a look at the example below, where we search for "imaging". - - - - - - - - - -And when we inspect the results: - - - -A search for "imaging" using a keyword search returns the one result that contains that specific word. - - -## Strengths of hybrid search - -A key strength of hybrid search is its resiliency. Let's explore this in more detail. - -### Resiliency - -A hybrid search is resilient as it combines top results from both vector and keyword search. This helps to mitigate either search's shortcomings. - -Take a look at the hybrid search example below. - - - - - - - - - -We can inspect the results here: - - - -You can see that as well as the keyword search result above (for "imaging"), we get a semantically relevant result (for "X-rays"). 
- -Because hybrid search combines the results of both vector and keyword search, it will find objects that score well on at least one of the search types. This approach has the effect of complementing each search type. - - - diff --git a/docs/academy/js/standalone/which-search/20_selection.mdx b/docs/academy/js/standalone/which-search/20_selection.mdx deleted file mode 100644 index cb12999bc..000000000 --- a/docs/academy/js/standalone/which-search/20_selection.mdx +++ /dev/null @@ -1,104 +0,0 @@ ---- -title: Selecting the right search type ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!./_snippets/20_selection.ts'; - - -## Overview - -Selecting the right search type is key for effective search tasks. Let's explore how to select the right search type for your needs. - - -## Rules of thumb - - -### When to use vector search - -Vector search is the most robust and versatile search type. As such, it is well-suited for situations where the the meaning, or the vector representation, is of the highest importance. - -In cross-modal, object-based or multi-lingual searches, vector search may be the only viable option. - -Start with vector search for: - -- **Non-text, or cross-modal searches**: Essential for searching across different types of media, like finding images using text descriptions or vice versa. -- **Object-based searches**: For finding similar objects to an extracted text chunk, image, or video, vector search is likely the only viable solution. -- **Multi-lingual contexts**: The go-to choice for handling searches in multiple languages, where traditional keyword-based search may fall short. -- **Complex query understanding**: Vector search excels in interpreting and responding to complex queries that require understanding context or nuances in language. 
- -### When to use keyword search - -Keyword search is useful when there is an expectation or requirement to match the exact search terms. This can be the case for specific domains such as legal, medical or technical areas where the exact terminology is important. - -Keyword search is also useful when the user is unlikely to make mistakes in inputs and is inputting a predictable set of terms, such as through a sanitized form or a drop-down menu. - -In summary, start with keyword search for: - -**Exact term matching**: Ideal in domains like legal, medical, or technical fields where specific terminology is crucial. -**Predictable user inputs**: Works well when users are expected to input a defined set of terms, like through forms or drop-down menus. -**Simple and direct queries**: Effective for straightforward search needs where the complexity of natural language processing is not required. -**Fast and specific results**: Suitable for quick retrieval of information based on specific keywords or phrases. - -### When to use hybrid search - -Hybrid search is a great choice for "messy" situations. - -Because hybrid search combines results sets from both vector and keyword searches, it is able to provide a good balance between the robustness of vector search and the exactitude of keyword search. - -As a result, hybrid search is a generally good choice for most search needs that do not fall into the specific use cases of vector or keyword search. - -In summary, consider hybrid search for: - -- **Broad topic ranges**: Effective in scenarios where the target corpus covers a wide array of subjects, requiring a versatile search approach. -- **Versatile search scenarios**: Useful for real-life scenarios that often require a combination of results from both vector and keyword searches. -- **Unpredictable user inputs**: Ideal for many real-life scenarios where the user has free reign over the query. 
Some user queries may be aimed at direct matches while others' queries may be more about the overall meaning. - - - diff --git a/docs/academy/js/standalone/which-search/30_strategies.mdx b/docs/academy/js/standalone/which-search/30_strategies.mdx deleted file mode 100644 index f4cd6d9cd..000000000 --- a/docs/academy/js/standalone/which-search/30_strategies.mdx +++ /dev/null @@ -1,162 +0,0 @@ ---- -title: Strategies to improve search results ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!./_snippets/30_strategies.ts'; - - -## Overview - -In addition to selecting the right search types, there are also strategies you can employ to improve the quality of your search results. - -Let's explore some of these strategies. - -## Improve vector search - -The key to improving vector search is to make sure that the vector representation of the object is fit for purpose, so as to suit the search needs. - -### Vectorizer selection - -Unless you are inserting data with your own vectors, you will be using a Weaviate vectorizer module, and a model within that module, to generate vectors for your data. - -The choice of vectorizer module and model is important, as it will determine what aspects of the data are captured in the vector representation, and how well the model is able to "understand" the data. - -First and foremost, you should select a vectorizer module that is best suited for your data type. For example, if you are working with text data, you should use the `text2vec` module, and if you are using image or multi-modal data, you should likely use the `multi2vec` module. - -We will cover vectorizer selection in another unit. But, if you are not sure where to start, try: -- `text2vec-cohere`, or `text2vec-openai` for text data (API-based) - - Cohere offers a multi-lingual model that can be used with over 100 languages. 
-- `multi2vec-clip` for image or image and text data. - -If you are working with text and prefer to run a local inference container, try `text2vec-transformers`, with a popular model such as `sentence-transformers/all-MiniLM-L12-v2`. - -### Try a re-ranker - -Re-ranker modules are a great way to improve the quality of your search results. - -A re-ranker module is a module that takes in the results of a vector search, and re-ranks the results based on additional criteria, or a different model. This allows a higher-quality (but slower) model to be used for re-ranking, while still benefiting from the fast first stage search. - -For example, you can use the `text2vec-cohere` module to perform a vector search, and then use the `reranker-cohere` module to re-rank the results using a different model. - -### Property selection - -Vectorization captures the "meaning" of the object. Accordingly, if a property is not relevant to the criteria to be applied for search, it should be excluded from the vectorization process. - -As an example, if a product object includes metadata such as its manufacturing process or location, and the vector search is intended to be based on the product's features, then the properties for manufacturing process and location should be excluded from the vectorization process. - -You can do this by specifying whether to skip a property during vectorization, as shown below. Note that you can do the same with the collection name, and the property name. - - - - - - - - - -### Chunking - -Chunking refers to the process of splitting a text into smaller chunks, and vectorizing each chunk separately. This is very important, as it defines how much information each vector contains. - -As a rule of thumb, the more granular the search needs, the smaller the chunk size should be. For example, if you are searching for specific concepts and ideas, you should chunk data into smaller units such as sentences or small windows of text. 
Alternatively, if you are searching for broader concepts, such as finding relevant chapters or books, you might chunk text accordingly. - -## Improve keyword search - -### Tokenization - -Although we refer to BM25 search as a "keyword" search, in reality the exact matches are for "tokens", rather than words. This is a different tokenization process to that used for generating vector embeddings, but instead, it is used to build the inverted index for BM25 searches and filtering. - -Accordingly, the tokenization process is very important, as it determines what tokens are used for matching. - -The available options are: `word`, `lowercase`, `whitespace`, and `field`. The default (`word`) might be sufficient for prose, but for text where exact matches including case and symbols are important, something like `whitespace` might be more appropriate. - -Available tokenization options: - -import TokenizationDefinition from '/_includes/tokenization_definition.mdx'; - - - -You can set tokenization in the collection configuration. - - - - - - - - - -### Select and boost properties - -If you observe that matches in some properties are having too much of an impact, you can exclude them from the search, and/or boost the importance certain properties. - -For example, matches in the `description` property might be more important than matches in the `notes` property. You can specify this at query time. - - - - - - - - - -## Improve hybrid search - -### Alpha - -The alpha parameter determines the balance between the vector and keyword search results. - -If you want to configure your search to be more vector-based, you can increase the alpha value. Conversely, if you want to configure your search to be more keyword-based, you can decrease the alpha value. - - - - - - - - - -### Fusion algorithm - -The fusion algorithm determines how the results from the vector and keyword searches are combined. 
- -By default, an inverse of the ranks from each results set are summed, in what is called the "ranked fusion" algorithm. However, you can also use the "relative score fusion" algorithm, which sums normalized scores from each results set. - -Generally, we have found that the "relative score fusion" algorithm works better, but you should try both to see which works best for your use case. - - - - - - - - - diff --git a/docs/academy/js/standalone/which-search/_30_improve_search.mdx b/docs/academy/js/standalone/which-search/_30_improve_search.mdx deleted file mode 100644 index 2447a084e..000000000 --- a/docs/academy/js/standalone/which-search/_30_improve_search.mdx +++ /dev/null @@ -1,13 +0,0 @@ ---- -title: Improving search ---- - -- Evaluating search quality -- Improving search quality - - Vectorizer - - Data structure - - Search type -- Improving search speed - - Inference speed - - Resource demands - - Query bottlenecks diff --git a/docs/academy/js/standalone/which-search/_img/multimodal_example.png b/docs/academy/js/standalone/which-search/_img/multimodal_example.png deleted file mode 100644 index 628d4d5be..000000000 Binary files a/docs/academy/js/standalone/which-search/_img/multimodal_example.png and /dev/null differ diff --git a/docs/academy/js/standalone/which-search/_snippets/05_review.ts b/docs/academy/js/standalone/which-search/_snippets/05_review.ts deleted file mode 100644 index 7be057a22..000000000 --- a/docs/academy/js/standalone/which-search/_snippets/05_review.ts +++ /dev/null @@ -1,112 +0,0 @@ -import weaviate, { WeaviateClient, WeaviateReturn } from "weaviate-client" -// START connectionCode -let client: WeaviateClient - - -client = await weaviate.connectToWeaviateCloud(process.env.WEAVIATE_URL as string, { - authCredentials: new weaviate.ApiKey(process.env.WEAVIATE_API_KEY as string), - headers: { - 'X-OpenAI-Api-Key': process.env.OPENAI_APIKEY as string, // Replace with your inference API key - } -} -) -// END connectionCode - - -// START 
nearTextExample // START nearVectorExample // START nearObjectExample // START bm25Example // START hybridExample -type NonGenericReturn = WeaviateReturn -let response: NonGenericReturn - -const questions = client.collections.use("JeopardyQuestion") - -// END nearTextExample // END nearVectorExample // END nearObjectExample // END bm25Example // END hybridExample - -// START nearTextExample -response = questions.query.nearText("space travel", // Your query string - { - limit: 2 - } -) - -for (const item of response.objects) { - console.log(item.uuid) - console.log(item.properties) -} -// END nearTextExample -// assert len(response.objects) == 2 -// assert "question" in response.objects[0].properties.keys() - -response = questions.query.nearText("space travel", { - limit: 1, - includeVector: true -} -) -let vectorInput = response.objects[0].vectors -let objectInput = response.objects[0].uuid - -// START nearVectorExample -response = questions.query.nearVector(vectorInput, // Your vectors - { - limit: 2 - } -) - -for (const item of response.objects) { - console.log(item.uuid) - console.log(item.properties) -} -// END nearVectorExample - -// assert len(response.objects) == 2 -// assert "question" in response.objects[0].properties.keys() - -// START nearObjectExample -response = questions.query.nearObject(objectInput, // Your object UUID - { - limit: 2 - } -) - -for (const item of response.objects) { - console.log(item.uuid) - console.log(item.properties) -} -// END nearObjectExample - - -// assert len(response.objects) == 2 -// assert "question" in response.objects[0].properties.keys() - -// START bm25Example -response = questions.query.bm25("space travel", // Your query string - { - limit: 2 - } -) - -for (const item of response.objects) { - console.log(item.uuid) - console.log(item.properties) -} -// END bm25Example - -// assert len(response.objects) == 2 -// assert "question" in response.objects[0].properties.keys() - - -// START hybridExample -response = 
questions.query.hybrid("space travel", // Your query string - { - limit: 2 - } -) - -for (const item of response.objects) { - console.log(item.uuid) - console.log(item.properties) -} - -// END hybridExample - -// assert len(response.objects) == 2 -// assert "question" in response.objects[0].properties.keys() diff --git a/docs/academy/js/standalone/which-search/_snippets/10_strengths.ts b/docs/academy/js/standalone/which-search/_snippets/10_strengths.ts deleted file mode 100644 index 8315bf1db..000000000 --- a/docs/academy/js/standalone/which-search/_snippets/10_strengths.ts +++ /dev/null @@ -1,164 +0,0 @@ -import weaviate, { WeaviateClient, WeaviateReturn } from "weaviate-client" - -let client: WeaviateClient -type NonGenericReturn = WeaviateReturn -let response: NonGenericReturn -// # END-ANY - -client = await weaviate.connectToWeaviateCloud(process.env.WEAVIATE_URL as string,{ - authCredentials: new weaviate.ApiKey(process.env.WEAVIATE_API_KEY as string), - headers: { - 'X-OpenAI-Api-Key': process.env.OPENAI_APIKEY as string, // Replace with your inference API key - } - } - ) - -// START robustnessExampleWords -for (const query of ["cat", "kitten"]) { - const question = client.collections.use("JeopardyQuestion") - - response = await question.query.nearText(query,{ - limit: 1, - returnMetadata: ['distance'], - returnProperties: ["question", "answer"] - }) - - for (const item of response.objects) { - console.log(`\n===== Search results for ${query} =====`) - console.log("Distance:", item.metadata?.distance) - console.log(item.properties) - } -} -// END robustnessExampleWords -// assert length(response.objects) == 1 -// assert "question" in response.objects[0].properties.keys() - - -const example = ` -// START responseRobustnessExampleWords -===== Search results for cat ===== -Distance: 0.170 -{ - "answer": "Fat cat", - "question": "A flabby tabby" -} - -===== Search results for kitten ===== -Distance: 0.150 -{ - "answer": "Fat cat", - "question": "A flabby tabby" 
-} -// END responseRobustnessExampleWords -` - -// START robustnessExampleSpelling -for (const query of ["cat", "catt", "caat"]) { - const question = client.collections.use("JeopardyQuestion") - - response = await question.query.nearText(query,{ - limit: 1, - returnMetadata: ['distance'], - returnProperties: ["question", "answer"] - }) - - for (const item of response.objects) { - console.log(`\n===== Search results for ${query} =====`) - console.log("Distance:", item.metadata?.distance) - console.log(item.properties) - } -} - -// END robustnessExampleSpelling - -// assert len(response.objects) == 1 -// assert "question" in response.objects[0].properties.keys() - - -const example2 = ` -// START responseRobustnessExampleSpelling -===== Search results for cat ===== -Distance: 0.170 -{ - "answer": "Fat cat", - "question": "A flabby tabby" -} - -===== Search results for catt ===== -Distance: 0.177 -{ - "answer": "Fat cat", - "question": "A flabby tabby" -} - -===== Search results for caat ===== -Distance: 0.184 -{ - "answer": "Fat cat", - "question": "A flabby tabby" -} -// END responseRobustnessExampleSpelling -` - -// START bm25Example // START hybridExample -const question = client.collections.use("JeopardyQuestion") -// END bm25Example // END hybridExample - -// START bm25Example - -response = await question.query.bm25("imaging",{ // Your query string - limit: 2, - returnProperties: ["question", "answer"] - }) - -for (const item of response.objects) { - console.log(item.uuid) - console.log(item.properties) -} -// END bm25Example - -// assert "question" in response.objects[0].properties.keys() - -const example3 = ` -// START bm25Results -49fe3d7c-61a5-5916-99bb-052d07c7c251 -{ - "answer": "magnetic resonance imaging", - "question": "MRI, which stands for this, cannot be used on patients with pacemakers or artificial metal joints" -} -// END bm25Results -` - -// START hybridExample - -response = await question.query.hybrid("imaging", { // Your query string - limit: 2, 
- returnMetadata: ["score"], - returnProperties: ["question", "answer"] - }) - -for (const item of response.objects) { - console.log(item.uuid) - console.log(item.metadata?.score) - console.log(item.properties) -} - -// END hybridExample - -// assert "question" in response.objects[0].properties.keys() - - -` -// START hybridResults -49fe3d7c-61a5-5916-99bb-052d07c7c251 -{ - "answer": "magnetic resonance imaging", - "question": "MRI, which stands for this, cannot be used on patients with pacemakers or artificial metal joints" -} -9041bce6-b5d1-5637-bcbe-2ebb8a689fe0 -{ - "answer": "X-rays", - "question": "These electromagnetic rays used to take pictures of your insides were originally known as Roentgen rays" -} -// END hybridResults -` diff --git a/docs/academy/js/standalone/which-search/_snippets/20_selection.ts b/docs/academy/js/standalone/which-search/_snippets/20_selection.ts deleted file mode 100644 index e69de29bb..000000000 diff --git a/docs/academy/js/standalone/which-search/_snippets/30_strategies.ts b/docs/academy/js/standalone/which-search/_snippets/30_strategies.ts deleted file mode 100644 index 45c1ab252..000000000 --- a/docs/academy/js/standalone/which-search/_snippets/30_strategies.ts +++ /dev/null @@ -1,127 +0,0 @@ -import weaviate, { CollectionConfig, CollectionConfigCreate, WeaviateClient, WeaviateReturn } from "weaviate-client" - -let client: WeaviateClient - -client = await weaviate.connectToWeaviateCloud(process.env.WEAVIATE_URL as string,{ - authCredentials: new weaviate.ApiKey(process.env.WEAVIATE_API_KEY as string), - headers: { - 'X-OpenAI-Api-Key': process.env.OPENAI_APIKEY as string, // Replace with your inference API key - } - } -) - -client.collections.delete("Product") - -// START skipVectorizationExample // START tokenizationExample -let products: CollectionConfigCreate -// END skipVectorizationExample // END tokenizationExample - -// START skipVectorizationExample -products = await client.collections.create({ - name: "Product", - 
vectorizers: weaviate.configure.vectors.text2VecOpenAI({ - // highlight-start - vectorizeCollectionName: true - // highlight-end - }), - properties: [{ - name: "name", - dataType: weaviate.configure.dataType.TEXT, - // highlight-start - vectorizePropertyName: true - // highlight-end - }, - { - name: "description", - dataType: weaviate.configure.dataType.TEXT, - }, - { - name: "manufacturing_process", - dataType: weaviate.configure.dataType.TEXT, - // highlight-start - skipVectorization: true, // Skip unwanted property - //highlight-end - }] -}) -// END skipVectorizationExample - -client.collections.delete("Product") - -client.collections.delete("SomeCollection") - -// START tokenizationExample - -products = await client.collections.create({ - name: "SomeCollection", - properties: [{ - name: "name", - dataType: weaviate.configure.dataType.TEXT, - // highlight-start - tokenization: weaviate.configure.tokenization.WORD // Default - // highlight-end - }, - { - name: "description", - dataType: weaviate.configure.dataType.TEXT, - tokenization: weaviate.configure.tokenization.WHITESPACE // Will keep case & special characters - }, - { - name: "email", - dataType: weaviate.configure.dataType.TEXT, - // highlight-start - tokenization: weaviate.configure.tokenization.FIELD // Do not tokenize at all - // highlight-end - }] -}) -// END tokenizationExample - - -client.collections.delete("SomeCollection") - -// START selectAndBoostExample // START adjustAlpha // START changeFusionType -type NonGenericReturn = WeaviateReturn -let response: NonGenericReturn - -const questions = client.collections.use("JeopardyQuestion") - -// END selectAndBoostExample // END adjustAlpha // END changeFusionType - - -// START selectAndBoostExample -response = questions.query.bm25("animal",{ // Your query string - limit: 5, - queryProperties: ["question^3", "answer"] // Boost the impact of "question" property by 3 - } -) - -for (const item of response.objects) { - console.log(item.properties) -} -// 
END selectAndBoostExample - -// START adjustAlpha -response = questions.query.hybrid("imaging",{ // Your query string - limit: 5, - alpha: 0.1, // Mostly a vector search (Try it with alpha=0.9) - } -) - -for (const item of response.objects) { - console.log(item.properties) -} -// END adjustAlpha - - - -// START changeFusionType -response = questions.query.hybrid("imaging",{ // Your query string - limit: 5, - fusionType: "RelativeScore", - alpha: 0.1, // Mostly a vector search (Try it with alpha=0.9) - } -) - -for (const item of response.objects) { - console.log(item.properties) -} -// END changeFusionType diff --git a/docs/academy/js/standalone/which-search/index.md b/docs/academy/js/standalone/which-search/index.md deleted file mode 100644 index 2107465cb..000000000 --- a/docs/academy/js/standalone/which-search/index.md +++ /dev/null @@ -1,50 +0,0 @@ ---- -title: Which search is right for me? -sidebar_position: 10 ---- - -## Unit overview - - - - - - - -Weaviate offers three distinct search methods - namely **vector**, **keyword**, and **hybrid** searches. - -Each method has its unique strengths and applicabilities, making the selection critical to the success of your search-related tasks. - -This section compares these search types to equip you with the knowledge to intuit when and why to employ each of these search methodologies. - -We will explore how the choice of search type impacts not only the quality of the search results but also the overall performance of the search operation. - -Then, we will also discuss strategies to improve the quality of search results, as well as the performance of the search operation. - - -### Prerequisites - -- A Node.js environment with `weaviate-client` installed. -- Familiarity with Weaviate's search capabilities. -- Intermediate coding proficiency (e.g. JavaScript). 
- -## Learning objectives - -import LearningGoalsExp from '/src/components/Academy/learningGoalsExp.mdx'; - - - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_multimodal_data/101_setup_weaviate/10_client.mdx b/docs/academy/js/starter_multimodal_data/101_setup_weaviate/10_client.mdx deleted file mode 100644 index 67bc24408..000000000 --- a/docs/academy/js/starter_multimodal_data/101_setup_weaviate/10_client.mdx +++ /dev/null @@ -1,37 +0,0 @@ ---- -title: Weaviate Typescript client -description: Client Setup for Multimodal Data in Weaviate ---- - -## Installation - -The latest Weaviate TypeScript client library can be installed using npm. The client library is tested on Node v18 and later. Install it using the following command: - -```bash -npm install weaviate-client -``` - -The latest major version is `v3` (e.g. `3.x.x`). You can check the version like so: - -```bash -npm view weaviate-client version -``` - -## Basic usage - -You can import the Weaviate client library like so: - -```typescript -import weaviate, { generateUuid5, ApiKey } from "weaviate-client" -``` - -The client provides sets of helper functions (e.g. `generateUuid5, ApiKey`) to make it easier to interact with Weaviate. - -Next, we'll show you how create a Weaviate instance and connect to it. 
- - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_multimodal_data/101_setup_weaviate/20_create_instance/10_create_wcs.mdx b/docs/academy/js/starter_multimodal_data/101_setup_weaviate/20_create_instance/10_create_wcs.mdx deleted file mode 100644 index edb85e0d9..000000000 --- a/docs/academy/js/starter_multimodal_data/101_setup_weaviate/20_create_instance/10_create_wcs.mdx +++ /dev/null @@ -1,67 +0,0 @@ ---- -title: "Option 1: A cloud WCD instance" -description: "Create a Weaviate instance on WCS for scalable, cloud-based data projects." ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../../_snippets/101_connect.mts'; - -Here, you will create a Weaviate Cloud (WCD) instance. WCD is a fully managed Weaviate instance that runs in the cloud. It's a great way to get started with Weaviate, as it requires no installation or maintenance. - -### Log in to the WCD Console - -Go to the [WCD Console](https://console.weaviate.cloud/) and log in with your credentials. If you don't have an account yet, you can sign up by clicking on the Register here link from the login screen. - -### Create a Weaviate instance - -From the console, go to the Dashboard and click on the Create cluster button. From the following screen: - -- Select the "Free sandbox" tab -- Provide a cluster name -- Set "Enable authentication" to "Yes" - -Click on the Create button to create your Weaviate instance. The process will take a few minutes. - -### Retrieve your Weaviate instance details - -Once the instance is created, you will be able see its details by clicking on the Details button. Find the cluster URL and the API key. - -You will need these details to connect to your Weaviate instance. 
- -### Connect to your WCD instance - -To connect to the Weaviate Cloud (WCD) instance, you need to use the cluster URL and the API key. You can find these details in the WCD Console. - -Use the `connectToWeaviateCloud()` function to connect to your WCD instance. - - - -#### Provide inference API keys - -Some Weaviate modules can use inference APIs for vectorizing data or large language model integration. You can provide the API keys for these services to Weaviate at instantiation. - -This course uses VoyageAI, so you can provide the VoyageAI API key to Weaviate through `headers: {"X-VoyageAI-Api-Key": }` as shown below: - - - -:::note What next? -If you have completed this, you can skip the next page [Option 2: A local Weaviate instance](./20_create_docker.mdx) and continue with [Communicate with Weaviate](../30_communicate.mdx). -::: - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_multimodal_data/101_setup_weaviate/20_create_instance/20_create_docker.mdx b/docs/academy/js/starter_multimodal_data/101_setup_weaviate/20_create_instance/20_create_docker.mdx deleted file mode 100644 index af3632a70..000000000 --- a/docs/academy/js/starter_multimodal_data/101_setup_weaviate/20_create_instance/20_create_docker.mdx +++ /dev/null @@ -1,93 +0,0 @@ ---- -title: "Option 2: A local Docker instance" -description: "Set up Weaviate with Docker for a quick and customizable local deployment." ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../../_snippets/101_connect.mts'; - -:::note Have you already created a Weaviate instance? -If you have [created a cloud instance](./10_create_wcs.mdx) of Weaviate, you can skip this page and continue with [Communicate with Weaviate](../30_communicate.mdx). 
-::: - -Here, you will create a Weaviate instance using Docker. - -### Download and run the docker-compose file - -Install Docker on your machine. We recommend following the [official Docker installation guide](https://docs.docker.com/get-docker/). - -Create a new directory and navigate to it in your terminal. Then, create a new file called `docker-compose.yml` and add the following content: - -```yaml ---- -version: '3.4' -services: - weaviate_anon: - command: - - --host - - 0.0.0.0 - - --port - - '8080' - - --scheme - - http - image: cr.weaviate.io/semitechnologies/weaviate:||site.weaviate_version|| - ports: - - 8080:8080 - - 50051:50051 - restart: on-failure:0 - environment: - COHERE_APIKEY: $COHERE_APIKEY - VOYAGEAI_APIKEY: $VOYAGEAI_APIKEY - QUERY_DEFAULTS_LIMIT: 25 - AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' - PERSISTENCE_DATA_PATH: '/var/lib/weaviate' - DEFAULT_VECTORIZER_MODULE: 'none' - ENABLE_MODULES: 'text2vec-cohere,multi2vec-voyageai,generative-cohere' - BACKUP_FILESYSTEM_PATH: '/var/lib/weaviate/backups' - CLUSTER_HOSTNAME: 'node1' -... -``` - -### Create a Weaviate instance - -Run the following command to start Weaviate: - -```bash -docker compose up -d -``` - -### Your Weaviate instance details - -Once the instance is created, you can access it at `http://localhost:8080`. - -### Connect to your Weaviate instance - -To connect to the Weaviate instance, use the `connectToLocal()` function. - - - -#### Provide inference API keys - -Some Weaviate modules can use inference APIs for vectorizing data or large language model integration. You can provide the API keys for these services to Weaviate at instantiation. 
- -This course uses VoyageAI, so you can provide the VoyageAI API key to Weaviate through `headers: {"X-VoyageAI-Api-Key": }` as shown below: - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_multimodal_data/101_setup_weaviate/20_create_instance/index.mdx b/docs/academy/js/starter_multimodal_data/101_setup_weaviate/20_create_instance/index.mdx deleted file mode 100644 index 53c348780..000000000 --- a/docs/academy/js/starter_multimodal_data/101_setup_weaviate/20_create_instance/index.mdx +++ /dev/null @@ -1,19 +0,0 @@ ---- -title: Create a Weaviate instance -description: "Create a new Weaviate instance tailored for text data applications." ---- - -For this unit, you can choose to create a Weaviate Cloud (WCD) instance or a local Docker instance. - -- [Create a Weaviate Cloud (WCD) instance](./10_create_wcs.mdx) - - If you want a managed service and don't want to worry about installation and maintenance. -- [Create a local Docker instance](./20_create_docker.mdx) - - If you want to run Weaviate on your local machine, or want to have full control over the installation and maintenance. - -Either option is fine for this course. If you're not sure which to choose, we recommend starting with a WCD instance. 
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_multimodal_data/101_setup_weaviate/30_communicate.mdx b/docs/academy/js/starter_multimodal_data/101_setup_weaviate/30_communicate.mdx deleted file mode 100644 index 9f190e9a8..000000000 --- a/docs/academy/js/starter_multimodal_data/101_setup_weaviate/30_communicate.mdx +++ /dev/null @@ -1,64 +0,0 @@ ---- -title: Communicate with Weaviate -description: Communication Setup for Multimodal Data ---- -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../_snippets/101_connect.mts'; - -Here, we'll perform basic operations to communicate with Weaviate using the TypeScript client library. - -### Check Weaviate status - -You can check whether the Weaviate instance is up using the `isLive` function. - - - -### Retrieve server meta information - -You can retrieve meta information about the Weaviate instance using the `getMeta` function. - - - -This will print the server meta information to the console. The output will look similar to the following: - -
- Example getMeta() output - - -
- -### Close the connection - -After you have finished using the Weaviate client, you should close the connection. This frees up resources and ensures that the connection is properly closed. - -We suggest using a `try`-`finally` block as a best practice. For brevity, we will not include the `try`-`finally` blocks in the remaining code snippets. - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - \ No newline at end of file diff --git a/docs/academy/js/starter_multimodal_data/101_setup_weaviate/index.mdx b/docs/academy/js/starter_multimodal_data/101_setup_weaviate/index.mdx deleted file mode 100644 index 0c72f66c4..000000000 --- a/docs/academy/js/starter_multimodal_data/101_setup_weaviate/index.mdx +++ /dev/null @@ -1,26 +0,0 @@ ---- -title: Set up Weaviate -description: Weaviate Setup for Multimodal Data ---- - - - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_multimodal_data/102_mm_collections/10_preparation.mdx b/docs/academy/js/starter_multimodal_data/102_mm_collections/10_preparation.mdx deleted file mode 100644 index 55ff47bc9..000000000 --- a/docs/academy/js/starter_multimodal_data/102_mm_collections/10_preparation.mdx +++ /dev/null @@ -1,36 +0,0 @@ ---- -title: Preparation ---- - -In this section you are going to populate your Weaviate instance with a movie dataset, using the multi-modal, Cohere Embed 3 models to embed the text and image data. - -### Weaviate instance - -Make sure to have your Weaviate instance set up. You should have [created an instance](../101_setup_weaviate/20_create_instance/index.mdx) and be able to connect to it. - -### Source data - -We are going to use a movie dataset sourced from [TMDB](https://www.themoviedb.org/). 
The dataset can be found in this [GitHub repository](https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024.json), and it contains bibliographic information on ~700 movies released between 1990 and 2024. - -As a multimodal project, we'll also use [corresponding posters for each movie](https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024_posters.zip), which are available in the same repository. - -
- See sample text data - -| | backdrop_path | genre_ids | id | original_language | original_title | overview | popularity | poster_path | release_date | title | video | vote_average | vote_count | -|---:|:---------------------------------|:----------------|-----:|:--------------------|:----------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------:|:---------------------------------|:---------------|:----------------------------|:--------|---------------:|-------------:| -| 0 | /3Nn5BOM1EVw1IYrv6MsbOS6N1Ol.jpg | [14, 18, 10749] | 162 | en | Edward Scissorhands | A small suburban town receives a visit from a castaway unfinished science experiment named Edward. | 45.694 | /1RFIbuW9Z3eN9Oxw2KaQG5DfLmD.jpg | 1990-12-07 | Edward Scissorhands | False | 7.7 | 12305 | -| 1 | /sw7mordbZxgITU877yTpZCud90M.jpg | [18, 80] | 769 | en | GoodFellas | The true story of Henry Hill, a half-Irish, half-Sicilian Brooklyn kid who is adopted by neighbourhood gangsters at an early age and climbs the ranks of a Mafia family under the guidance of Jimmy Conway. | 57.228 | /aKuFiU82s5ISJpGZp7YkIr3kCUd.jpg | 1990-09-12 | GoodFellas | False | 8.5 | 12106 | -| 2 | /6uLhSLXzB1ooJ3522ydrBZ2Hh0W.jpg | [35, 10751] | 771 | en | Home Alone | Eight-year-old Kevin McCallister makes the most of the situation after his family unwittingly leaves him behind when they go on Christmas vacation. But when a pair of bungling burglars set their sights on Kevin's house, the plucky kid stands ready to defend his territory. 
By planting booby traps galore, adorably mischievous Kevin stands his ground as his frantic mother attempts to race home before Christmas Day. | 3.538 | /onTSipZ8R3bliBdKfPtsDuHTdlL.jpg | 1990-11-16 | Home Alone | False | 7.4 | 10599 | -| 3 | /vKp3NvqBkcjHkCHSGi6EbcP7g4J.jpg | [12, 35, 878] | 196 | en | Back to the Future Part III | The final installment of the Back to the Future trilogy finds Marty digging the trusty DeLorean out of a mineshaft and looking for Doc in the Wild West of 1885. But when their time machine breaks down, the travelers are stranded in a land of spurs. More problems arise when Doc falls for pretty schoolteacher Clara Clayton, and Marty tangles with Buford Tannen. | 28.896 | /crzoVQnMzIrRfHtQw0tLBirNfVg.jpg | 1990-05-25 | Back to the Future Part III | False | 7.5 | 9918 | -| 4 | /3tuWpnCTe14zZZPt6sI1W9ByOXx.jpg | [35, 10749] | 114 | en | Pretty Woman | When a millionaire wheeler-dealer enters a business contract with a Hollywood hooker Vivian Ward, he loses his heart in the bargain. | 97.953 | /hVHUfT801LQATGd26VPzhorIYza.jpg | 1990-03-23 | Pretty Woman | False | 7.5 | 7671 | - -
- -Next, you will create a corresponding object collection and import the data. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_multimodal_data/102_mm_collections/20_create_collection.mdx b/docs/academy/js/starter_multimodal_data/102_mm_collections/20_create_collection.mdx deleted file mode 100644 index 37f8d088b..000000000 --- a/docs/academy/js/starter_multimodal_data/102_mm_collections/20_create_collection.mdx +++ /dev/null @@ -1,89 +0,0 @@ ---- -title: Create a collection -description: Creating Multimodal Data Collections ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../_snippets/102_collection.mts'; - -Weaviate stores data in "collections". A collection is a set of objects that share the same data structure. In our movie database, we might have a collection of movies, a collection of actors, and a collection of reviews. - -Here we will create a collection of movies. - -## Code - -This example creates a collection for the movie data: - - - -Each collection definition must have a name. Then, you can define additional parameters like we've done in this example. - -## Explain the code - -### Properties - -Properties are the object attributes that you want to store in the collection. Each property has a name and a data type. - -In our movie database, we have properties like `title`, `release_date` and `genre_ids`, with data types like `TEXT` (string), `DATE` (date), or `INT` (integer). It's also possible to have arrays of integers, like we have with `genre_ids`. - -As a multimodal object, we also have the `poster` property which is the image data, which is saved as a `BLOB` (binary large object) data type. - -#### Auto-schema - -Weaviate can automatically [infer the schema](/weaviate/config-refs/collections#auto-schema) from the data. 
However, it's a good practice to define the properties explicitly, for better control and to avoid surprises. - -### Vectorizer configuration - -If you do not specify the vector yourself, Weaviate will use a specified vectorizer to generate vector embeddings from your data. - -In this code example, we specify the `multi2vec-voyageai` module. This module uses the **voyage-multimodal-3** model to generate vector embeddings from the text and image data. - -You can specify any number of text and image properties to be used for vectorization, and weight them differently. The weights are used to determine the relative importance of each property in the vector embedding generation process. In this example, we vectorize the `poster` property (an image) with a 90% weight and the `title` property (a string) with a 10% weight. - - - -### Generative configuration - -If you wish to use your collection with a generative model (e.g. a large language model), you must specify the generative module. - -In this code example, we specify the `cohere` module (`generative-cohere` is the full name) with default options. - - - -import MutableGenerativeConfig from '/_includes/mutable-generative-config.md'; - - - -### TypeScript named imports - -The code example makes use of named imports such as `vectorizer` and `configure`. They are defined in the `weaviate` module and are used during the collection definition. 
- - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_multimodal_data/102_mm_collections/30_import_data.mdx b/docs/academy/js/starter_multimodal_data/102_mm_collections/30_import_data.mdx deleted file mode 100644 index 3d274c01d..000000000 --- a/docs/academy/js/starter_multimodal_data/102_mm_collections/30_import_data.mdx +++ /dev/null @@ -1,107 +0,0 @@ ---- -title: Import data -description: Importing Data into Multimodal Data Collections ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../_snippets/102_collection.mts'; - -## Code - -This example imports the movie data into our collection. - - - -The code: -- Loads the source text and image data -- Gets the collection -- Loops through the data and: - - Finds corresponding image to the text - - Converts the image to base64 - - Bulk inserts objects in batches of 20 -- Prints out any import errors - -## Explain the code - -### Preparation - -We use the native Node.js `fetch()` to load the data from the source, in this case a JSON file containing text data and a Zip file containing posters. The text data is then converted to a JSON object for easier manipulation and the images are extracted from the Zip file. - -Then, we create a collection object (with `client.collections.get`) so we can interact with the collection. - -### Iterating over data - -The `for` loop is used in conjunction with `Object.keys()` to iterate through the elements in our JSON file. While iterating we increment the counter variable that lets us bulk insert objects in batches. - - - - -### Add data to the Object - -#### Convert data types and build the Object - -The data is converted from a string to the correct data types for Weaviate. 
For example, the `release_date` is converted to a `Date` object, and the `genre_ids` are converted to a list of integers. - - - -To save the image data as a `BLOB` (binary large object) data type, we convert the image to base64 using the helpful `toBase64FromMedia` utility that comes with the Weaviate client. - - - -After converting data to the correct format, we build the object by its properties preparing it to be inserted into Weaviate. - -#### Bulk insert data - -Then we create on object that includes the uuid generated with `generateUuid5` from Weaviate and the object containing properties we previously define, we push this object to `itemsToInsert` for them to be bulk inserted with `insertMany()` once the batch is ready. - - - -### Error handling - -If you have any errors in your bulk insertion, you want to know something went wrong. That way you can decide how to handle them, such as by raising an exception. In this example, we simply print out that there was an error with the import. - - - -## Where do the vectors come from? - -When sends the items to Weaviate, the objects are added to the collection. In our case, the movie collection. - -Recall that the collection has a vectorizer module, and we do not specify vectors here. So Weaviate uses the specified vectorizer to generate vector embeddings from the data. 
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_multimodal_data/102_mm_collections/index.mdx b/docs/academy/js/starter_multimodal_data/102_mm_collections/index.mdx deleted file mode 100644 index 5ad92db5a..000000000 --- a/docs/academy/js/starter_multimodal_data/102_mm_collections/index.mdx +++ /dev/null @@ -1,26 +0,0 @@ ---- -title: Populate the database -description: Multimodal Data Collections Overview ---- - - - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_multimodal_data/103_mm_searches/10_multimodal.mdx b/docs/academy/js/starter_multimodal_data/103_mm_searches/10_multimodal.mdx deleted file mode 100644 index d8c03e94a..000000000 --- a/docs/academy/js/starter_multimodal_data/103_mm_searches/10_multimodal.mdx +++ /dev/null @@ -1,145 +0,0 @@ ---- -title: Multimodal search -description: Multimodal Search Methodology ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../_snippets/103_searches.mts'; - -With Weaviate, you can perform semantic searches to find similar items based on their meaning. This is done by comparing the vector embeddings of the items in the database. - -As we are using a multimodal model, we can search for objects based on their similarity to any of the supported modalities. Meaning that we can search for movies based on their similarity to a text or an image. 
- -## Image query - -### Code - -This example finds entries in "Movie" based on their similarity to [this image of the International Space Station](https://upload.wikimedia.org/wikipedia/commons/thumb/0/04/International_Space_Station_after_undocking_of_STS-132.jpg/440px-International_Space_Station_after_undocking_of_STS-132.jpg), and prints out the title and release year of the top 5 matches. - -
- Query image - -![International Space Station](https://upload.wikimedia.org/wikipedia/commons/thumb/0/04/International_Space_Station_after_undocking_of_STS-132.jpg/440px-International_Space_Station_after_undocking_of_STS-132.jpg) - -
- - - -### Explain the code - -The results are based on similarity of the vector embeddings between the query and the database object. In this case, the vectorizer module generates an embedding of the input image. - -The `limit` parameter here sets the maximum number of results to return. - -The `returnMetadata` parameter takes an array of strings to set metadata to return in the search results. The current query returns the vector distance to the query. - - -
- Example results - -Posters for the top 5 matches: -Interstellar -Gravity -Arrival -Armageddon -Godzilla - -Weaviate output: - -```text -Interstellar 2014 157336 -Distance to query: 0.354 - -Gravity 2013 49047 -Distance to query: 0.384 - -Arrival 2016 329865 -Distance to query: 0.386 - -Armageddon 1998 95 -Distance to query: 0.400 - -Godzilla 1998 929 -Distance to query: 0.441 -``` - -
- -### Response object - -The returned object is an instance of a custom class. Its `objects` attribute is a list of search results, each object being an instance of another custom class. - -Each returned object will: -- Include all properties and its UUID by default except those with blob data types. - - Since the `poster` property is a blob, it is not included by default. - - To include the `poster` property, you must specify it and the other properties to fetch in the `returnProperties` parameter. -- Not include any other information (e.g. references, metadata, vectors.) by default. - - -## Text search - -### Code - -This example finds entries in "Movie" based on their similarity to the query "red", and prints out the title and release year of the top 5 matches. - - - -### Explain the code - -The results are based on similarity of the vector embeddings between the query and the database object. In this case, the vectorizer module generates an embedding of the input text. - -The remaining parameters are the same as in the previous example. - -
- Example results - -Posters for the top 5 matches: -Deadpool 2 -Bloodshot -Deadpool -300 -The Hunt for Red October - -Weaviate output: - -```text -Deadpool 2 2018 383498 -Distance to query: 0.670 - -Bloodshot 2020 338762 -Distance to query: 0.677 - -Deadpool 2016 293660 -Distance to query: 0.678 - -300 2007 1271 -Distance to query: 0.682 - -The Hunt for Red October 1990 1669 -Distance to query: 0.683 -``` - -
- -### Response object - -The returned object is in the same format as in the previous example. - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_multimodal_data/103_mm_searches/20_keyword_hybrid.mdx b/docs/academy/js/starter_multimodal_data/103_mm_searches/20_keyword_hybrid.mdx deleted file mode 100644 index a73e85720..000000000 --- a/docs/academy/js/starter_multimodal_data/103_mm_searches/20_keyword_hybrid.mdx +++ /dev/null @@ -1,105 +0,0 @@ ---- -title: Keyword & Hybrid search -description: Hybrid Keyword Searches in Multimodal Collections ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../_snippets/103_searches.mts'; - -You can also perform keyword (BM25) searches to find items based on their keyword similarity, or hybrid searches that combine BM25 and semantic/vector searches. - -## Keyword search - -### Code - -This example finds entries in "Movie" with the highest keyword search scores for the term "history", and prints out the title and release year of the top 5 matches. - - - -### Explain the code - -The results are based on a keyword search score using what's called the [BM25f](https://en.wikipedia.org/wiki/Okapi_BM25) algorithm. - -The `limit` parameter here sets the maximum number of results to return. - -The `returnMetadata` parameter takes an array of strings to set metadata to return in the search results. The current query returns the `score`, which is the BM25 score of the result. - -
- Example results - -```text -American History X 1998 -BM25 score: 2.707 - -A Beautiful Mind 2001 -BM25 score: 1.896 - -Legends of the Fall 1994 -BM25 score: 1.663 - -Hacksaw Ridge 2016 -BM25 score: 1.554 - -Night at the Museum 2006 -BM25 score: 1.529 -``` - -
- - -## Hybrid search - -### Code - -This example finds entries in "Movie" with the highest hybrid search scores for the term "history", and prints out the title and release year of the top 5 matches. - - - -### Explain the code - -The results are based on a hybrid search score. A hybrid search blends results of BM25 and semantic/vector searches. - -The `limit` parameter here sets the maximum number of results to return. - -The `returnMetadata` parameter takes an array of strings to set metadata to return in the search results. The current query returns the `score`, which is the hybrid score of the result. - -
- Example results - -```text -Legends of the Fall 1994 -Hybrid score: 0.016 - -Hacksaw Ridge 2016 -Hybrid score: 0.016 - -A Beautiful Mind 2001 -Hybrid score: 0.015 - -The Butterfly Effect 2004 -Hybrid score: 0.015 - -Night at the Museum 2006 -Hybrid score: 0.012 -``` - -
- - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_multimodal_data/103_mm_searches/30_filters.mdx b/docs/academy/js/starter_multimodal_data/103_mm_searches/30_filters.mdx deleted file mode 100644 index b9ad7aadd..000000000 --- a/docs/academy/js/starter_multimodal_data/103_mm_searches/30_filters.mdx +++ /dev/null @@ -1,55 +0,0 @@ ---- -title: Filters -description: Filters for Multimodal Searches ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../_snippets/103_searches.mts'; - -Filters can be used to precisely refine search results. You can filter by properties as well as metadata, and you can combine multiple filters with `and` or `or` conditions to further narrow down the results. - -### Code - -This example finds entries in "Movie" based on their similarity to the query "dystopian future", only from those released after 2010. It prints out the title and release year of the top 5 matches. - - - -## Explain the code - -This query is identical to [that shown earlier](./10_multimodal.mdx) for search, but with the addition of a filter. The `filters` parameter makes use of the `filter` namespace to set the filter conditions. The current query filters the results to only include those with a release year after 2010. - -
- Example results - -```text -Dune 2021 -Distance to query: 0.199 - -Tenet 2020 -Distance to query: 0.200 - -Mission: Impossible - Dead Reckoning Part One 2023 -Distance to query: 0.207 - -Onward 2020 -Distance to query: 0.214 - -Jurassic World Dominion 2022 -Distance to query: 0.216 -``` - -
- - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_multimodal_data/103_mm_searches/index.mdx b/docs/academy/js/starter_multimodal_data/103_mm_searches/index.mdx deleted file mode 100644 index a12cd2600..000000000 --- a/docs/academy/js/starter_multimodal_data/103_mm_searches/index.mdx +++ /dev/null @@ -1,26 +0,0 @@ ---- -title: Perform searches -description: Multimodal Searches Overview ---- - - - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_multimodal_data/104_mm_rag/10_setup.mdx b/docs/academy/js/starter_multimodal_data/104_mm_rag/10_setup.mdx deleted file mode 100644 index ecd42c2c1..000000000 --- a/docs/academy/js/starter_multimodal_data/104_mm_rag/10_setup.mdx +++ /dev/null @@ -1,40 +0,0 @@ ---- -title: "RAG: Overview" -description: Setting up Multimodal RAG ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../_snippets/102_collection.mts'; - -### Motivation - -Retrieval augmented generation (RAG) is a way to combine the best of both worlds: the retrieval capabilities of semantic search and the generation capabilities of AI models such as large language models. This allows you to retrieve objects from a Weaviate instance and then generate outputs based on the retrieved objects. - -### Setup - -When we created a collection, we specified the `generative` parameter as shown here. - - - -This selects a generative module that will be used to generate outputs based on the retrieved objects. In this case, we're using the `cohere` module, and the `command` family of large language models. 
- -As we did before with the vectorizer module, you will require an API key from the provider of the generative module. In this case, you will need an API key from Cohere. - -### RAG queries - -RAG queries are also called 'generative' queries in Weaviate. You can access these functions through the `generate` submodule of the collection object. - -Each generative query works in addition to the regular search query, and will perform a RAG query on each retrieved object. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_multimodal_data/104_mm_rag/20_single_prompt.mdx b/docs/academy/js/starter_multimodal_data/104_mm_rag/20_single_prompt.mdx deleted file mode 100644 index 5ac928190..000000000 --- a/docs/academy/js/starter_multimodal_data/104_mm_rag/20_single_prompt.mdx +++ /dev/null @@ -1,57 +0,0 @@ ---- -title: "'Single prompt' generation" -description: Single Prompt for Multimodal RAG ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../_snippets/104_rag.mts'; - -A 'single prompt' generation wil perform RAG queries on each retrieved object. This is useful when you want to transform each object separately, with the same prompt. - -### Code - -This example finds entries in "Movie" based on their similarity to [this image of the International Space Station](https://upload.wikimedia.org/wikipedia/commons/thumb/0/04/International_Space_Station_after_undocking_of_STS-132.jpg/440px-International_Space_Station_after_undocking_of_STS-132.jpg). Then, instructs the large language model to translate the title of each movie into French. - -Each of the results are then printed out to the console. - - - -## Explain the code - -You must pass on one or more properties to the `singlePrompt` parameter through braces, as we've done here with `"... {title} ..."`. 
This will instruct Weaviate to pass on the `title` property from each retrieved object to the large language model. - -
- Example results - -```text -Interstellar -Interstellaire -Gravity -Gravité -Arrival -Arrivée -Armageddon -Armageddon -Godzilla -Godzilla -``` - -
- -### Response object - -Each response object is similar to that from a regular search query, with an additional `generated` attribute. This attribute will contain the generated output for each object. - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_multimodal_data/104_mm_rag/30_grouped_task.mdx b/docs/academy/js/starter_multimodal_data/104_mm_rag/30_grouped_task.mdx deleted file mode 100644 index 2bee636b3..000000000 --- a/docs/academy/js/starter_multimodal_data/104_mm_rag/30_grouped_task.mdx +++ /dev/null @@ -1,60 +0,0 @@ ---- -title: "'Grouped task' generation" -description: Grouped Task for Multimodal RAG ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../_snippets/104_rag.mts'; - -A 'grouped task' generation wil perform RAG queries on the set of retrieved objects. This is useful when you want to transform the set of objects as a whole, with one prompt. - -### Code - -This example finds entries in "MovieMM" based on their similarity to [this image of the International Space Station](https://upload.wikimedia.org/wikipedia/commons/thumb/0/04/International_Space_Station_after_undocking_of_STS-132.jpg/440px-International_Space_Station_after_undocking_of_STS-132.jpg). Then, instructs the large language model to find commonalities between them. - -Each of the results are then printed out to the console. - - - -## Explain the code - -For `groupedTask` queries, you simply pass on the prompt to the `groupedTask` parameter. This will instruct Weaviate to pass on the: -- text properties from all retrieved objects, and -- the prompt - -to the large language model. - -
- Example results - -```text -Interstellar -Gravity -Arrival -Armageddon -Godzilla -These movies all involve space exploration, extraterrestrial beings, or catastrophic events threatening Earth. They all deal with themes of survival, human ingenuity, and the unknown mysteries of the universe. -``` - -
- -### Optional parameters - -You can also pass on a list of properties to be used, as the `groupedProperties` parameter. This can be useful to reduce the amount of data passed on to the large language model and omit irrelevant properties. - -### Response object - -A RAG query with the `groupedTask` parameter will return a response with an additional `generated` attribute. This attribute will contain the generated output for the set of objects. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_multimodal_data/104_mm_rag/index.mdx b/docs/academy/js/starter_multimodal_data/104_mm_rag/index.mdx deleted file mode 100644 index 6ff132817..000000000 --- a/docs/academy/js/starter_multimodal_data/104_mm_rag/index.mdx +++ /dev/null @@ -1,26 +0,0 @@ ---- -title: LLMs and Weaviate (RAG) -description: Multimodal Retrieval-Augmented Generation (RAG) Overview ---- - - - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_multimodal_data/900_next_steps.mdx b/docs/academy/js/starter_multimodal_data/900_next_steps.mdx deleted file mode 100644 index 0ae070810..000000000 --- a/docs/academy/js/starter_multimodal_data/900_next_steps.mdx +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: Next steps ---- - -import IntroNextSteps from '../_snippets/intro_next_steps_js.mdx'; - - diff --git a/docs/academy/js/starter_multimodal_data/_snippets/101_connect.mts b/docs/academy/js/starter_multimodal_data/_snippets/101_connect.mts deleted file mode 100644 index 1aecc4488..000000000 --- a/docs/academy/js/starter_multimodal_data/_snippets/101_connect.mts +++ /dev/null @@ -1,171 +0,0 @@ -// DockerInstantiation // WCDInstantiation // WCDAPIKeyInstantiation // DockerInstantiation // DockerAPIKeyInstantiation // TryFinallyCloseDemo -import 
weaviate, { WeaviateClient } from "weaviate-client"; -let client: WeaviateClient; -// END DockerInstantiation // END WCDInstantiation // END WCDAPIKeyInstantiation // END DockerInstantiation // END DockerAPIKeyInstantiation // END TryFinallyCloseDemo - - -// WCDInstantiation - -client = await weaviate.connectToWeaviateCloud( - process.env.WEAVIATE_URL as string, { - authCredentials: new weaviate.ApiKey(process.env.WEAVIATE_API_KEY as string), -}) - -// END WCDInstantiation - -client.close() - -// WCDAPIKeyInstantiation - -client = await weaviate.connectToWeaviateCloud( - process.env.WEAVIATE_URL as string, { - authCredentials: new weaviate.ApiKey(process.env.WEAVIATE_API_KEY as string), - headers: { - 'X-VoyageAI-Api-Key': process.env.VOYAGEAI_API_KEY as string, // Replace with your inference API key - } -}) -// END WCDAPIKeyInstantiation - -client.close() - -// DockerInstantiation - -client = await weaviate.connectToLocal() -// END DockerInstantiation - -client.close() - -// DockerAPIKeyInstantiation - -client = await weaviate.connectToLocal({ - host: '...', - headers: { - 'X-VoyageAI-Api-Key': process.env.VOYAGEAI_APIKEY as string, // Replace with your inference API key - } -}) -// END DockerAPIKeyInstantiation - -// PollLiveness -if (await client.isLive()) { - // Run operations with the client if it is live -} -// END PollLiveness - - -// GetMeta -console.log(await client.getMeta()) -// END GetMeta - - -const outputString = -// OutputGetMeta -{ - hostname: 'http://[::]:8080', - modules: { - 'backup-gcs': { - bucketName: 'weaviate-wcs-prod-cust-europe-west3-workloads-backups', - rootName: '55a78146-dae1-4609-90ce-556db01f4a61' - }, - 'generative-anyscale': { - documentationHref: 'https://docs.anyscale.com/endpoints/overview', - name: 'Generative Search - Anyscale' - }, - 'generative-aws': { - documentationHref: 'https://docs.aws.amazon.com/bedrock/latest/APIReference/welcome.html', - name: 'Generative Search - AWS' - }, - 'generative-cohere': { - 
documentationHref: 'https://docs.cohere.com/reference/chat', - name: 'Generative Search - Cohere' - }, - 'generative-mistral': { - documentationHref: 'https://docs.mistral.ai/api/', - name: 'Generative Search - Mistral' - }, - 'generative-openai': { - documentationHref: 'https://platform.openai.com/docs/api-reference/completions', - name: 'Generative Search - OpenAI' - }, - 'generative-google': { - documentationHref: 'https://cloud.google.com/vertex-ai/docs/generative-ai/chat/test-chat-prompts', - name: 'Generative Search - Google' - }, - 'multi2vec-google': { - documentationHref: 'https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-multimodal-embeddings', - name: 'Google Multimodal Module' - }, - 'qna-openai': { - documentationHref: 'https://platform.openai.com/docs/api-reference/completions', - name: 'OpenAI Question & Answering Module' - }, - 'ref2vec-centroid': {}, - 'reranker-cohere': { - documentationHref: 'https://txt.cohere.com/rerank/', - name: 'Reranker - Cohere' - }, - 'reranker-voyageai': { - documentationHref: 'https://docs.voyageai.com/reference/reranker-api', - name: 'Reranker - VoyageAI' - }, - 'text2vec-aws': { - documentationHref: 'https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html', - name: 'AWS Module' - }, - 'text2vec-cohere': { - documentationHref: 'https://docs.cohere.ai/embedding-wiki/', - name: 'Cohere Module' - }, - 'text2vec-huggingface': { - documentationHref: 'https://huggingface.co/docs/api-inference/detailed_parameters#feature-extraction-task', - name: 'Hugging Face Module' - }, - 'text2vec-jinaai': { - documentationHref: 'https://jina.ai/embeddings/', - name: 'JinaAI Module' - }, - 'text2vec-openai': { - documentationHref: 'https://platform.openai.com/docs/guides/embeddings/what-are-embeddings', - name: 'OpenAI Module' - }, - 'text2vec-google': { - documentationHref: 'https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings', - name: 'Google Module' - }, - 
'text2vec-voyageai': { - documentationHref: 'https://docs.voyageai.com/docs/embeddings', - name: 'VoyageAI Module' - } - }, - version: '1.25.5' -} -// END OutputGetMeta - - - -client.close() - -client = await weaviate.connectToWeaviateCloud( - process.env.WEAVIATE_URL as string, - { - authCredentials: new weaviate.ApiKey(process.env.WEAVIATE_API_KEY as string), - headers: { - 'X-VoyageAI-Api-Key': process.env.VOYAGEAI_API_KEY as string, // Replace with your inference API key - } - } -) - -// TryFinallyCloseDemo - -// Instantiate your client (not shown). e.g.: -// client = weaviate.connectToWeaviateCloud(...) or -// client = weaviate.connectToLocal(...) - -try { - if (await client.isLive()) { - // Work with the client here - // ... - } -} finally { // This will always be executed, even if an exception is raised - client.close() // Close the connection & release resources -} -// END TryFinallyCloseDemo diff --git a/docs/academy/js/starter_multimodal_data/_snippets/102_collection.mts b/docs/academy/js/starter_multimodal_data/_snippets/102_collection.mts deleted file mode 100644 index 0cfe1d6b3..000000000 --- a/docs/academy/js/starter_multimodal_data/_snippets/102_collection.mts +++ /dev/null @@ -1,210 +0,0 @@ -import { promises as fs } from 'fs'; -import { join } from 'path'; -import AdmZip from 'adm-zip'; -import { fileURLToPath } from 'url'; -import { dirname } from 'path'; -import 'dotenv/config' - -// CreateMovieCollection // SubmoduleImport // BatchImportData -import weaviate from "weaviate-client"; -// END BatchImportData // END CreateMovieCollection // END SubmoduleImport -// CreateMovieCollection // SubmoduleImport -import { WeaviateClient, configure, vectorizer, toBase64FromMedia } from "weaviate-client"; -// END CreateMovieCollection // END SubmoduleImport - - -// BatchImportData -import { generateUuid5 } from "weaviate-client"; - -// END BatchImportData - -// BatchImportData -let client: WeaviateClient; -// CreateMovieCollection // END BatchImportData - -// 
END CreateMovieCollection - -const weaviateURL = process.env.WEAVIATE_URL as string; -const weaviateApikey = process.env.WEAVIATE_API_KEY as string; -const cohereApiKey = process.env.COHERE_API_KEY as string; -const voyageApiKey = process.env.VOYAGEAI_API_KEY as string; - - - -// client = await weaviate.connectToWeaviateCloud(weaviateURL, { -// authCredentials: new weaviate.ApiKey(weaviateApikey), -// } -// ) -// CreateMovieCollection -// Instantiate your client (not shown). e.g.: -// const requestHeaders = {'X-VoyageAI-Api-Key': process.env.VOYAGEAI_API_KEY as string,} -// client = weaviate.connectToWeaviateCloud(..., headers: requestHeaders) or -// client = weaviate.connectToLocal(..., headers: requestHeaders) - -// END CreateMovieCollection - -const requestHeaders = { 'X-VoyageAI-Api-Key': voyageApiKey, - 'X-Cohere-Api-Key': cohereApiKey - } - - -client = await weaviate.connectToWeaviateCloud(weaviateURL, { - authCredentials: new weaviate.ApiKey(weaviateApikey), - headers: requestHeaders -} -) -// END CreateMovieCollection - -// Actual instantiation - -client.collections.delete("Movie") - -// CreateMovieCollection -await client.collections.create({ - name: "Movie", - properties: [ - { name: "title", dataType: configure.dataType.TEXT }, - { name: "overview", dataType: configure.dataType.TEXT }, - { name: "vote_average", dataType: configure.dataType.NUMBER }, - { name: "genre_ids", dataType: configure.dataType.INT_ARRAY }, - { name: "release_date", dataType: configure.dataType.DATE }, - { name: "tmdb_id", dataType: configure.dataType.INT }, - { name: "poster", dataType: configure.dataType.BLOB } - ], - // Define the vectorizer module - vectorizers: vectorizer.multi2VecVoyageAI({ - imageFields: [{ name: "poster", weight: 0.9 }], - textFields: [{ name: "title", weight: 0.1 }], - model: "voyage-multimodal-3" - }), - // Define the generative module - generative: configure.generative.cohere(), - // END generativeDefinition // CreateMovieCollection -}) - -client.close() 
-// END CreateMovieCollection - -const weaviateURL = process.env.WEAVIATE_URL as string -const weaviateKey = process.env.WEAVIATE_API_KEY as string -const cohereKey = process.env.COHERE_API_KEY as string -const voyageaiKey = process.env.VOYAGEAI_API_KEY as string - -client = await weaviate.connectToWeaviateCloud(weaviateURL, { - authCredentials: new weaviate.ApiKey(weaviateKey), - headers: { - 'X-VoyageAI-Api-Key': voyageaiKey, // Replace with your inference API key - 'X-Cohere-Api-Key': cohereKey, // Replace with your inference API key - } -}) - -// BatchImportData - -// Instantiate your client (not shown). e.g.: -// client = weaviate.connectToWeaviateCloud(...) or -// client = weaviate.connectToLocal(...) - -// END BatchImportData - -// BatchImportData -const dataUrl = "https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024.json" -const textResponse = await fetch(dataUrl) -const data = await textResponse.json() - -// Get current file's directory -const __filename = fileURLToPath(import.meta.url); -const __dirname = dirname(__filename); -const imgDir = join(__dirname, "images"); - - -// Create directory if it doesn't exist -await fs.mkdir(imgDir, { recursive: true }); - -// Download images -const postersUrl = "https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024_posters.zip"; -const postersPath = join(imgDir, "movies_data_1990_2024_posters.zip"); - -const response = await fetch(postersUrl); -if (!response.ok) { - throw new Error(`HTTP error! 
status: ${response.status}`); -} -const arrayBuffer = await response.arrayBuffer(); -const buffer = Buffer.from(arrayBuffer); - -// Write the zip file -await fs.writeFile(postersPath, buffer); - -// Unzip the files -const zip = new AdmZip(postersPath); -zip.extractAllTo(imgDir, true); - -// Get the collection -const movies = client.collections.use("Movie") - -// Set a counter and initialize Weaviate Object -let itemsToInsert: Object[] = [] -let counter = 0; - -// Iterate through data -for (const key of Object.keys(data['title'])) { - - counter++; - if (counter % 20 == 0) - console.log(`Import: ${counter}`) - // END Iterate through data // END BatchImportData - // BatchImportData - - let genreIds: [] - - // Format genre_ids and release_date - const parsedArray = JSON.parse(data['genre_ids'][key]); - genreIds = parsedArray.map((item: string) => parseInt(item, 10)); - let releaseDate = new Date(data['release_date'][key]) - - const imgPath = join(imgDir, `${data['id'][key]}_poster.jpg`) - // Convert poster to base64 - const posterBase64 = await toBase64FromMedia(imgPath) - - // Build the object payload - let movieObject = { - title: data['title'][key], - overview: data['overview'][key], - vote_average: data['vote_average'][key], - genre_ids: genreIds, - release_date: releaseDate, - tmdb_id: data['id'][key], - poster: posterBase64 - } - // Insert - let objectToInsert = { - properties: movieObject, - uuid: generateUuid5(data['title'][key]) - } - - // Add object to batching array - itemsToInsert.push(objectToInsert) - - if (itemsToInsert.length == 20) { - try { - const response = await movies.data.insertMany(itemsToInsert); - // END Insert - // Handle Errors // Insert - if (response.hasErrors) { - throw new Error("Error in batch import!"); - } - // END Insert // END Handle Errors - // Insert - console.log(`Successfully imported batch of ${itemsToInsert.length} items`); - itemsToInsert = []; - } catch (error) { - console.error('Error importing batch:', error); - } - } - // 
END BatchImportData // END Insert - // BatchImportData // Iterate through data - // ... other operations -} - - -client.close() -// END BatchImportData diff --git a/docs/academy/js/starter_multimodal_data/_snippets/103_searches.mts b/docs/academy/js/starter_multimodal_data/_snippets/103_searches.mts deleted file mode 100644 index 19710f93d..000000000 --- a/docs/academy/js/starter_multimodal_data/_snippets/103_searches.mts +++ /dev/null @@ -1,154 +0,0 @@ -import 'dotenv/config' -// START-ANY -import weaviate, { WeaviateClient, WeaviateReturn } from "weaviate-client"; -let client: WeaviateClient; -let response: WeaviateReturn -// END-ANY - -client = await weaviate.connectToWeaviateCloud(process.env.WEAVIATE_URL as string,{ - authCredentials: new weaviate.ApiKey(process.env.WEAVIATE_API_KEY as string), - headers: { - 'X-VoyageAI-Api-Key': process.env.VOYAGEAI_API_KEY as string, // Replace with your inference API key - 'X-Cohere-Api-Key': process.env.COHERE_API_KEY as string, // Replace with your inference API key - - } - } -) - -// START-ANY - -// Instantiate your client (not shown). 
e.g.: -// const requestHeaders = {'X-VoyageAI-Api-Key': process.env.VOYAGEAI_API_KEY as string,} -// client = weaviate.connectToWeaviateCloud(..., headers: requestHeaders) or -// client = weaviate.connectToLocal(..., headers: requestHeaders) - -async function urlToBase64(imageUrl: string) { - const response = await fetch(imageUrl); - const arrayBuffer = await response.arrayBuffer(); - const content = Buffer.from(arrayBuffer); - return content.toString('base64'); -} -// END-ANY - -// MetadataMultimodalSearch // SinglePromptGeneration // MetadataSemanticSearch // MetadataBM25Search // MetadataHybridSearch // FilteredSemanticSearch - -// Get the collection -const movies = client.collections.use("Movie") - -// Perform query -const srcImgPath = "https://github.com/weaviate-tutorials/edu-datasets/blob/main/img/International_Space_Station_after_undocking_of_STS-132.jpg?raw=true" -const queryB64 = await urlToBase64(srcImgPath) -// END MetadataMultimodalSearch // END SinglePromptGeneration // END MetadataSemanticSearch // END MetadataBM25Search // END MetadataHybridSearch // END FilteredSemanticSearch - -// MetadataMultimodalSearch - -response = await movies.query.nearImage(queryB64, { - limit: 5, - returnMetadata: ['distance'], - returnProperties: ["title", "tmdb_id", "release_date", "poster"] - }, -) - -// Inspect the response -for (let item of response.objects) { - // Print the title and release year (note the release date is a datetime object) - console.log(`${item.properties.title} - ${item.properties.release_date}`) - // Print the distance of the object from the query - console.log(`Distance to query: ${item.metadata?.distance}`) -} - -client.close() -// END MetadataMultimodalSearch - -console.log("\n\n") - - -// MetadataSemanticSearch -response = await movies.query.nearText("red", { - limit: 5, - returnMetadata: ['distance'], - returnProperties: ["title", "tmdb_id", "release_date"] - }, -) - -// Inspect the response -for (let item of response.objects) { - // Print 
the title and release year (note the release date is a datetime object) - console.log(`${item.properties.title} - ${item.properties.release_date}`) - // Print the distance of the object from the query - console.log(`Distance to query: ${item.metadata?.distance}`) -} - -client.close() -// END MetadataSemanticSearch - -// MetadataBM25Search - -response = await movies.query.bm25("history", { - limit: 5, - returnMetadata: ['score'], -}, -) - -// Inspect the response -for (let item of response.objects) { - // Print the title and release year (note the release date is a datetime object) - console.log(`${item.properties.title} - ${item.properties.release_date}`) - // Print the distance of the object from the query - console.log(`BM25 score: ${item.metadata?.score}`) -} - -client.close() -// END MetadataBM25Search - - -console.log("\n\n") - -// client.connect() - -// MetadataHybridSearch - -response = await movies.query.hybrid("history", { - limit: 5, - returnMetadata: ['score'], - returnProperties: ["title", "tmdb_id", "release_date"] -}, -) - -// Inspect the response -for (let item of response.objects) { - // Print the title and release year (note the release date is a datetime object) - console.log(`${item.properties.title} - ${item.properties.release_date}`) - // Print the hybrid score of the object from the query - console.log(`Hybrid score: ${item.metadata?.score}`) -} - -client.close() -// END MetadataHybridSearch - - -console.log("\n\n") - -// client.connect() - -// FilteredSemanticSearch - -const filterTime = new Date(2010, 1, 1) - -response = await movies.query.nearText("dystopian future", { - limit: 5, - returnMetadata: ['distance'], - filters: movies.filter.byProperty("release_date").greaterThan(filterTime) - } -) - -// Inspect the response -for (let item of response.objects) { - // Print the title and release year (note the release date is a datetime object) - console.log(`${item.properties.title} - ${item.properties.release_date}`) - // Print the distance of 
the object from the query - console.log(`Distance to query: ${item.metadata?.distance}`) -} - -client.close() -// END FilteredSemanticSearch diff --git a/docs/academy/js/starter_multimodal_data/_snippets/104_rag.mts b/docs/academy/js/starter_multimodal_data/_snippets/104_rag.mts deleted file mode 100644 index a48450725..000000000 --- a/docs/academy/js/starter_multimodal_data/_snippets/104_rag.mts +++ /dev/null @@ -1,88 +0,0 @@ -import 'dotenv/config' -// START-ANY -import weaviate, { GenerativeReturn, WeaviateClient } from "weaviate-client"; -let client: WeaviateClient; -let response: GenerativeReturn -// END-ANY - -client = await weaviate.connectToWeaviateCloud( - process.env.WEAVIATE_URL as string, - { - authCredentials: new weaviate.ApiKey(process.env.WEAVIATE_API_KEY as string), - headers: { - 'X-VoyageAI-Api-Key': process.env.VOYAGEAI_API_KEY as string, // Replace with your inference API key - } - } -) - -// START-ANY - -// Instantiate your client (not shown). e.g.: -// const requestHeaders = {'X-VoyageAI-Api-Key': process.env.VOYAGEAI_API_KEY as string,} -// client = weaviate.connectToWeaviateCloud(..., headers: requestHeaders) or -// client = weaviate.connectToLocal(..., headers: requestHeaders) - -async function urlToBase64(imageUrl: string) { - const response = await fetch(imageUrl); - const arrayBuffer = await response.arrayBuffer(); - const content = Buffer.from(arrayBuffer); - return content.toString('base64'); -} - -// END-ANY - -// SinglePromptGeneration // GroupedTaskGeneration -// Get the collection -const movies = client.collections.use("Movie") - -// Perform query -const srcImgPath = "https://github.com/weaviate-tutorials/edu-datasets/blob/main/img/International_Space_Station_after_undocking_of_STS-132.jpg?raw=true" -const queryB64 = await urlToBase64(srcImgPath) -// END SinglePromptGeneration // END GroupedTaskGeneration - -// SinglePromptGeneration - -response = await movies.generate.nearMedia(queryB64, "image",{ - // highlight-start - 
singlePrompt: "Translate this into French: {title}" - // highlight-end - }, { - limit: 5 -}) - -// Inspect the response -for (let item of response.objects) { - console.log(item.properties.title) - console.log(item.generated) -} - -client.close() -// END SinglePromptGeneration - - -console.log("\n\n") - - -// GroupedTaskGeneration - -response = await movies.generate.nearMedia(queryB64, "image",{ - // highlight-start - groupedTask: "What do these movies have in common?", - groupedProperties: ["title", "overview"] // Optional parameter; for reducing prompt length - // highlight-end - },{ - limit: 5 - } -) - -// Inspect the response -for (let item of response.objects) { - console.log('Title: ', item.properties.title) // Print the title -} - -// highlight-start -console.log(response.generated) // Print the generated text (the commonalities between them) -// highlight-end - -client.close() -// END GroupedTaskGeneration diff --git a/docs/academy/js/starter_multimodal_data/index.md b/docs/academy/js/starter_multimodal_data/index.md deleted file mode 100644 index 77910e550..000000000 --- a/docs/academy/js/starter_multimodal_data/index.md +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: "101M Work with: Multimodal data" -description: Learn to handle multimodal data in Weaviate for diverse data integrations. -sidebar_position: 102 ---- - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; -import CourseUnits from '/src/components/Academy/courseUnits.jsx'; -import { courseData } from '/src/components/Academy/courseData.js' - -## Course overview - -In this project-based course, you will learn how to work with multimodal data using Weaviate and a movie dataset. - -You will get hands-on experience on how to store and index text and image data to be searchable together by meaning, using Weaviate's vectorization capabilities. You will learn how to search through that data using multimodal search methods, as well as filters. 
You will also learn how to use Weaviate's retrieval augmented generation (RAG) capabilities to generate outputs based on the retrieved objects. - -## Learning objectives - - - -## Units - - - diff --git a/docs/academy/js/starter_text_data/101_setup_weaviate/10_client.mdx b/docs/academy/js/starter_text_data/101_setup_weaviate/10_client.mdx deleted file mode 100644 index 6dae25154..000000000 --- a/docs/academy/js/starter_text_data/101_setup_weaviate/10_client.mdx +++ /dev/null @@ -1,36 +0,0 @@ ---- -title: Weaviate JavaScript/Typescript client -description: "Configure the Weaviate JavaScript client for efficient data access and management." ---- - -## Installation - -The latest Weaviate TypeScript client library can be installed using npm. The client library is tested on Node v18 and later. Install it using the following command: - -```bash -npm install weaviate-client -``` - -The latest major version is `v3` (e.g. `3.x.x`). You can check the version like so: - -```bash -npm view weaviate-client version -``` - -## Basic usage - -You can import the Weaviate client library like so: - -```typescript -import weaviate, { generateUuid5, ApiKey } from "weaviate-client" -``` - -The client provides sets of helper functions (e.g. `generateUuid5, ApiKey`) to make it easier to interact with Weaviate. - -Next, we'll show you how create a Weaviate instance and connect to it. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_text_data/101_setup_weaviate/20_create_instance/10_create_wcs.mdx b/docs/academy/js/starter_text_data/101_setup_weaviate/20_create_instance/10_create_wcs.mdx deleted file mode 100644 index 4ffe0e09e..000000000 --- a/docs/academy/js/starter_text_data/101_setup_weaviate/20_create_instance/10_create_wcs.mdx +++ /dev/null @@ -1,67 +0,0 @@ ---- -title: "Option 1: A cloud WCD instance" -description: "Create a Weaviate instance on WCS for scalable, cloud-based data projects." 
---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../../_snippets/101_connect.ts'; - -Here, you will create a Weaviate Cloud (WCD) instance. WCD is a fully managed Weaviate instance that runs in the cloud. It's a great way to get started with Weaviate, as it requires no installation or maintenance. - -### Log in to the WCD Console - -Go to the [WCD Console](https://console.weaviate.cloud/) and log in with your credentials. If you don't have an account yet, you can sign up by clicking on the Register here link from the login screen. - -### Create a Weaviate instance - -From the console, go to the Dashboard and click on the Create cluster button. From the following screen: - -- Select the "Free sandbox" tab -- Provide a cluster name -- Set "Enable authentication" to "Yes" - -Click on the Create button to create your Weaviate instance. The process will take a few minutes. - -### Retrieve your Weaviate instance details - -Once the instance is created, you will be able see its details by clicking on the Details button. Find the cluster URL and the API key. - -You will need these details to connect to your Weaviate instance. - -### Connect to your WCD instance - -To connect to the Weaviate Cloud (WCD) instance, you need to use the cluster URL and the API key. You can find these details in the WCD Console. - -Use the `connectToWeaviateCloud()` function to connect to your WCD instance. - - - -#### Provide inference API keys - -Some Weaviate modules can use inference APIs for vectorizing data or large language model integration. You can provide the API keys for these services to Weaviate at instantiation. - -This course uses OpenAI, so you can provide the OpenAI API key to Weaviate through `headers: {"X-OpenAI-Api-Key": }` as shown below: - - - -:::note What next? 
-If you have completed this, you can skip the next page [Option 2: A local Weaviate instance](./20_create_docker.mdx) and continue with [Communicate with Weaviate](../30_communicate.mdx). -::: - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_text_data/101_setup_weaviate/20_create_instance/20_create_docker.mdx b/docs/academy/js/starter_text_data/101_setup_weaviate/20_create_instance/20_create_docker.mdx deleted file mode 100644 index 9ff2ee84d..000000000 --- a/docs/academy/js/starter_text_data/101_setup_weaviate/20_create_instance/20_create_docker.mdx +++ /dev/null @@ -1,90 +0,0 @@ ---- -title: "Option 2: A local Docker instance" -description: "Set up Weaviate with Docker for a quick and customizable local deployment." ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../../_snippets/101_connect.ts'; - -:::note Have you already created a Weaviate instance? -If you have [created a cloud instance](./10_create_wcs.mdx) of Weaviate, you can skip this page and continue with [Communicate with Weaviate](../30_communicate.mdx). -::: - -Here, you will create a Weaviate instance using Docker. - -### Download and run the docker-compose file - -Install Docker on your machine. We recommend following the [official Docker installation guide](https://docs.docker.com/get-docker/). - -Create a new directory and navigate to it in your terminal. 
Then, create a new file called `docker-compose.yml` and add the following content: - -```yaml ---- -version: '3.4' -services: - weaviate_anon: - command: - - --host - - 0.0.0.0 - - --port - - '8080' - - --scheme - - http - image: cr.weaviate.io/semitechnologies/weaviate:||site.weaviate_version|| - ports: - - 8080:8080 - - 50051:50051 - restart: on-failure:0 - environment: - QUERY_DEFAULTS_LIMIT: 25 - AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' - PERSISTENCE_DATA_PATH: '/var/lib/weaviate' - ENABLE_API_BASED_MODULES: 'true' - BACKUP_FILESYSTEM_PATH: '/var/lib/weaviate/backups' - CLUSTER_HOSTNAME: 'node1' -... -``` - -### Create a Weaviate instance - -Run the following command to start Weaviate: - -```bash -docker compose up -``` - -### Your Weaviate instance details - -Once the instance is created, you can access it at `http://localhost:8080`. - -### Connect to your Weaviate instance - -To connect to the Weaviate instance, use the `connectToLocal()` function. - - - -#### Provide inference API keys - -Some Weaviate modules can use inference APIs for vectorizing data or large language model integration. You can provide the API keys for these services to Weaviate at instantiation. - -This course uses OpenAI, so you can provide the OpenAI API key to Weaviate through `headers: {"X-OpenAI-Api-Key": }` as shown below: - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_text_data/101_setup_weaviate/20_create_instance/index.mdx b/docs/academy/js/starter_text_data/101_setup_weaviate/20_create_instance/index.mdx deleted file mode 100644 index 53c348780..000000000 --- a/docs/academy/js/starter_text_data/101_setup_weaviate/20_create_instance/index.mdx +++ /dev/null @@ -1,19 +0,0 @@ ---- -title: Create a Weaviate instance -description: "Create a new Weaviate instance tailored for text data applications." 
---- - -For this unit, you can choose to create a Weaviate Cloud (WCD) instance or a local Docker instance. - -- [Create a Weaviate Cloud (WCD) instance](./10_create_wcs.mdx) - - If you want a managed service and don't want to worry about installation and maintenance. -- [Create a local Docker instance](./20_create_docker.mdx) - - If you want to run Weaviate on your local machine, or want to have full control over the installation and maintenance. - -Either option is fine for this course. If you're not sure which to choose, we recommend starting with a WCD instance. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_text_data/101_setup_weaviate/30_communicate.mdx b/docs/academy/js/starter_text_data/101_setup_weaviate/30_communicate.mdx deleted file mode 100644 index bb5f23df0..000000000 --- a/docs/academy/js/starter_text_data/101_setup_weaviate/30_communicate.mdx +++ /dev/null @@ -1,64 +0,0 @@ ---- -title: Communicate with Weaviate ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../_snippets/101_connect.ts'; - -Here, we'll perform basic operations to communicate with Weaviate using the TypeScript client library. - -### Check Weaviate status - -You can check whether the Weaviate instance is up using the `isLive` function. - - - -### Retrieve server meta information - -You can retrieve meta information about the Weaviate instance using the `getMeta` function. - - - -This will print the server meta information to the console. The output will look similar to the following: - -
- Example getMeta() output - - -
- -### Close the connection - -After you have finished using the Weaviate client, you should close the connection. This frees up resources and ensures that the connection is properly closed. - -We suggest using a `try`-`finally` block as a best practice. For brevity, we will not include the `try`-`finally` blocks in the remaining code snippets. - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_text_data/101_setup_weaviate/index.mdx b/docs/academy/js/starter_text_data/101_setup_weaviate/index.mdx deleted file mode 100644 index 95c249881..000000000 --- a/docs/academy/js/starter_text_data/101_setup_weaviate/index.mdx +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: Set up Weaviate -description: "Learn to set up Weaviate for text data projects, including environment setup and configuration." ---- - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_text_data/102_text_collections/10_preparation.mdx b/docs/academy/js/starter_text_data/102_text_collections/10_preparation.mdx deleted file mode 100644 index d02142077..000000000 --- a/docs/academy/js/starter_text_data/102_text_collections/10_preparation.mdx +++ /dev/null @@ -1,41 +0,0 @@ ---- -title: Preparation -description: "Prepare text data for optimized storage and retrieval in Weaviate." ---- - -In this section you are going to populate your Weaviate instance with a movie dataset, using the OpenAI API to embed the text data. - -### Weaviate instance - -Make sure to have your Weaviate instance set up. You should have [created an instance](../101_setup_weaviate/20_create_instance/index.mdx) and be able to connect to it. - -### OpenAI key - -You will need an OpenAI API key to follow along. 
If you don't have one, go to the [OpenAI website](https://openai.com/) and sign up for an account and create an API key. - - - -### Source data - -We are going to use a movie dataset sourced from [TMDB](https://www.themoviedb.org/). The dataset can be found in this [GitHub repository](https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024.json), and it contains bibliographic information on ~700 movies released between 1990 and 2024. - -
- See sample data - -| | backdrop_path | genre_ids | id | original_language | original_title | overview | popularity | poster_path | release_date | title | video | vote_average | vote_count | -|---:|:---------------------------------|:----------------|-----:|:--------------------|:----------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------:|:---------------------------------|:---------------|:----------------------------|:--------|---------------:|-------------:| -| 0 | /3Nn5BOM1EVw1IYrv6MsbOS6N1Ol.jpg | [14, 18, 10749] | 162 | en | Edward Scissorhands | A small suburban town receives a visit from a castaway unfinished science experiment named Edward. | 45.694 | /1RFIbuW9Z3eN9Oxw2KaQG5DfLmD.jpg | 1990-12-07 | Edward Scissorhands | False | 7.7 | 12305 | -| 1 | /sw7mordbZxgITU877yTpZCud90M.jpg | [18, 80] | 769 | en | GoodFellas | The true story of Henry Hill, a half-Irish, half-Sicilian Brooklyn kid who is adopted by neighbourhood gangsters at an early age and climbs the ranks of a Mafia family under the guidance of Jimmy Conway. | 57.228 | /aKuFiU82s5ISJpGZp7YkIr3kCUd.jpg | 1990-09-12 | GoodFellas | False | 8.5 | 12106 | -| 2 | /6uLhSLXzB1ooJ3522ydrBZ2Hh0W.jpg | [35, 10751] | 771 | en | Home Alone | Eight-year-old Kevin McCallister makes the most of the situation after his family unwittingly leaves him behind when they go on Christmas vacation. But when a pair of bungling burglars set their sights on Kevin's house, the plucky kid stands ready to defend his territory. 
By planting booby traps galore, adorably mischievous Kevin stands his ground as his frantic mother attempts to race home before Christmas Day. | 3.538 | /onTSipZ8R3bliBdKfPtsDuHTdlL.jpg | 1990-11-16 | Home Alone | False | 7.4 | 10599 | -| 3 | /vKp3NvqBkcjHkCHSGi6EbcP7g4J.jpg | [12, 35, 878] | 196 | en | Back to the Future Part III | The final installment of the Back to the Future trilogy finds Marty digging the trusty DeLorean out of a mineshaft and looking for Doc in the Wild West of 1885. But when their time machine breaks down, the travelers are stranded in a land of spurs. More problems arise when Doc falls for pretty schoolteacher Clara Clayton, and Marty tangles with Buford Tannen. | 28.896 | /crzoVQnMzIrRfHtQw0tLBirNfVg.jpg | 1990-05-25 | Back to the Future Part III | False | 7.5 | 9918 | -| 4 | /3tuWpnCTe14zZZPt6sI1W9ByOXx.jpg | [35, 10749] | 114 | en | Pretty Woman | When a millionaire wheeler-dealer enters a business contract with a Hollywood hooker Vivian Ward, he loses his heart in the bargain. | 97.953 | /hVHUfT801LQATGd26VPzhorIYza.jpg | 1990-03-23 | Pretty Woman | False | 7.5 | 7671 | - -
- -Next, you will create a corresponding object collection and import the data. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_text_data/102_text_collections/20_create_collection.mdx b/docs/academy/js/starter_text_data/102_text_collections/20_create_collection.mdx deleted file mode 100644 index 638ba6a62..000000000 --- a/docs/academy/js/starter_text_data/102_text_collections/20_create_collection.mdx +++ /dev/null @@ -1,86 +0,0 @@ ---- -title: Create a collection -description: "Create a text collection in Weaviate to store and manage structured text data." ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../_snippets/102_collection.ts'; - -Weaviate stores data in "collections". A collection is a set of objects that share the same data structure. In our movie database, we might have a collection of movies, a collection of actors, and a collection of reviews. - -Here we will create a collection of movies. - -## Code - -This example creates a collection for the movie data: - - - -Each collection definition must have a name. Then, you can define additional parameters like we've done in this example. - -## Explain the code - -### Properties - -Properties are the object attributes that you want to store in the collection. Each property has a name and a data type. - -In our movie database, we have properties like `title`, `release_date` and `genre_ids`, with data types like `TEXT` (string), `DATE` (date), or `INT` (integer). It's also possible to have arrays of integers, like we have with `genre_ids`. - -#### Auto-schema - -Weaviate can automatically [infer the schema](/weaviate/config-refs/collections.mdx#auto-schema) from the data. However, it's a good practice to define the properties explicitly, for better control and to avoid surprises. 
- -### Vectorizer configuration - -If you do not specify the vector yourself, Weaviate will use a specified vectorizer to generate vector embeddings from your data. - -In this code example, we specify the `text2vec-openai` module with default options. - - - -### Generative configuration - -If you wish to use your collection with a generative model (e.g. a large language model), you must specify the generative module. - -In this code example, we specify the `openai` module (`generative-openai` is the full name) with default options. - - - -import MutableGenerativeConfig from '/_includes/mutable-generative-config.md'; - - - -### TypeScript named imports - -The code example makes use of named imports such as `vectorizer` and `configure`. They are defined in the `weaviate` module and are used during the collection definition. - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_text_data/102_text_collections/30_import_data.mdx b/docs/academy/js/starter_text_data/102_text_collections/30_import_data.mdx deleted file mode 100644 index 638e21b4c..000000000 --- a/docs/academy/js/starter_text_data/102_text_collections/30_import_data.mdx +++ /dev/null @@ -1,94 +0,0 @@ ---- -title: Import data -description: "Import data into Weaviate text collections to streamline data handling." ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../_snippets/102_collection.ts'; - -## Code - -This example imports the movie data into our collection. - - - -The code: -- Loads the source data & gets the collection -- Loops through the data and adds objects to the batcher -- Prints out any import errors - -## Explain the code - -### Preparation - -We use the requests library to load the data from the source, in this case a JSON file. 
- -Then, we create a collection object (with `client.collections.get`) so we can interact with the collection. - -### Iterating over data - -The `for` loop is used in conjunction with `Object.keys()` to iterate through the elements in our JSON file. While iterating we increment the counter variable that lets us bulk insert objects in batches. - - - - -### Add data to the Object - -#### Convert data types and build the Object - -The data is converted from a string to the correct data types for Weaviate. For example, the `release_date` is converted to a `Date` object, and the `genre_ids` are converted to a list of integers. - - - -After converting data to the correct format, we build the object by its properties preparing it to be inserted into Weaviate. - -#### Bulk insert data - -Then we create on object that includes the uuid generated with `generateUuid5` from Weaviate and the object containing properties we previously define, we push this object to `itemsToInsert` for them to be bulk inserted with `insertMany()` once the batch is ready.. - - - -### Error handling - -If you have any errors in your bulk insertion, you want to know something went wrong. That way you can decide how to handle them, such as by raising an exception. In this example, we simply print out that there was an error with the import. - - - -## Where do the vectors come from? - -When the batcher sends the queue to Weaviate, the objects are added to the collection. In our case, the movie collection. - -Recall that the collection has a vectorizer module, and we do not specify vectors here. So Weaviate uses the specified vectorizer to generate vector embeddings from the data. 
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_text_data/102_text_collections/index.mdx b/docs/academy/js/starter_text_data/102_text_collections/index.mdx deleted file mode 100644 index 7aba5e6c8..000000000 --- a/docs/academy/js/starter_text_data/102_text_collections/index.mdx +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: Populate the database -description: "Learn to build text collections in Weaviate to organize and search data." ---- - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_text_data/103_text_searches/10_semantic.mdx b/docs/academy/js/starter_text_data/103_text_searches/10_semantic.mdx deleted file mode 100644 index a01e76088..000000000 --- a/docs/academy/js/starter_text_data/103_text_searches/10_semantic.mdx +++ /dev/null @@ -1,67 +0,0 @@ ---- -title: Semantic search -description: "Use semantic search in Weaviate for accurate and relevant text results." ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../_snippets/103_searches.ts'; - -With Weaviate, you can perform semantic searches to find similar items based on their meaning. This is done by comparing the vector embeddings of the items in the database. - -### Code - -This example finds entries in "Movie" based on their similarity to the query "dystopian future", and prints out the title and release year of the top 5 matches. - - - -## Explain the code - -The results are based on similarity of the vector embeddings between the query and the database object text. In this case, the embeddings are generated by the vectorizer module. 
- -The `limit` parameter here sets the maximum number of results to return. - -The `returnMetadata` parameter takes an array of strings to set metadata to return in the search results. The current query returns the vector distance to the query. - -
- Example results - -```text -In Time 2011 -Distance to query: 0.179 - -Gattaca 1997 -Distance to query: 0.180 - -I, Robot 2004 -Distance to query: 0.182 - -Mad Max: Fury Road 2015 -Distance to query: 0.190 - -The Maze Runner 2014 -Distance to query: 0.193 -``` - -
- -### Response object - -The returned object is an instance of a custom class. Its `objects` attribute is a list of search results, each object being an instance of another custom class. - -Each returned object will: -- Include all properties and its UUID by default except those with blob data types. -- Not include any other information (e.g. references, metadata, vectors.) by default. - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_text_data/103_text_searches/20_keyword_hybrid.mdx b/docs/academy/js/starter_text_data/103_text_searches/20_keyword_hybrid.mdx deleted file mode 100644 index c441bb164..000000000 --- a/docs/academy/js/starter_text_data/103_text_searches/20_keyword_hybrid.mdx +++ /dev/null @@ -1,105 +0,0 @@ ---- -title: Keyword & Hybrid search -description: "Combine keyword and hybrid searches for efficient data retrieval in Weaviate." ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../_snippets/103_searches.ts'; - -You can also perform keyword (BM25) searches to find items based on their keyword similarity, or hybrid searches that combine BM25 and semantic/vector searches. - -## Keyword search - -### Code - -This example finds entries in "Movie" with the highest keyword search scores for the term "history", and prints out the title and release year of the top 5 matches. - - - -### Explain the code - -The results are based on a keyword search score using what's called the [BM25f](https://en.wikipedia.org/wiki/Okapi_BM25) algorithm. - -The `limit` parameter here sets the maximum number of results to return. - -The `returnMetadata` parameter takes an array of strings to set metadata to return in the search results. The current query returns the `score`, which is the BM25 score of the result. - -
- Example results - -```text -American History X 1998 -BM25 score: 2.707 - -A Beautiful Mind 2001 -BM25 score: 1.896 - -Legends of the Fall 1994 -BM25 score: 1.663 - -Hacksaw Ridge 2016 -BM25 score: 1.554 - -Night at the Museum 2006 -BM25 score: 1.529 -``` - -
- - -## Hybrid search - -### Code - -This example finds entries in "Movie" with the highest hybrid search scores for the term "history", and prints out the title and release year of the top 5 matches. - - - -### Explain the code - -The results are based on a hybrid search score. A hybrid search blends results of BM25 and semantic/vector searches. - -The `limit` parameter here sets the maximum number of results to return. - -The `returnMetadata` parameter takes an array of strings to set metadata to return in the search results. The current query returns the `score`, which is the hybrid score of the result. - -
- Example results - -```text -Legends of the Fall 1994 -Hybrid score: 0.016 - -Hacksaw Ridge 2016 -Hybrid score: 0.016 - -A Beautiful Mind 2001 -Hybrid score: 0.015 - -The Butterfly Effect 2004 -Hybrid score: 0.015 - -Night at the Museum 2006 -Hybrid score: 0.012 -``` - -
- - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_text_data/103_text_searches/30_filters.mdx b/docs/academy/js/starter_text_data/103_text_searches/30_filters.mdx deleted file mode 100644 index 3b5f72155..000000000 --- a/docs/academy/js/starter_text_data/103_text_searches/30_filters.mdx +++ /dev/null @@ -1,55 +0,0 @@ ---- -title: Filters -description: "Apply filters to Weaviate text searches to narrow down query results." ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../_snippets/103_searches.ts'; - -Filters can be used to precisely refine search results. You can filter by properties as well as metadata, and you can combine multiple filters with `and` or `or` conditions to further narrow down the results. - -### Code - -This example finds entries in "Movie" based on their similarity to the query "dystopian future", only from those released after 2020. It prints out the title and release year of the top 5 matches. - - - -## Explain the code - -This query is identical to [that shown earlier](./10_semantic.mdx) for semantic search, but with the addition of a filter. The `filters` parameter makes use of the `filter` namespace to set the filter conditions. The current query filters the results to only include those with a release year after 2010. - -
- Example results - -```text -Dune 2021 -Distance to query: 0.199 - -Tenet 2020 -Distance to query: 0.200 - -Mission: Impossible - Dead Reckoning Part One 2023 -Distance to query: 0.207 - -Onward 2020 -Distance to query: 0.214 - -Jurassic World Dominion 2022 -Distance to query: 0.216 -``` - -
- - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_text_data/103_text_searches/index.mdx b/docs/academy/js/starter_text_data/103_text_searches/index.mdx deleted file mode 100644 index 999ee92b1..000000000 --- a/docs/academy/js/starter_text_data/103_text_searches/index.mdx +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: Perform searches -description: "Explore text search capabilities in Weaviate for JavaScript applications." ---- - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_text_data/104_text_rag/10_setup.mdx b/docs/academy/js/starter_text_data/104_text_rag/10_setup.mdx deleted file mode 100644 index e6e2bd92f..000000000 --- a/docs/academy/js/starter_text_data/104_text_rag/10_setup.mdx +++ /dev/null @@ -1,40 +0,0 @@ ---- -title: "RAG: Overview" -description: "Set up text RAG in Weaviate for reliable question answering." ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../_snippets/102_collection.ts'; - -### Motivation - -Retrieval augmented generation (RAG) is a way to combine the best of both worlds: the retrieval capabilities of semantic search and the generation capabilities of AI models such as large language models. This allows you to retrieve objects from a Weaviate instance and then generate outputs based on the retrieved objects. - -### Setup - -When we created a collection, we specified the `generative` parameter as shown here. - - - -This selects a generative module that will be used to generate outputs based on the retrieved objects. In this case, we're using the `openai` module, and the `GPT` family of large language models. 
- -As we did before with the vectorizer module, you will require an API key from the provider of the generative module. In this case, you will need an API key from OpenAI. - -### RAG queries - -RAG queries are also called 'generative' queries in Weaviate. You can access these functions through the `generate` submodule of the collection object. - -Each generative query works in addition to the regular search query, and will perform a RAG query on each retrieved object. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_text_data/104_text_rag/20_single_prompt.mdx b/docs/academy/js/starter_text_data/104_text_rag/20_single_prompt.mdx deleted file mode 100644 index bf8894647..000000000 --- a/docs/academy/js/starter_text_data/104_text_rag/20_single_prompt.mdx +++ /dev/null @@ -1,57 +0,0 @@ ---- -title: "'Single prompt' generation" -description: "Use single prompts with text RAG in Weaviate for direct question answering." ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../_snippets/104_rag.ts'; - -A 'single prompt' generation will perform RAG queries on each retrieved object. This is useful when you want to transform each object separately, with the same prompt. - -### Code - -This example finds entries in "Movie" whose vector best matches the query vector (for "dystopian future"). Then, instructs the large language model to translate the title of each movie into French. - -Each of the results are then printed out to the console. - - - -## Explain the code - -You must pass on one or more properties to the `singlePrompt` parameter through braces, as we've done here with `"... {title} ..."`. This will instruct Weaviate to pass on the `title` property from each retrieved object to the large language model. - -
- Example results - -```text -In Time -À temps -Looper -Boucleur -I, Robot -Je, Robot -The Matrix -La Matrice -Children of Men -Les enfants des hommes -``` - -
- -### Response object - -Each response object is similar to that from a regular search query, with an additional `generated` attribute. This attribute will contain the generated output for each object. - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_text_data/104_text_rag/30_grouped_task.mdx b/docs/academy/js/starter_text_data/104_text_rag/30_grouped_task.mdx deleted file mode 100644 index 460ad99a0..000000000 --- a/docs/academy/js/starter_text_data/104_text_rag/30_grouped_task.mdx +++ /dev/null @@ -1,60 +0,0 @@ ---- -title: "'Grouped task' generation" -description: "Group tasks for text RAG to refine response generation in Weaviate." ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import TSCode from '!!raw-loader!../_snippets/104_rag.ts'; - -A 'grouped task' generation will perform RAG queries on the set of retrieved objects. This is useful when you want to transform the set of objects as a whole, with one prompt. - -### Code - -This example finds entries in "Movie" whose vector best matches the query vector (for "dystopian future"). Then, instructs the large language model to find commonalities between them. - -Each of the results are then printed out to the console. - - - -## Explain the code - -For `groupedTask` queries, you simply pass on the prompt to the `groupedTask` parameter. This will instruct Weaviate to pass on the: -- text properties from all retrieved objects, and -- the prompt - -to the large language model. - -
- Example results - -```text -In Time -Looper -I, Robot -The Matrix -Children of Men -These movies all involve futuristic settings and explore themes related to the manipulation of time, technology, and the potential consequences of advancements in society. They also touch on issues such as inequality, control, and the impact of human actions on the future of humanity. -``` - -
- -### Optional parameters - -You can also pass on a list of properties to be used, as the `groupedProperties` parameter. This can be useful to reduce the amount of data passed on to the large language model and omit irrelevant properties. - -### Response object - -A RAG query with the `groupedTask` parameter will return a response with an additional `generated` attribute. This attribute will contain the generated output for the set of objects. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_text_data/104_text_rag/index.mdx b/docs/academy/js/starter_text_data/104_text_rag/index.mdx deleted file mode 100644 index b76a4da32..000000000 --- a/docs/academy/js/starter_text_data/104_text_rag/index.mdx +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: LLMs and Weaviate (RAG) -description: "Implement text RAG in Weaviate to generate responses based on retrieved data." ---- - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/js/starter_text_data/900_next_steps.mdx b/docs/academy/js/starter_text_data/900_next_steps.mdx deleted file mode 100644 index 0ae070810..000000000 --- a/docs/academy/js/starter_text_data/900_next_steps.mdx +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: Next steps ---- - -import IntroNextSteps from '../_snippets/intro_next_steps_js.mdx'; - - diff --git a/docs/academy/js/starter_text_data/_snippets/101_connect.ts b/docs/academy/js/starter_text_data/_snippets/101_connect.ts deleted file mode 100644 index f95a2d57c..000000000 --- a/docs/academy/js/starter_text_data/_snippets/101_connect.ts +++ /dev/null @@ -1,172 +0,0 @@ -// WCDInstantiation // WCDAPIKeyInstantiation // DockerInstantiation // DockerAPIKeyInstantiation // TryFinallyCloseDemo -import weaviate, { WeaviateClient } from "weaviate-client"; -let 
client: WeaviateClient; -// END WCDInstantiation // END WCDAPIKeyInstantiation // END DockerInstantiation // END DockerAPIKeyInstantiation // END TryFinallyCloseDemo - - -// WCDInstantiation - -client = await weaviate.connectToWeaviateCloud( - process.env.WEAVIATE_URL as string,{ - authCredentials: new weaviate.ApiKey(process.env.WEAVIATE_API_KEY as string), - } -) - -// END WCDInstantiation - -client.close() - -// WCDAPIKeyInstantiation - -client = await weaviate.connectToWeaviateCloud( - process.env.WEAVIATE_URL as string,{ - authCredentials: new weaviate.ApiKey(process.env.WEAVIATE_API_KEY as string), - headers: { - 'X-OpenAI-Api-Key': process.env.OPENAI_APIKEY as string, // Replace with your inference API key - } - } -) -// END WCDAPIKeyInstantiation - -client.close() - -// DockerInstantiation - -client = await weaviate.connectToLocal() -// END DockerInstantiation - -client.close() - -// DockerAPIKeyInstantiation - -client = await weaviate.connectToLocal({ - host: '...', - headers: { - 'X-OpenAI-Api-Key': process.env.OPENAI_APIKEY as string, // Replace with your inference API key -}}) -// END DockerAPIKeyInstantiation - -// PollLiveness -if (await client.isLive()) { - // Run operations with the client if it is live -} -// END PollLiveness - - -// GetMeta -console.log(await client.getMeta()) -// END GetMeta - - -const outputString = -// OutputGetMeta -{ - hostname: 'http://[::]:8080', - modules: { - 'backup-gcs': { - bucketName: 'weaviate-wcs-prod-cust-europe-west3-workloads-backups', - rootName: '55a78146-dae1-4609-90ce-556db01f4a61' - }, - 'generative-anyscale': { - documentationHref: 'https://docs.anyscale.com/endpoints/overview', - name: 'Generative Search - Anyscale' - }, - 'generative-aws': { - documentationHref: 'https://docs.aws.amazon.com/bedrock/latest/APIReference/welcome.html', - name: 'Generative Search - AWS' - }, - 'generative-cohere': { - documentationHref: 'https://docs.cohere.com/reference/chat', - name: 'Generative Search - Cohere' - }, - 
'generative-mistral': { - documentationHref: 'https://docs.mistral.ai/api/', - name: 'Generative Search - Mistral' - }, - 'generative-openai': { - documentationHref: 'https://platform.openai.com/docs/api-reference/completions', - name: 'Generative Search - OpenAI' - }, - 'generative-google': { - documentationHref: 'https://cloud.google.com/vertex-ai/docs/generative-ai/chat/test-chat-prompts', - name: 'Generative Search - Google' - }, - 'multi2vec-google': { - documentationHref: 'https://cloud.google.com/vertex-ai/generative-ai/docs/embeddings/get-multimodal-embeddings', - name: 'Google Multimodal Module' - }, - 'qna-openai': { - documentationHref: 'https://platform.openai.com/docs/api-reference/completions', - name: 'OpenAI Question & Answering Module' - }, - 'ref2vec-centroid': {}, - 'reranker-cohere': { - documentationHref: 'https://txt.cohere.com/rerank/', - name: 'Reranker - Cohere' - }, - 'reranker-voyageai': { - documentationHref: 'https://docs.voyageai.com/reference/reranker-api', - name: 'Reranker - VoyageAI' - }, - 'text2vec-aws': { - documentationHref: 'https://docs.aws.amazon.com/bedrock/latest/userguide/titan-embedding-models.html', - name: 'AWS Module' - }, - 'text2vec-cohere': { - documentationHref: 'https://docs.cohere.ai/embedding-wiki/', - name: 'Cohere Module' - }, - 'text2vec-huggingface': { - documentationHref: 'https://huggingface.co/docs/api-inference/detailed_parameters#feature-extraction-task', - name: 'Hugging Face Module' - }, - 'text2vec-jinaai': { - documentationHref: 'https://jina.ai/embeddings/', - name: 'JinaAI Module' - }, - 'text2vec-openai': { - documentationHref: 'https://platform.openai.com/docs/guides/embeddings/what-are-embeddings', - name: 'OpenAI Module' - }, - 'text2vec-google': { - documentationHref: 'https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings', - name: 'Google Module' - }, - 'text2vec-voyageai': { - documentationHref: 'https://docs.voyageai.com/docs/embeddings', - name: 'VoyageAI 
Module' - } - }, - version: '1.25.5' -} -// END OutputGetMeta - - - -client.close() - -client = await weaviate.connectToWeaviateCloud( - process.env.WEAVIATE_URL as string, - { - authCredentials: new weaviate.ApiKey(process.env.WEAVIATE_API_KEY as string), - headers: { - 'X-OpenAI-Api-Key': process.env.OPENAI_APIKEY as string, // Replace with your inference API key - } - } -) - -// TryFinallyCloseDemo - -// Instantiate your client (not shown). e.g.: -// client = weaviate.connectToWeaviateCloud(...) or -// client = weaviate.connectToLocal(...) - -try { - // Work with the client here - if (await client.isLive()) { - // ... - } -} finally { // This will always be executed, even if an exception is raised - client.close() // Close the connection & release resources -} -// END TryFinallyCloseDemo diff --git a/docs/academy/js/starter_text_data/_snippets/102_collection.ts b/docs/academy/js/starter_text_data/_snippets/102_collection.ts deleted file mode 100644 index a98943b65..000000000 --- a/docs/academy/js/starter_text_data/_snippets/102_collection.ts +++ /dev/null @@ -1,165 +0,0 @@ -// CreateMovieCollection // SubmoduleImport // BatchImportData -import weaviate from "weaviate-client"; -// END BatchImportData // END CreateMovieCollection // END SubmoduleImport -// CreateMovieCollection // SubmoduleImport -import { WeaviateClient, configure, vectors } from "weaviate-client"; -// END CreateMovieCollection // END SubmoduleImport - - -// BatchImportData -import { generateUuid5 } from "weaviate-client"; - -// END BatchImportData - -// BatchImportData -let client: WeaviateClient; -// CreateMovieCollection // END BatchImportData - -// END CreateMovieCollection - -client = await weaviate.connectToWeaviateCloud(process.env.WEAVIATE_URL as string,{ - authCredentials: new weaviate.ApiKey(process.env.WEAVIATE_API_KEY as string), - } -) -// CreateMovieCollection -// Instantiate your client (not shown). 
e.g.: -// const requestHeaders = {'X-OpenAI-Api-Key': process.env.OPENAI_APIKEY as string,} -// client = weaviate.connectToWeaviateCloud(..., headers: requestHeaders) or -// client = weaviate.connectToLocal(..., headers: requestHeaders) - -// END CreateMovieCollection - -const requestHeaders = {'X-OpenAI-Api-Key': process.env.OPENAI_APIKEY as string,} - - -client = await weaviate.connectToWeaviateCloud( - process.env.WEAVIATE_URL as string, - { - authCredentials: new weaviate.ApiKey(process.env.WEAVIATE_API_KEY as string), - headers: requestHeaders - } -) - -// END CreateMovieCollection - -// Actual instantiation - -client.collections.delete("Movie") - -// CreateMovieCollection -await client.collections.create({ - name: "Movie", - properties: [ - { name: "title", dataType: configure.dataType.TEXT}, - { name: "overview", dataType: configure.dataType.TEXT}, - { name: "vote_average", dataType: configure.dataType.NUMBER}, - { name: "genre_ids", dataType: configure.dataType.INT_ARRAY}, - { name: "release_date", dataType: configure.dataType.DATE}, - { name: "tmdb_id", dataType: configure.dataType.INT}, - ], - // Define the vectorizer module - vectorizers: vectors.text2VecOpenAI(), - // Define the generative module - generative: configure.generative.openAI(), - // END generativeDefinition // CreateMovieCollection - }) - -client.close() -// END CreateMovieCollection - -const weaviateURL = process.env.WEAVIATE_URL as string -const weaviateKey = process.env.WEAVIATE_API_KEY as string -const openaiKey = process.env.OPENAI_API_KEY as string - -client = await weaviate.connectToWeaviateCloud(weaviateURL,{ - authCredentials: new weaviate.ApiKey(weaviateKey), - headers: { - 'X-OpenAI-Api-Key': openaiKey, // Replace with your inference API key - } - } -) - -// BatchImportData - -// Instantiate your client (not shown). e.g.: -// client = weaviate.connectToWeaviateCloud(...) or -// client = weaviate.connectToLocal(...) 
- -// END BatchImportData - -// BatchImportData -const dataUrl = "https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024.json" -const response = await fetch(dataUrl) -const data = await response.json() - -// Get the collection -const movies = client.collections.use("Movie") - -// Set a counter and initialize Weaviate Object -let itemsToInsert: Object[] = [] -let counter = 0; - -// Iterate through data -for (const key of Object.keys(data['title'])) { - counter++; - if(counter % 1000 == 0) - console.log(`Import: ${counter}`) - // END Iterate through data // END BatchImportData - // BatchImportData - - let genreIds: [] - - // Format genre_ids and release_date - const parsedArray = JSON.parse(data['genre_ids'][key]); - genreIds = parsedArray.map(item => parseInt(item, 10)); - let releaseDate = new Date(data['release_date'][key]) - - // Build the object payload - let movieObject = { - title: data['title'][key], - overview: data['overview'][key], - vote_average: data['vote_average'][key], - genre_ids: genreIds, - release_date: releaseDate, - tmdb_id: data['id'][key], - } - // Insert - let objectToInsert = { - properties: movieObject, - uuid: generateUuid5(data['title'][key]) - } - - // Add object to batching array - itemsToInsert.push(objectToInsert) - - if(itemsToInsert.length == 2000) { - // Batch insert 2000 items and clear batch array - const response = await movies.data.insertMany(itemsToInsert) - itemsToInsert = [] - if(response.hasErrors) { - throw new Error("Something went wrong in import!") - } - } - // END BatchImportData // END Insert - // BatchImportData // Iterate through data - // ... 
other operations -} -// END Iterate through data // END BatchImportData -// BatchImportData -// insert the remaining objects -if(itemsToInsert.length > 0) { - // Batch insert any remaining items - const response = await movies.data.insertMany(itemsToInsert) - console.log("Done Importing") - - // END BatchImportData - // Handle Errors // BatchImportData - if(response.hasErrors) { - throw new Error("Something went wrong in import!") - } - // END BatchImportData // END Handle Errors - // BatchImportData -} - -client.close() -// END BatchImportData diff --git a/docs/academy/js/starter_text_data/_snippets/103_searches.ts b/docs/academy/js/starter_text_data/_snippets/103_searches.ts deleted file mode 100644 index fa87b00a2..000000000 --- a/docs/academy/js/starter_text_data/_snippets/103_searches.ts +++ /dev/null @@ -1,108 +0,0 @@ -// # START-ANY -import weaviate, { WeaviateClient } from 'weaviate-client' - -let client: WeaviateClient -let response - -// # END-ANY - - -const weaviateURL = process.env.WEAVIATE_URL as string -const weaviateKey = process.env.WEAVIATE_API_KEY as string -const openaiKey = process.env.OPENAI_API_KEY as string - -// Connect to your Weaviate instance -client = await weaviate.connectToWeaviateCloud(weaviateURL, { -authCredentials: new weaviate.ApiKey(weaviateKey), -headers: { - 'X-OpenAI-Api-Key': openaiKey, // Replace with your inference API key -} -}) - -// # START-ANY -// Instantiate your client (not shown). 
e.g.: -// const requestHeaders = {'X-OpenAI-Api-Key': process.env.OPENAI_APIKEY as string,} -// client = weaviate.connectToWeaviateCloud(..., headers: requestHeaders) or -// client = weaviate.connectToLocal(..., headers: requestHeaders) - -// # END-ANY - -// MetadataSemanticSearch -// Get the collection -// END MetadataSemanticSearch -// MetadataBM25Search // MetadataSemanticSearch // MetadataHybridSearch // FilteredSemanticSearch -const movies = client.collections.use("Movie") -// END MetadataBM25Search // END MetadataSemanticSearch // END MetadataHybridSearch // END FilteredSemanticSearch -// MetadataSemanticSearch - -// Perform query -response = await movies.query.nearText('dystopian future', { - limit: 5, - returnMetadata: ['distance'] -}) - -// Inspect the response -for (let item of response.objects) { - // Print the title and release year - console.log(`${item.properties.title}: ${item.properties.release_date.getUTCFullYear()} `) - // Print the distance of the object from the query - console.log(`Distance to query: ${item.metadata.distance}`) -} -// END MetadataSemanticSearch - -// MetadataBM25Search - -response = await movies.query.bm25('history', { - limit: 5, - returnMetadata: ['score'] -}) - -// Inspect the response -for (let item of response.objects) { - // Print the title and release year - console.log(`${item.properties.title}: ${item.properties.release_date.getUTCFullYear()} `) - // Print the BM25 score of the object from the query - console.log(`BM25 score: ${item.metadata.score}`) -} -// END MetadataBM25Search - - -// MetadataHybridSearch - -response = await movies.query.hybrid('history', { - limit: 5, - returnMetadata: ['score'] -}) - -// Inspect the response -for (let item of response.objects) { - // Print the title and release year - - console.log(`${item.properties.title}: ${item.properties.release_date.getUTCFullYear()} `) - // Print the hybrid search score of the object from the query - - console.log(`Hybrid score: ${item.metadata.score}`) -} 
-// END MetadataHybridSearch - - -// FilteredSemanticSearch - -// Perform query -response = await movies.query.nearText('dystopian future', { - limit: 5, - returnMetadata: ['distance'], - // highlight-start - filters: movies.filter.byProperty('release_date').greaterThan(new Date('December 17, 1995')) - // highlight-end -}) - -// Inspect the response -for (let item of response.objects) { - // Print the title and release year - console.log(`${item.properties.title}: ${item.properties.release_date.getUTCFullYear()} `) - // Print the distance of the object from the query - console.log(`Distance to query: ${item.metadata.distance}`) -} -client.close() -// END FilteredSemanticSearch diff --git a/docs/academy/js/starter_text_data/_snippets/104_rag.ts b/docs/academy/js/starter_text_data/_snippets/104_rag.ts deleted file mode 100644 index 2a359a754..000000000 --- a/docs/academy/js/starter_text_data/_snippets/104_rag.ts +++ /dev/null @@ -1,72 +0,0 @@ -// # START-ANY -import weaviate, { WeaviateClient } from "weaviate-client" - -let client: WeaviateClient -let response -// # END-ANY - -const weaviateURL = process.env.WEAVIATE_URL as string - const weaviateKey = process.env.WEAVIATE_API_KEY as string - const openaiKey = process.env.OPENAI_API_KEY as string - - // Connect to your Weaviate instance - client = await weaviate.connectToWeaviateCloud(weaviateURL, { - authCredentials: new weaviate.ApiKey(weaviateKey), - headers: { - 'X-OpenAI-Api-Key': openaiKey, // Replace with your inference API key - } - }) - -// # START-ANY - -// Instantiate your client (not shown). 
e.g.: -// const requestHeaders = {'X-OpenAI-Api-Key': process.env.OPENAI_APIKEY as string,} -// client = weaviate.connectToWeaviateCloud(..., headers: requestHeaders) or -// client = weaviate.connectToLocal(..., headers: requestHeaders) - -// # END-ANY - -// SinglePromptGeneration // GroupedTaskGeneration -// Get the collection -const movies = client.collections.use("Movie") -// END GroupedTaskGeneration // END SinglePromptGeneration - -// SinglePromptGeneration - -// Perform query -response = await movies.generate.nearText("dystopian future", { - // highlight-start - singlePrompt: "Translate this into French: {title}" }, - // highlight-end - { limit: 5 } -) - -// Inspect the response -for (let item of response.objects) { - console.log(`${item.properties.title} - ${item.generated}`) -} -// END SinglePromptGeneration - - -// GroupedTaskGeneration - -// Perform query -response = await movies.generate.nearText("dystopian future", { - // highlight-start - groupedTask: "What do these movies have in common?", - // highlight-end - groupedProperties: ['title', 'overview']}, - { limit: 5 } -) - -// Inspect the response -for (let item of response.objects) { - console.log('Title: ', item.properties.title) // Print the title -} - -// highlight-start -console.log(response.generated) // Print the generated text (the commonalities between them) -// highlight-end - -client.close() -// END GroupedTaskGeneration diff --git a/docs/academy/js/starter_text_data/index.md b/docs/academy/js/starter_text_data/index.md deleted file mode 100644 index d0bdf1b7a..000000000 --- a/docs/academy/js/starter_text_data/index.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -title: "101T Work with: Text data in JavaScript" -description: "Get started with text data in Weaviate for robust search and data organization in JavaScript." 
-sidebar_position: 101 ---- - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; -import CourseUnits from '/src/components/Academy/courseUnits.jsx'; -import { courseData } from '/src/components/Academy/courseData.js' - -## Course overview - -In this project-based course, you will learn how to work with any text data using Weaviate and a movie dataset. - -You will get hands-on experience on how to store and index text data by meaning, using Weaviate's vectorization capabilities. You will learn how to search through that data using semantic, keyword and hybrid searches, as well as filters. You will also learn how to use Weaviate's retrieval augmented generation (RAG) capabilities to generate outputs based on the retrieved objects. - -## Learning objectives - - - -## Units - - diff --git a/docs/academy/py/10_set_up_python.mdx b/docs/academy/py/10_set_up_python.mdx deleted file mode 100644 index 3cc1ba445..000000000 --- a/docs/academy/py/10_set_up_python.mdx +++ /dev/null @@ -1,143 +0,0 @@ ---- -title: 10 Set up Python for Weaviate -sidebar_position: 10 ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import PythonCodeExample from '!!raw-loader!./_snippets/10_setup_python.py'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import WeaviatePythonImgUrl from './img/Weaviate-release-1-22-python.png'; - -Follow this short guide to make sure that you are set up to use Weaviate with the Python client. - -Image alt - -## Install Python - -### Is Python installed? - -Open a terminal window (e.g. bash, zsh, Windows PowerShell, Windows Terminal), and run: - -```shell -python --version -``` - -If that did not work, you may need to use `python3` instead of `python`: - -```shell -python3 --version -``` - -If you have Python installed, you should see a response like `Python 3.11.8`. If you have Python 3.8 or higher installed, you can skip the remainder of this section. 
- -### Install Python - -To install, follow the instructions for your system on [Python.org](https://www.python.org/downloads/). - -Once you have Python installed, check the version again to confirm that you have a recommended version installed. - -:::tip Advanced option: `pyenv` -Another good way to install Python is to install `pyenv`. This will allow you to manage multiple versions of Python on your system. You can find instructions on how to install `pyenv` [here](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation). -::: - -## Set up a virtual environment - -A virtual environment allows you to isolate various Python projects from each other. This is useful because it allows you to install dependencies for each project without affecting the others. - -### Create a virtual environment - -We recommend using `venv` to create a virtual environment. Navigate to your project directory (e.g. `cd PATH/TO/PROJECT`), and run: - -```shell -python -m venv .venv -``` - -Or, if `python3` is your Python command: - -```shell -python3 -m venv .venv -``` - -This will create a virtual environment in a directory called `.venv` in your project directory. - -### Activate the virtual environment - -Each virtual environment can be 'activated' and 'deactivated'. When activated, the Python commands you run will use the Python version and libraries installed in the virtual environment. - -To activate the virtual environment, go to your project directory and run: - -```shell -source .venv/bin/activate -``` - -Or, if you are using Windows: - - - - -```shell -.venv\Scripts\activate.bat -``` - - - - -```shell -.venv\Scripts\Activate.ps1 -``` - - - - -You can check if the virtual environment is activated by running: - -```shell -which python -``` - -Or, if you are using Windows, run `Get-Command python` (PowerShell) or `where python` (Command Prompt). - -If the virtual environment is activated, you should see a path that points to the `.venv` directory. 
- -:::tip Virtual environments -Virtual environments are very useful. If you would like to learn more, try this tutorial on [FreeCodeCamp](https://www.freecodecamp.org/news/how-to-setup-virtual-environments-in-python/) or this article on [RealPython](https://realpython.com/python-virtual-environments-a-primer/), which goes a little more in-depth. -
- -Additionally, there are many other environment management tools available, such as `conda`, `pipenv`, and `poetry`. If you are already using one of these tools, you can use them instead of `venv`. -::: - -## Install the Weaviate client - -Now, you can install the [Weaviate client library](../../weaviate/client-libraries/index.mdx), which will make it much easier to interact with Weaviate using Python. - -[Activate your virtual environment](#-activate-the-virtual-environment), then install the Weaviate client with: - -```shell -pip install weaviate-client -``` - -### Confirm the installation - -To confirm that the Weaviate client is installed, run the following Python code: - - - -You should see an output like: - -```text -Your Weaviate client library version is: 4.5.4. -``` - -Congratulations, you are now set up to use Weaviate with Python and the Weaviate Python client library! - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/_category_.json b/docs/academy/py/_category_.json deleted file mode 100644 index 5abb7a5e5..000000000 --- a/docs/academy/py/_category_.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "label": "Python", - "position": 10 -} \ No newline at end of file diff --git a/docs/academy/py/_snippets/10_setup_python.py b/docs/academy/py/_snippets/10_setup_python.py deleted file mode 100644 index 157cc9be5..000000000 --- a/docs/academy/py/_snippets/10_setup_python.py +++ /dev/null @@ -1,5 +0,0 @@ -# CheckClientLibInstall -import weaviate - -print(f"Your Weaviate client library version is: {weaviate.__version__}.") -# END CheckClientLibInstall diff --git a/docs/academy/py/_snippets/cta_socials.mdx b/docs/academy/py/_snippets/cta_socials.mdx deleted file mode 100644 index 76c75bbb0..000000000 --- a/docs/academy/py/_snippets/cta_socials.mdx +++ /dev/null @@ -1,3 +0,0 @@ -## Stay in touch! 
- -We are constantly improving our documentation, so please keep an eye out for new resources and updates, by signing up for our [newsletter](https://newsletter.weaviate.io/) or following us on social media ([Twitter](https://x.com/weaviate_io), [LinkedIn](https://www.linkedin.com/company/weaviate-io/)). \ No newline at end of file diff --git a/docs/academy/py/_snippets/intro_next_steps.mdx b/docs/academy/py/_snippets/intro_next_steps.mdx deleted file mode 100644 index df3f36080..000000000 --- a/docs/academy/py/_snippets/intro_next_steps.mdx +++ /dev/null @@ -1,32 +0,0 @@ -Congratulations! You have completed this introductory course on Weaviate. - -Now that you have completed this course, you may be interested in exploring our documentation or the Academy for more advanced courses. - -Some of our more popular resources include: - -### Documentation - -- How-to guides - - The [How-to: Manage collections](/weaviate/manage-collections/index.mdx) and [How-to: Manage objects](/weaviate/manage-objects/index.mdx) guides show how to perform data operations (i.e. create, read, update, delete collections and objects within them).. - - [How-to: search](/weaviate/search/index.mdx): Code examples for all types of search operations. - - [How-to: configure Weaviate](/weaviate/configuration/index.mdx): Guides for configuring Weaviate, such as [PQ](/weaviate/configuration/compression/pq-compression.md) and [BQ](/weaviate/configuration/compression/bq-compression.md) compression, [backups](/deploy/configuration/backups.md) and [replication](/deploy/configuration/replication.md). -- [Concepts guides](/weaviate/concepts/index.md): Guides for understanding Weaviate's architecture and concepts. -- [API reference](/weaviate/api/index.mdx): Detailed information about Weaviate's APIs. - -### Academy - -- [Named vectors](../named_vectors/index.md): Learn how to use named vectors to flexibly represent data in Weaviate. 
-- [Which search is right for me?](../standalone/which_search/index.mdx): Learn about the different types of searches in Weaviate and when to use them. -- [Chunking](../standalone/chunking/index.mdx): Learn how to use chunking to optimize your search for longer documents. - -import CTASocials from './cta_socials.mdx'; - - - -See you soon! 👋 - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/compression/100_pq.mdx b/docs/academy/py/compression/100_pq.mdx deleted file mode 100644 index 8d75c95bf..000000000 --- a/docs/academy/py/compression/100_pq.mdx +++ /dev/null @@ -1,82 +0,0 @@ ---- -title: Product quantization ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!./_snippets/100_pq.py'; - - -[Product quantization](/weaviate/concepts/vector-quantization#product-quantization) (PQ), is a technique used to compress vectors. In Weaviate, it can be used to reduce the size of the in-memory [HNSW index](/weaviate/concepts/indexing/vector-index.md#hierarchical-navigable-small-world-hnsw-index), which can improve performance and reduce resource requirements as well as costs. - -## What is product quantization? - -Product quantization compresses vectors in two ways. One, by dividing them into "segments", and two, quantizing each segment to be represented by one of a "codebook" of centroids. - -In the diagram below, we show a vector of L dimensions, where each dimension is a floating point number. The vector is divided into L/x segments, and each segment is quantized to be represented by one of N centroids. 
- -import PQDiagram from './_snippets/pq_diagram.png'; - -Abstracted PQ diagram showing reduction of dimensions and quantization of groups of floats - -As an example, a 768-dimensional vector can be compressed into a 128-segment quantized vector of 1-byte integers. - -This will reduce the length of the vector by a factor of 6, and also the size of each segment from a floating point number (4 bytes) to a byte, representing one of 256 centroids. - -As a result, the size of the vector is reduced by 24 (from 768 lots of 4 byte numbers to 128 lots of 1 byte numbers). - -The HNSW index can then be built on the PQ-compressed vectors, which will reduce the size of the index in memory. - -### Lossiness - -PQ is a lossy compression technique, as the original floating point numbers are quantized to a smaller set of integers. This means that some information is lost in the compression process. - -However, Weaviate compensates for this by overfetching vectors from the index, and then rescoring the vectors in the uncompressed space. In practice, we find that this compensates quite well for the lossiness of PQ. - -## Configure PQ - -This example creates a collection with product quantization (PQ) enabled, using default settings. - - - -### Explain the code - -This will create a collection with PQ enabled, using the default settings. - -But it is important to note that the compression does not occur immediately. PQ relies on quantization of the vectors, so it is configured to wait until there are sufficient vectors to reach a "training set", by default 100,000 vectors. - -The training set is used to calculate the centroids for the quantization. Once the training set is reached, the PQ compression will occur. - -:::note Version and configuration requirements -This type of PQ configuration is called "AutoPQ", and is available in Weaviate `v1.23` or later, with asynchronous indexing enabled. -
- -If you are using an earlier version of Weaviate, or have asynchronous indexing disabled, you will need to use a different configuration. Please refer to the [PQ configuration documentation](/weaviate/configuration/compression/pq-compression.md#manually-configure-pq) for more information. -::: - -## Customize PQ - -Many PQ parameters are configurable. While the default settings are suitable for many use cases, you may want to customize the PQ configuration to suit your specific requirements. - -The example below shows how to configure PQ with custom settings, such as with a lower training set size, and a different number of centroids. - - - -Please refer to the [PQ configuration documentation](/weaviate/configuration/compression/pq-compression.md#pq-parameters) for more information on the available settings. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/compression/200_bq.mdx b/docs/academy/py/compression/200_bq.mdx deleted file mode 100644 index dad13fae7..000000000 --- a/docs/academy/py/compression/200_bq.mdx +++ /dev/null @@ -1,68 +0,0 @@ ---- -title: Binary quantization ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!./_snippets/200_bq.py'; - - -[Binary quantization](/weaviate/concepts/vector-quantization#binary-quantization) (BQ), is a technique used to compress vectors. In Weaviate, it can be used to reduce the size of the in-memory [HNSW index](/weaviate/concepts/indexing/vector-index.md#hierarchical-navigable-small-world-hnsw-index) or the disk-based [flat index](/weaviate/concepts/indexing/vector-index.md#flat-index). - -For HNSW, BQ can decrease its memory footprint and thus improve performance and reduce resource requirements as well as costs. 
For the flat index, BQ can reduce the size of the index on disk, which can improve performance. - -## What is binary quantization? - -Binary quantization compresses vectors by reducing each dimension to a single bit, either 0 or 1. - -In other words, a n-dimensional vector composed of n floating point numbers is compressed to a n-dimensional vector composed of n bits. - -This will reduce the size of the vector by a factor of 32 (from 32 bits per float to 1 bit per dimension). - -### Model suitability - -BQ is a relatively simple algorithm, but can perform well in the right circumstances. It is particularly suitable for high-dimensional vectors, where even with BQ, the vector can retain a high degree of information. - -We suggest using BQ for vectors that have been designed for, or been shown to perform well with, binary quantization. Anecdotally, we have seen encouraging recall with Cohere's V3 models (e.g. `embed-multilingual-v3.0` or `embed-english-v3.0`), and OpenAI's `ada-002` and larger `text-embedding-3` models work well with BQ enabled. - -### Lossiness - -BQ is a lossy compression technique, as the original floating point numbers are quantized a bit. - -Weaviate compensates for this by overfetching vectors from the index, and then rescoring the vectors in the uncompressed space. In practice, we find that this compensates quite well for the lossiness of BQ. - -## Configure BQ - -This example creates a collection with binary quantization (BQ) enabled, using default settings. - - - -### Explain the code - -This will create a collection with BQ enabled, using the default settings. - -With BQ, the compression begins immediately, as there is no need to wait for a training set to be reached. - -## Customize BQ - -Some BQ parameters are configurable. An important one is `rescore_limit`, which is the minimum number of vectors to be fetched from the index before the rescore phase is triggered. 
- - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - diff --git a/docs/academy/py/compression/300_strategy.mdx b/docs/academy/py/compression/300_strategy.mdx deleted file mode 100644 index 7afee970b..000000000 --- a/docs/academy/py/compression/300_strategy.mdx +++ /dev/null @@ -1,43 +0,0 @@ ---- -title: Compression strategy ---- - -Given the choice of PQ, BQ, or no compression, which should you choose? The answer is, it depends. - -PQ and BQ are both lossy compression techniques, and the choice between them depends on your circumstances, your model and the use case. - -## The index type - -PQ is currently only supported for the HNSW index, while BQ is supported for both the HNSW and flat indexes. If you are using the flat index, you will need to use BQ. - -## Model suitability - -PQ is generally a more robust compression technique, as it is fitted on your specific data during the training step. This means that you don't need to worry as much about whether your model is suitable for PQ. - -On the other hand, BQ can be more sensitive to the model, such as its length and whether it is designed for binary quantization. - -If you do not know whether your model is suitable for BQ, we recommend using PQ. - -## Tunability - -As you've seen, PQ parameters are tunable whereas BQ is not. This means that you can adjust PQ to be more or less aggressive on performance parameters, such as recall and QPS targets, while still benefiting from some compression. - -## Complexity - -If you are looking for the easiest solution to implement, BQ is the way to go. It is a simple configuration that can be enabled immediately, without the need to wait for a training set to be reached. - -## Conclusion - -In summary, your choice of compression technique depends on your circumstances, your model and the use case. - -But as a general rule of thumb, if you are not sure which to choose, we recommend using PQ. 
It is more robust, tunable, and generally more suitable for a wider range of models and use cases. - -And if resource constraints are not a concern, you can always choose to use no compression at all. This will give you the best performance, but at the cost of increased resource requirements. - -But do note that you will likely not be able to switch on compression later, as it requires a reindexing of the data. (With an exception of PQ, which [may be enabled later](../../../weaviate/configuration/compression/pq-compression.md#3-enable-pq-and-create-the-codebook) unless your dataset is too large.) - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - \ No newline at end of file diff --git a/docs/academy/py/compression/900_next_steps.mdx b/docs/academy/py/compression/900_next_steps.mdx deleted file mode 100644 index 1991bf250..000000000 --- a/docs/academy/py/compression/900_next_steps.mdx +++ /dev/null @@ -1,26 +0,0 @@ ---- -title: Next steps ---- - -Congratulations! You have completed this course on compression. We hope you found it helpful and informative. - -There are many more resources available to help you continue your learning journey. - -## Documentation - -- [How-to: Configure PQ](/weaviate/configuration/compression/pq-compression.md) -- [How-to: Configure BQ](/weaviate/configuration/compression/bq-compression.md) -- [Concepts: vector indexes](/weaviate/concepts/indexing/vector-index.md): Vector indexes are critical to how Weaviate works, and compression. Read more here. -- [Configuration references: Vector index](/weaviate/config-refs/indexing/vector-index.mdx): Configuration references for the vector index, including compression settings. - -import CTASocials from '../_snippets/cta_socials.mdx'; - - - -See you soon! 
👋 - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/compression/_snippets/100_pq.py b/docs/academy/py/compression/_snippets/100_pq.py deleted file mode 100644 index baef74c62..000000000 --- a/docs/academy/py/compression/_snippets/100_pq.py +++ /dev/null @@ -1,79 +0,0 @@ -import weaviate -# PQBasicConfig # PQCustomConfig -from weaviate.classes.config import Configure, DataType, Property -# END PQBasicConfig # PQCustomConfig -from weaviate.collections.classes.config import PQEncoderType, PQEncoderDistribution -# END PQCustomConfig - -from weaviate.collections.classes.config import PQConfig - -client = weaviate.connect_to_local() - -# PQBasicConfig # PQCustomConfig - -# Client instantiation not shown -collection_name = "PQExampleCollection" - -# END PQBasicConfig # END PQCustomConfig - -client.collections.delete(collection_name) - -# PQBasicConfig -client.collections.create( - name=collection_name, - # END PQBasicConfig - properties=[ - Property(name="title", data_type=DataType.TEXT) - ], - vector_config=Configure.Vectors.text2vec_openai( - # PQBasicConfig - # highlight-start - quantizer=Configure.VectorIndex.Quantizer.pq() - # highlight-end - ), - - # Other configuration not shown -) -# END PQBasicConfig - -# Confirm creation -c = client.collections.use(collection_name) -coll_config = c.config.get() -assert type(coll_config.vector_config["default"].vector_index_config.quantizer) == PQConfig - - -client.collections.delete(collection_name) - -# PQCustomConfig -client.collections.create( - name=collection_name, - # END PQCustomConfig - properties=[ - Property(name="title", data_type=DataType.TEXT) - ], - vector_config=Configure.Vectors.text2vec_openai( - # PQCustomConfig - # highlight-start - quantizer=Configure.VectorIndex.Quantizer.pq( - segments=512, - centroids=256, - training_limit=50000, - encoder_distribution=PQEncoderDistribution.NORMAL, - encoder_type=PQEncoderType.TILE, - ) - # 
highlight-end - ), - # Other configuration not shown -) -# END PQCustomConfig - -c = client.collections.use(collection_name) -coll_config = c.config.get() -assert type(coll_config.vector_config["default"].vector_index_config.quantizer) == PQConfig -assert coll_config.vector_config["default"].vector_index_config.quantizer.segments == 512 -assert coll_config.vector_config["default"].vector_index_config.quantizer.training_limit == 50000 - -# START-ANY - -client.close() -# END-ANY diff --git a/docs/academy/py/compression/_snippets/200_bq.py b/docs/academy/py/compression/_snippets/200_bq.py deleted file mode 100644 index 60f9198cd..000000000 --- a/docs/academy/py/compression/_snippets/200_bq.py +++ /dev/null @@ -1,76 +0,0 @@ -import weaviate -# BQBasicConfig # BQCustomConfig -from weaviate.classes.config import Configure, DataType, Property -# END BQBasicConfig # BQCustomConfig - -from weaviate.collections.classes.config import BQConfig - -client = weaviate.connect_to_local() - -print(client.get_meta()) -print(weaviate.__version__) - -# BQBasicConfig # BQCustomConfig - -# Client instantiation not shown -collection_name = "BQExampleCollection" - -# END BQBasicConfig # END BQCustomConfig - -client.collections.delete(collection_name) - -# BQBasicConfig -client.collections.create( - name=collection_name, - # END BQBasicConfig - properties=[ - Property(name="title", data_type=DataType.TEXT) - ], - vector_config=Configure.Vectors.text2vec_openai( - # BQBasicConfig - # highlight-start - quantizer=Configure.VectorIndex.Quantizer.bq() - # highlight-end - ), - # Other configuration not shown -) -# END BQBasicConfig - - -# Confirm creation -c = client.collections.use(collection_name) -coll_config = c.config.get() -assert type(coll_config.vector_config["default"].vector_index_config.quantizer) == BQConfig - - -client.collections.delete(collection_name) - -# BQCustomConfig -client.collections.create( - name=collection_name, - # END BQCustomConfig - properties=[ - 
Property(name="title", data_type=DataType.TEXT) - ], - vector_config=Configure.Vectors.text2vec_openai( - # BQCustomConfig - # Other configuration not shown - # highlight-start - quantizer=Configure.VectorIndex.Quantizer.bq( - rescore_limit=150 - ) - # highlight-end - ), -) -# END BQCustomConfig - -c = client.collections.use(collection_name) -coll_config = c.config.get() -assert type(coll_config.vector_config["default"].vector_index_config.quantizer) == BQConfig -# assert coll_config.vector_index_config.quantizer.rescore_limit == 150 # appears to be a bug - - -# START-ANY - -client.close() -# END-ANY diff --git a/docs/academy/py/compression/_snippets/pq_diagram.png b/docs/academy/py/compression/_snippets/pq_diagram.png deleted file mode 100644 index 297c18cc7..000000000 Binary files a/docs/academy/py/compression/_snippets/pq_diagram.png and /dev/null differ diff --git a/docs/academy/py/compression/index.md b/docs/academy/py/compression/index.md deleted file mode 100644 index ec8755cef..000000000 --- a/docs/academy/py/compression/index.md +++ /dev/null @@ -1,27 +0,0 @@ ---- -title: "250 Vector compression" -description: "Discover compression techniques in Weaviate for optimal data storage and retrieval in Python." -sidebar_position: 250 ---- - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; -import CourseUnits from '/src/components/Academy/courseUnits.jsx'; -import { courseData } from '/src/components/Academy/courseData.js' - -## Course overview - -:::info Pre-requisites -This course is self-contained. However, we recommend that you go through one of the 101-level courses, such as that for working with [text](../starter_text_data/index.md), [your own vectors](../starter_custom_vectors/index.md), or [multimodal data](../starter_multimodal_data/index.md). -::: - -As you work with more and more data, the sheer volume of it begins to impose further and further constraints on your ability to work with it. 
This is especially true the closer you get to production environments, where the cost of storage and the time it takes to process data can become significant. - -This course will introduce you to data compression in Weaviate, and how it can be used to reduce your resource requirements and in turn improve performance or reduce costs. - -## Learning objectives - - - -## Units - - diff --git a/docs/academy/py/img/Weaviate-release-1-22-python.png b/docs/academy/py/img/Weaviate-release-1-22-python.png deleted file mode 100644 index eb1965151..000000000 Binary files a/docs/academy/py/img/Weaviate-release-1-22-python.png and /dev/null differ diff --git a/docs/academy/py/multitenancy/100_overview.mdx b/docs/academy/py/multitenancy/100_overview.mdx deleted file mode 100644 index 138f02b82..000000000 --- a/docs/academy/py/multitenancy/100_overview.mdx +++ /dev/null @@ -1,88 +0,0 @@ ---- -title: "Overview" ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!./_snippets/100_config.py'; - -In Weaviate, multi-tenancy allows a collection to efficiently serve isolated groups of data. Each "tenant" in a multi-tenant collection can only access its own data, while sharing the same data structure and settings. - -![multi-tenancy](./img/multi-tenancy-dark.png#gh-dark-mode-only) -![multi-tenancy](./img/multi-tenancy-light.png#gh-light-mode-only) - -This allows Weaviate to efficiently serve a large number of tenants with minimal overhead. And as you will see later, you can manage individual tenants to balance performance and resource usage. - -
- "Multi-tenancy" in other contexts - -In general, the term "multi-tenancy" refers to a software architecture where a single instance of the software serves multiple "tenants". In that context, each tenant may be a group of users who share common access. - -This is similar to the concept of multi-tenancy in Weaviate, where each tenant is a group of data that is isolated from other tenants. - -
- -## Why use multi-tenancy? - -A typical multi-tenancy use-case is in a software-as-a-service (SaaS) application. In many SaaS applications, each end user or account will have private data that should be not be accessible to anyone else. - -### Example case study - -In this course, we'll learn about multi-tenancy by putting ourselves in the shoes of a developer building an application called `MyPrivateJournal`. - -`MyPrivateJournal` is a SaaS (software-as-a-service) application where users like *Steve*, *Alice* and so on can write and store their journal entries. Each user's entries should be private and not accessible to anyone else. - -Using single-tenant collections, you might implement this with: - -1. **A monolithic collection**: To store the entire dataset, with an end user identifier property -1. **Per end-user collections**: Where each end user's data would be in a separate collection - -While these may work to some extent, both of these options have significant limitations. - -- Using a monolithic collection: - - A developer mistake could easily expose Steve's entries to Alice, which would be a significant privacy breach. - - As `MyPrivateJournal` grows, Steve's query would become slower as it must look through the entire collection. - - When Steve asks `MyPrivateJournal` to delete his data, the process would be complex and error-prone. -- Using end-user-specific collections: - - `MyPrivteJournal` may need to spend more on hardware to support the high number of collections. - - Changes to configurations (e.g. adding a new property) would need to be run separately for each collection. - -Multi-tenancy in Weaviate solves these problems by providing a way to isolate each user's data while sharing the same configuration. - -### Benefits of multi-tenancy - -In multi-tenant collection, each "tenant" is isolated from each other, while sharing the same set of configurations. 
This arrangement helps make multi-tenancy far more resource-efficient than using many individual collections. - -A Weaviate node can host more tenants than single-tenant collections. - -It also makes developers' job easier, as there is only one set of collection configurations. The data isolation between tenants eliminates risks of accidental data leakage and makes it easier to manage individual tenants and tenant data. - -#### `MyPrivateJournal` and multi-tenancy - -So, the `MyPrivateJournal` app can use multi-tenancy and store each user's journal entries in a separate tenant. This way, Steve's entries are isolated from Alice's, and vice versa. This isolation makes it easier to manage each user's data and reduces the risk of data leakage. - -As you will see later, `MyPrivateJournal` can also offload inactive users' data to cold storage, reducing the hot (memory) and warm (disk) resource usage of the Weaviate node. - -## Tenants vs collections - -Each multi-tenant collection can have any number of tenants. - -A tenant is very similar to a single-tenant collection. For example: - -| Aspect | Tenant | Single-tenant collection | -| ----- | ----- | ----- | -| Objects | Belong to a tenant | Belong to a collection | -| Vector indexes | Belong to a tenant | Belong to a collection | -| Inverted indexes | Belong to a tenant | Belong to a collection | -| Deletion | Deleting a tenant deletes all tenant data | Deleting a collection deletes all collection data | -| Query | Can search one tenant at a time | Can search one collection at a time | - -But as you will have guessed, there are also differences. We'll cover these in the next sections, as we follow `MyPrivateJournal` implementing multi-tenancy in Weaviate. 
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - diff --git a/docs/academy/py/multitenancy/200_setup.mdx b/docs/academy/py/multitenancy/200_setup.mdx deleted file mode 100644 index 6758fa771..000000000 --- a/docs/academy/py/multitenancy/200_setup.mdx +++ /dev/null @@ -1,308 +0,0 @@ ---- -title: Database setup ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!./_snippets/100_config.py'; - -Let's see we can build a Weaviate instance to support `MyPrivateJournal`'s requirements. - -## Project requirements - -As a SaaS application, `MyPrivateJournal` has determined that they need the following features: - -- **Data isolation**: Each user's data should be private. -- **Efficient scalability**: - - As `MyPrivateJournal` grows, it should be able to handle tens or hundreds of thousands users. - - It must be fast for active users, but inactive users should not consume resources. -- **Ease of management**: - - Adding new users should be simple and fast. - - Removing users should be straightforward. -- **Flexibility**: - - It should be efficient for low-volume users as well as high-volume users. -- **Fault tolerance**: - - A node failure should not lead to complete downtime for a user. -- **Developer experience**: - - These features should be easy to implement and maintain. - -After reviewing potential solutions, we determined that Weaviate with multi-tenant collections can meet these challenges. Let's see how we can implement a proof-of-concept (PoC) solution for `MyPrivateJournal`. - -## Weaviate configuration - -As a development setup, we will use a local Weaviate instance with Docker. This will allow us to quickly set up a Weaviate instance for development and testing. 
- -Here is our `docker-compose.yml` file: - -```yaml ---- -services: - weaviate_anon: - command: - - --host - - 0.0.0.0 - - --port - - '8080' - - --scheme - - http - image: cr.weaviate.io/semitechnologies/weaviate:||site.weaviate_version|| - ports: - - 8080:8080 - - 50051:50051 - restart: on-failure:0 - environment: - QUERY_DEFAULTS_LIMIT: 25 - AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' - PERSISTENCE_DATA_PATH: '/var/lib/weaviate' - ENABLE_API_BASED_MODULES: 'true' - # highlight-start - ASYNC_INDEXING: 'true' - ENABLE_MODULES: 'backup-filesystem,offload-s3' - AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY:-} - AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_KEY:-} - OFFLOAD_S3_BUCKET_AUTO_CREATE: 'true' - # highlight-end - BACKUP_FILESYSTEM_PATH: '/var/lib/weaviate/backups' - CLUSTER_HOSTNAME: 'node1' -... -``` - -
- What about a multi-node setup? - -Great question! As you probably noticed, we are using a single-node setup here for simplicity. -
- -But we can easily extend this to a multi-node setup by adding additional services. This will allow you to scale your Weaviate instance horizontally, and provide fault tolerance with replication. -
- -For example, here is a multi-node setup with three nodes. - -```yaml ---- -services: - weaviate-node-1: # Founding member service name - command: - - --host - - 0.0.0.0 - - --port - - '8080' - - --scheme - - http - image: cr.weaviate.io/semitechnologies/weaviate:||site.weaviate_version|| - restart: on-failure:0 - ports: - - "8180:8080" - - 50151:50051 - environment: - AUTOSCHEMA_ENABLED: 'false' - QUERY_DEFAULTS_LIMIT: 25 - QUERY_MAXIMUM_RESULTS: 10000 - AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' - PERSISTENCE_DATA_PATH: '/var/lib/weaviate' - ASYNC_INDEXING: 'true' - ENABLE_MODULES: 'text2vec-ollama,generative-ollama,backup-filesystem,offload-s3' - ENABLE_API_BASED_MODULES: 'true' - AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY:-} - AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_KEY:-} - OFFLOAD_S3_BUCKET_AUTO_CREATE: 'true' - BACKUP_FILESYSTEM_PATH: '/var/lib/weaviate/backups' - CLUSTER_HOSTNAME: 'node1' - CLUSTER_GOSSIP_BIND_PORT: '7100' - CLUSTER_DATA_BIND_PORT: '7101' - weaviate-node-2: # Founding member service name - command: - - --host - - 0.0.0.0 - - --port - - '8080' - - --scheme - - http - image: cr.weaviate.io/semitechnologies/weaviate:||site.weaviate_version|| - restart: on-failure:0 - ports: - - "8181:8080" - - 50152:50051 - environment: - AUTOSCHEMA_ENABLED: 'false' - QUERY_DEFAULTS_LIMIT: 25 - QUERY_MAXIMUM_RESULTS: 10000 - AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' - PERSISTENCE_DATA_PATH: '/var/lib/weaviate' - ASYNC_INDEXING: 'true' - ENABLE_MODULES: 'text2vec-ollama,generative-ollama,backup-filesystem,offload-s3' - ENABLE_API_BASED_MODULES: 'true' - AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY:-} - AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_KEY:-} - OFFLOAD_S3_BUCKET_AUTO_CREATE: 'true' - BACKUP_FILESYSTEM_PATH: '/var/lib/weaviate/backups' - CLUSTER_HOSTNAME: 'node2' - CLUSTER_GOSSIP_BIND_PORT: '7102' - CLUSTER_DATA_BIND_PORT: '7103' - CLUSTER_JOIN: 'weaviate-node-1:7100' - weaviate-node-3: # Founding member service name - command: - - --host - - 0.0.0.0 - - --port - - 
'8080' - - --scheme - - http - image: cr.weaviate.io/semitechnologies/weaviate:||site.weaviate_version|| - restart: on-failure:0 - ports: - - "8182:8080" - - 50153:50051 - environment: - AUTOSCHEMA_ENABLED: 'false' - QUERY_DEFAULTS_LIMIT: 25 - QUERY_MAXIMUM_RESULTS: 10000 - AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' - PERSISTENCE_DATA_PATH: '/var/lib/weaviate' - ASYNC_INDEXING: 'true' - image: cr.weaviate.io/semitechnologies/weaviate:||site.weaviate_version|| - ENABLE_API_BASED_MODULES: 'true' - AWS_ACCESS_KEY_ID: ${AWS_ACCESS_KEY:-} - AWS_SECRET_ACCESS_KEY: ${AWS_SECRET_KEY:-} - OFFLOAD_S3_BUCKET_AUTO_CREATE: 'true' - BACKUP_FILESYSTEM_PATH: '/var/lib/weaviate/backups' - CLUSTER_HOSTNAME: 'node3' - CLUSTER_GOSSIP_BIND_PORT: '7104' - CLUSTER_DATA_BIND_PORT: '7105' - CLUSTER_JOIN: 'weaviate-node-1:7100' -... -``` - -
- -### Configuration highlights - -You may have seen Docker configurations elsewhere ([e.g. Docs](/deploy/installation-guides/docker-installation.md), [Academy](../starter_text_data/101_setup_weaviate/20_create_instance/20_create_docker.mdx)). But these highlighted configurations may be new to you: - -- `ASYNC_INDEXING`: This will enable asynchronous indexing. This is useful for high-volume data insertion, and enables us to use the `dynamic` index type, which you will learn about later on. -- `ENABLE_MODULES`: We enable `offload-s3` to demonstrate tenant offloading later on. Offloading helps us to manage inactive users' data efficiently. -- `AWS_ACCESS_KEY_ID` and `AWS_SECRET_ACCESS_KEY`: These are the AWS credentials that Weaviate will use to access the S3 bucket. -- `OFFLOAD_S3_BUCKET_AUTO_CREATE`: This will automatically create the S3 bucket if it does not exist. - -Save the file to `docker-compose.yaml`, and run the following command to start Weaviate: - -```bash -docker compose up -``` - -import OffloadingLimitation from '/_includes/offloading-limitation.mdx'; - - - -### Your Weaviate instance details - -Once the instance is created, you can access it at `http://localhost:8080`. - -Now, we are ready to create a collection for `MyPrivateJournal`. - -## Create a collection - -### Enable multi-tenancy - -A collection must be specified as multi-tenant when it is created. So, we enable multi-tenancy in the collection configuration. - - - -We also set `auto_tenant_creation` and `auto_tenant_activation` here to `true`. You'll learn more about these features later on. But here is a brief overview: - -- `auto_tenant_activation`: if `true`, activate any deactivated (`INACTIVE` or `OFFLOADED`) tenants when they are accessed. -- `auto_tenant_creation`: if `true`, automatically create the tenant when an object is inserted against a non-existent tenant. - -You will see these features in action later on. - -
- More about auto_tenant_creation - -:::info Added in `v1.25` -The auto tenant creation feature is available from `v1.25.0` for batch imports, and from `v1.25.2` for single object insertions. -::: - -Enabling `auto_tenant_creation` will cause Weaviate to automatically create the tenant when an object is inserted against a non-existent tenant. -
- -This option is particularly useful for bulk data ingestion, as it removes the need to create the tenant prior to object insertion. Instead, `auto_tenant_creation` will allow the object insertion process to continue without interruption. -
- -A risk of using `auto_tenant_creation` is that an error in the source data will not be caught during import. For example, a source object with erroneously spelt `"TenntOn"` instead of `"TenantOne"` will create a new tenant for `"TenntOne"` instead of raising an error. -
- -The server-side default for `auto_tenant_creation` is `false`. -
- -
- More about auto_tenant_activation - -:::info Added in `v1.25.2` -The auto tenant activation feature is available from `v1.25.2`. -::: - -If `auto_tenant_activation` is enabled, Weaviate will automatically activate any deactivated (`INACTIVE` or `OFFLOADED`) tenants when they are accessed. -
- -This option is particularly useful for scenarios where you have a large number of tenants, but only a subset of them are active at any given time. An example is a SaaS app where some tenants may be unlikely due to their local time zone, or their recent activity level. -
By enabling `auto_tenant_activation`, you can safely set those less active users to be inactive, knowing that they will be loaded into memory once requested.
- -This can help to reduce the memory footprint of your Weaviate instance, as only the active tenants are loaded into memory. -
- -The server-side default for `auto_tenant_activation` is `false`. - -
- -### Configure vector index - -From what we know about other journal use cases, a majority of users will only have a small number of entries. But, a few of those users may have a large number of entries. - -This is a tricky situation to balance. If we use a `hnsw` index, it will be fast for users with many entries, but it will require a lot of memory. If we use a `flat` index, it will require less memory, but potentially slower for users with many entries. - -What we can do here is to choose a `dynamic` index. A `dynamic` index will automatically switch from `flat` to `hnsw` once it passes a threshold count. This way, we can balance the memory usage and speed for our users. - -Here is an example code snippet, configuring a "note" named vector with a `dynamic` index. - - - -Note (no pun intended) that the `dynamic` index configuration accepts both `flat` and `hnsw` index configurations. Each index configuration is used when the `dynamic` index is in that state. - -### Full code snippet - -Here is the complete code snippet to create the collection. Take a look at the configuration and see if you agree with our choices. - -In our PoC, we create just the one collection (`"JournalEntry"`), with `"text"`, `"date"`, and `"tags"` properties to keep things simple. - - - -### Summary - -In this section, we set up a Weaviate instance with multi-tenancy enabled. We also created a collection for `MyPrivateJournal`, with multi-tenancy and a dynamic index configuration. - -Now, the `MyPrivateJournal` team can start onboarding users and storing their journal entries in the collection. 
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/multitenancy/300_tenant_data.mdx b/docs/academy/py/multitenancy/300_tenant_data.mdx deleted file mode 100644 index d5a753fe0..000000000 --- a/docs/academy/py/multitenancy/300_tenant_data.mdx +++ /dev/null @@ -1,171 +0,0 @@ ---- -title: Work with tenants & data ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!./_snippets/100_config.py'; - -The team at `MyPrivateJournal` has now set up a Weaviate instance with a multi-tenant collection. In this section, we will learn how to work with tenants and their data. - -## Add tenants and data - -Once a multi-tenant collection is created, you can manage its tenants through the `.tenant` namespace. - -Each end user in `MyPrivateJournal` will have their journal entries be backed their own tenant. This means that when a user is onboarded, a tenant is created for them. When a user is offboarded, their tenant is removed. - -### Create tenants - -When a user is onboarded, we can include this code to the onboarding logic. - - - -Tenant creation can be done in bulk, too. - -Let's say that `MyPrivateJournal` has a batch of new users to onboard, from an organization called `AcmeCorp`. They could create tenants for multiple users in one operation: - - - -:::caution Tenant names are case-sensitive -Tenant names are case-sensitive. This means that `steve`, `Steve` and `STEVE` are considered different tenants. -::: - -### Add data objects - -Working with a tenant in a multi-tenant collection is similar to working with a single-tenant collection. - -Let's see how `MyPrivateJournal` can work with data objects in a tenant as - -#### Single object insertion - -As an end user (e.g. 
Steve or `steve85`) writes a new journal entry, `MyPrivateJournal` can insert it into Steve's tenant. - - - -#### Batch data insertion - -And if Steve wants to import his journal entries from another system, `MyPrivateJournal` can insert them in a batch. - -
- Data being inserted - -The objects to be added can be a list of dictionaries, as shown here. Note the use of `datetime` objects with a timezone for `DATE` type properties. - - - -
- - - -#### Bonus: Auto tenant creation - -If `MyPrivateJournal` has enabled auto-tenant creation, they can insert data without explicitly creating a tenant. In this example, the specified tenant does not exist, but Weaviate will create it automatically. - -This allows `MyPrivateJournal` to delay creating a tenant until the user writes their first journal entry. - - - -## Query tenant data - -Once Steve's tenant has been created and populated with data, `MyPrivateJournal` can allow Steve to interact with his data. - -For an application like this, `MyPrivateJournal` might allow Steve to: - -- Retrieve his journal entries for a date range, and -- Search for entries - -The good news is that just like our data operations, retrieving and querying data in a tenant is very similar to working with a single-tenant collection. - -### Retrieve entries for a date range - -To retrieve Steve's journal entries for a specific date range, `MyPrivateJournal` can use a query like this: - - - -
- Example response - -Such a query should return a response like: - - -
- -### Search for entries - -Additionally, Steve might want to search for entries. For example - he might want to search for entries relating to some food experience that he had. - -`MyPrivateJournal` can leverage Weaviate's `hybrid` search to help Steve find the most relevant entries. - - - -
- Example response - - -
- -You can see that the search syntax is essentially identical to that of a single-tenant collection. So, any search method available for a single-tenant collection can be applied to a multi-tenant collection. - -### Summary - -In this section, we learned how to work with tenants and their data in a multi-tenant collection. We saw how to create tenants, add data objects, and query tenant data. - -In the next section, we will learn how `MyPrivateJournal` can keep their application running smoothly and efficiently by managing tenants. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - diff --git a/docs/academy/py/multitenancy/400_manage_tenants.mdx b/docs/academy/py/multitenancy/400_manage_tenants.mdx deleted file mode 100644 index 00fd82d95..000000000 --- a/docs/academy/py/multitenancy/400_manage_tenants.mdx +++ /dev/null @@ -1,200 +0,0 @@ ---- -title: Manage tenants ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!./_snippets/100_config.py'; - -## Tenant activity statuses - -As `MyPrivateJournal` grows the team notices that the resource requirements and associated costs are growing. - -Meanwhile, the team also notices these usage patterns: - -- Many users use the application for small parts of the day, and then log off. -- Some users are infrequent, accessing the service only occasionally. -- Some users drop off after a while, and their data is not accessed for a long time. - -Weaviate's multi-tenancy features can help `MyPrivateJournal` manage these usage patterns more effectively. Let's see how: - -### Introduction to tenant activity statuses - -But first, let's talk a little about tenant activity statuses. Up to now, you've worked with tenants without specifying an activity status. This creates an `ACTIVE` tenant by default. 
- -But a tenant can be set to have an `ACTIVE`, `INACTIVE`, or `OFFLOADED` status. - -| Tenant state | CRUD & Queries | Vector Index | Inverted Index | Object Data | Time to Activate | Tenant description | -|------------------|----------------|--------------|----------------|-------------|------------------|------------| -| Active (default) | **Yes** | Hot/Warm | Warm | Warm | None | Available for use | -| Inactive | **No** | Warm | Warm | Warm | Fast | Not available for use, stored locally | -| Offloaded | **No** | Cold | Cold | Cold | Slow | Not available for use, stored on the cloud | - -These states are mutable, letting you manage your tenants more effectively, balancing cost, availability and performance. - -### Manage tenant activity statuses - -So, you might start to see how `MyPrivateJournal` can leverage tenant activity statues to improve their operational efficiency. - -#### Deactivate a tenant - -Since `INACTIVE` tenants use less resources than `ACTIVE` tenants, `MyPrivateJournal` can deactivate tenants that are unlikely to be immediately accessed. - -One example is a user who's just logged out of the application. We could set their tenant to `INACTIVE` as a part of the logout process: - - - -#### Deactivate multiple tenants - -`MyPrivateJournal` can also deactivate multiple tenants at once. For example, they could deactivate all tenants for their local night time, or those who have not logged on for over 7 days: - - - -#### Offload tenants - -`MyPrivateJournal` can also offload tenants to the cloud. This is useful for tenants that are unlikely to be accessed in the near future. - -For example, they could offload tenants that have not been accessed for over 30 days: - - - -This will move the tenant's data to cold storage, freeing up hot and warm resources. In turn, the overall system requirements and cost will be lowered. - -
- How to set up offloading - -import OffloadingLimitation from '/_includes/offloading-limitation.mdx'; - - - -The ability to offload tenants to cold storage is a powerful feature that can help you manage your Weaviate instance's resource usage. -
To use tenant offloading in Weaviate, you need to enable a relevant offloading [module](../../../weaviate/configuration/modules.md).
- -#### AWS permissions - -You must provide Weaviate with AWS authentication details. You can choose between access-key or ARN-based authentication. -
- -:::tip Requirements -The Weaviate instance must have the [necessary permissions to access the S3 bucket](https://docs.aws.amazon.com/AmazonS3/latest/userguide/access-policy-language-overview.html). -- The provided AWS identity must be able to write to the bucket. -- If `OFFLOAD_S3_BUCKET_AUTO_CREATE` is set to `true`, the AWS identity must have permission to create the bucket. -::: - -**Option 1: With IAM and ARN roles** -
The offload module will first try to authenticate itself using AWS IAM.
- -**Option 2: With access key and secret access key** -
- -| Environment variable | Description | -| --- | --- | -| `AWS_ACCESS_KEY_ID` | The id of the AWS access key for the desired account. | -| `AWS_SECRET_ACCESS_KEY` | The secret AWS access key for the desired account. | -| `AWS_REGION` | (Optional) The AWS Region. If not provided, the module will try to parse `AWS_DEFAULT_REGION`. | - -Once the `offload-s3` module is enabled, you can offload tenants to the S3 bucket by [setting their activity status](#offload-tenants) to `OFFLOADED`, or load them back to local storage by setting their status to `ACTIVE` or `INACTIVE`. - -
- -#### Activate users - -And then, `MyPrivateJournal` can activate tenants as required. For example, they could activate a tenant when the user logs in, or based on their local time in an inverse pattern to deactivation: - - - -#### Leverage auto-activation - -There may also be cases where a user attempts to perform a query on an `INACTIVE` tenant. - -This may sound like a problem, but Weaviate can automatically activate the tenant for the query, if the collection was created with `auto_tenant_activation` enabled (which [we did do earlier](./200_setup.mdx#-create-a-collection)). - -## Offboard users - -As with all SaaS applications, `MyPrivateJournal` will need to offboard users from time to time. This could be due to a user request, or any other reason for account deletion. - -In a multi-tenant collection, offboarding a user can easily be done by removing their tenant. This will delete the tenant and all its data. - - - -The `MyPrivateJournal` engineers could set up its offboarding system to remove a user's tenant when they delete their account or request data deletion. This will ensure that the user's data is removed from the system. - -### Summary - -In this section, you learned how to manage tenants in a multi-tenant collection. You saw how to: - -- Update a tenant's activity status -- Offload tenants to cold storage -- Activate tenants as required -- Remove tenants from the system - -These features can help `MyPrivateJournal` manage its resource usage more effectively, and provide a better experience for its users. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/multitenancy/900_next_steps.mdx b/docs/academy/py/multitenancy/900_next_steps.mdx deleted file mode 100644 index a1ab47b42..000000000 --- a/docs/academy/py/multitenancy/900_next_steps.mdx +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: Wrap-up / Next steps ---- - -Congratulations! You have completed this course on multi-tenancy. 
We hope you found it helpful and informative. - -## Multi-tenant collections - -Multi-tenant collections are a powerful feature in Weaviate that allow you to manage multiple tenants in a single Weaviate instance. Each tenant has its own isolated data, while sharing the same schema and configuration. - -A multi-tenant collection is also much more efficient than creating multiple Weaviate collections. This allows you to scale your application more easily and cost-effectively. - -## Multi-tenancy and resource management - -While each tenant belongs to a collection and shares the same settings, you can still manage each tenant's resources individually. - -This can be done by setting the tenant's state (to `ACTIVE`, `INACTIVE`, or `OFFLOADED`), or setting the index type as `dynamic`, which allows each tenant to only use HNSW once they pass a certain threshold of objects. - -A carefully constructed strategy to manage the resources of each tenant can help you optimize your application's performance and cost-effectiveness. - -## Further resources - -These resources will help you continue your learning journey: - -- [Starter guide: Resource management](/weaviate/starter-guides/managing-resources/index.md) -- [Concepts: Multi-tenancy](/weaviate/concepts/data.md#multi-tenancy) -- [How-to: Manage collections](../../../weaviate/manage-collections/index.mdx) -- [How-to: Multi-tenant operations](/weaviate/manage-collections/multi-tenancy.mdx) -- [How-to: Manage tenant states](/weaviate/manage-collections/tenant-states.mdx) -- [Weaviate Academy: Vector indexes](../vector_index/index.md) - -import CTASocials from '../_snippets/cta_socials.mdx'; - - - -See you soon! 
👋 - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/multitenancy/_snippets/100_config.py b/docs/academy/py/multitenancy/_snippets/100_config.py deleted file mode 100644 index 327f91870..000000000 --- a/docs/academy/py/multitenancy/_snippets/100_config.py +++ /dev/null @@ -1,355 +0,0 @@ -# FullBasicMT -import weaviate -import os -from weaviate.classes.config import Configure, Property, DataType - -client = weaviate.connect_to_local( - headers={ - "X-Cohere-Api-Key": os.getenv("COHERE_APIKEY"), - } -) - -mt_collection_name = "JournalEntry" - -# END FullBasicMT - -client.collections.delete(mt_collection_name) - -# ================================================================================ -# MyPrivateJournal multi-tenancy configuration -# ================================================================================ - -client.close() - -client = weaviate.connect_to_local( - headers={ - "X-Cohere-Api-Key": os.getenv("COHERE_APIKEY"), - } -) - -client.collections.delete(mt_collection_name) - -# MTConfig # MTFullCollectionCreation -from weaviate.classes.config import Configure, Property, DataType - -mt_collection = client.collections.create( - name=mt_collection_name, # e.g. 
"JournalEntry" - # highlight-start - multi_tenancy_config=Configure.multi_tenancy( - enabled=True, - auto_tenant_creation=True, - auto_tenant_activation=True, - ), - # highlight-end - # END MTConfig # MTFullCollectionCreation - properties=[ - Property(name="text", data_type=DataType.TEXT), - Property(name="date", data_type=DataType.DATE), - Property(name="tags", data_type=DataType.TEXT_ARRAY), - ], - # DynamicIndexConfig # MTFullCollectionCreation - vector_config=[ - Configure.Vectors.text2vec_cohere( - name="text", - source_properties=["text"], - # highlight-start - vector_index_config=Configure.VectorIndex.dynamic( - hnsw=Configure.VectorIndex.hnsw( - quantizer=Configure.VectorIndex.Quantizer.sq(training_limit=50000) - ), - flat=Configure.VectorIndex.flat( - quantizer=Configure.VectorIndex.Quantizer.bq() - ), - threshold=10000 - ) - # highlight-end - ) - ], - # END DynamicIndexConfig # END MTVectorIndexConfig # MTFullCollectionCreation - generative_config=Configure.Generative.cohere(model="command-r-plus") - # MTConfig -) -# END MTConfig # END MTFullCollectionCreation - -# ================================================================================ -# Basic tenant creation -# ================================================================================ - -# BasicTenantCreation -# highlight-start -mt_collection.tenants.create("steve85") # Create a tenant, e.g. 
based on a username -# highlight-end -# END BasicTenantCreation - -# MultiTenantCreation -new_usernames = ["bob1", "alice2", "etienne3"] - -# highlight-start -mt_collection.tenants.create(new_usernames) # Create multiple tenants -# highlight-end -# END MultiTenantCreation - -# ================================================================================ -# Single object insertion -# ================================================================================ - -# SingleObjectInsertion -from datetime import datetime, timezone - -# Start with the collection object and specify the tenant -tenant = mt_collection.with_tenant("steve85") - -tenant.data.insert( - properties={ - "text": "What amazing food we had at Quay! It was totally worth it.", - "date": datetime(2024, 5, 15).replace(tzinfo=timezone.utc), - "tags": ["restaurant", "experience"], - } -) -# END SingleObjectInsertion - -# ================================================================================ -# Batch data insertion -# ================================================================================ - -# BatchDataToInsert -from datetime import datetime, timezone - -journal_entries = [ - { - "text": "The Top Gun sequel was amazing!", - "date": datetime(2022, 5, 30).replace(tzinfo=timezone.utc), - "tags": ["movie", "action"], - }, - { - "text": "Ahhh the Taylor Swift Eras concert in Denver was sooo much better than I could have hoped for!", - "date": datetime(2023, 7, 14).replace(tzinfo=timezone.utc), - "tags": ["music", "swifties", "concert"], - }, - { - "text": "After watching Kate McKinnon play Weird Barbie I totally feel seen.", - "date": datetime(2023, 7, 25).replace(tzinfo=timezone.utc), - "tags": ["movie", "barbie", "happy"], - }, - { - "text": "Spring is here and I'm loving the new flowers in the garden!", - "date": datetime(2024, 4, 5).replace(tzinfo=timezone.utc), - "tags": ["garden", "home"], - }, - { - "text": "I went to a cooking class and learned how to make sushi!", - "date": 
datetime(2024, 5, 16).replace(tzinfo=timezone.utc), - "tags": ["cooking", "hobby"], - }, - { - "text": "The new taco place in town is amazing!", - "date": datetime(2024, 7, 20).replace(tzinfo=timezone.utc), - "tags": ["food", "restaurant"], - }, -] -# END BatchDataToInsert - -# BasicBatchInsertion -tenant = mt_collection.with_tenant("steve85") - -with tenant.batch.fixed_size(100) as batch: - for journal_entry in journal_entries: - batch.add_object(journal_entry) -# END BasicBatchInsertion - -# ================================================================================ -# Auto tenant creation -# ================================================================================ - -# AutoTenantCreationAtInsert -nonexistent_tenant = mt_collection.with_tenant("newsteve15") - -nonexistent_tenant.data.insert({ - "date": datetime(2024, 7, 7).replace(tzinfo=timezone.utc), - "tags": ["events", "grand prix"], - "text": "Going to Silverstone was a dream come true!", -}) -# END AutoTenantCreationAtInsert - -# ================================================================================ -# Filtering by date range -# ================================================================================ - -# DateRangeQuery -from weaviate.classes.query import Filter -from datetime import datetime, timezone - -tenant = mt_collection.with_tenant("steve85") - -start_date = datetime(2023, 7, 1).replace(tzinfo=timezone.utc) -end_date = datetime(2023, 7, 31).replace(tzinfo=timezone.utc) - -response = tenant.query.fetch_objects( - filters=( - Filter.by_property("date").greater_or_equal(start_date) & - Filter.by_property("date").less_or_equal(end_date) - ), - limit=10 -) -# END DateRangeQuery - -for obj in response.objects: - print(obj.properties) - -""" -# ExampleResponseDateRange -{ - "text": "Ahhh the Taylor Swift Eras concert in Denver was sooo much better than I could have hoped for!", - "date": datetime.datetime(2023, 7, 14, 0, 0, tzinfo=datetime.timezone.utc), - "tags": ["music", 
"swifties", "concert"], -} -{ - "text": "After watching Kate McKinnon play Weird Barbie I totally feel seen.", - "date": datetime.datetime(2023, 7, 25, 0, 0, tzinfo=datetime.timezone.utc), - "tags": ["movie", "barbie", "happy"], -} -# END ExampleResponseDateRange -""" - -# ================================================================================ -# User query -# ================================================================================ - -# UserQuery -tenant = mt_collection.with_tenant("steve85") - -response = tenant.query.hybrid( - query="food experience", - limit=2 -) -# END UserQuery - -for obj in response.objects: - print(obj.properties) - -""" -# ExampleResponseUserQuery -{ - "date": datetime.datetime(2024, 5, 15, 0, 0, tzinfo=datetime.timezone.utc), - "tags": ["restaurant", "experience"], - "text": "What amazing food we had at Quay! It was totally worth it.", -} -{ - "date": datetime.datetime(2024, 5, 16, 0, 0, tzinfo=datetime.timezone.utc), - "tags": ["cooking", "hobby"], - "text": "I went to a cooking class and learned how to make sushi!", -} -# END ExampleResponseUserQuery -""" - -# ================================================================================ -# Deactivate a tenant -# ================================================================================ - -mt_collection.tenants.create(["travis1989"]) - -# UpdateOneTenantStatus -from weaviate.classes.tenants import Tenant, TenantActivityStatus - -mt_collection = client.collections.use(mt_collection_name) - -mt_collection.tenants.update( - Tenant(name="travis1989", activity_status=TenantActivityStatus.INACTIVE) -) -# END UpdateOneTenantStatus - - -# ================================================================================ -# Deactivate multiple tenants -# ================================================================================ - -inactive_users = [f"user{100+i}" for i in range(10)] - -# UpdateMultipleTenantStatuses -from weaviate.classes.tenants import Tenant, 
TenantActivityStatus - -mt_collection = client.collections.use(mt_collection_name) - -tenants_to_deactivate = [ - Tenant(name=user, activity_status=TenantActivityStatus.INACTIVE) - for user in inactive_users -] - -mt_collection.tenants.update(tenants_to_deactivate) -# END UpdateMultipleTenantStatuses - -tenant_names_to_offload = [] # List of tenants to offload - -# OffloadMultipleTenants -from weaviate.classes.tenants import Tenant, TenantActivityStatus - -mt_collection = client.collections.use(mt_collection_name) - -tenants_to_offload = [ - Tenant(name=user, activity_status=TenantActivityStatus.OFFLOADED) - for user in tenant_names_to_offload -] - -mt_collection.tenants.update(tenants_to_offload) -# END OffloadMultipleTenants - -tenant_names_to_activate = [] # List of tenants to offload - -# ActivateMultipleTenants -from weaviate.classes.tenants import Tenant, TenantActivityStatus - -mt_collection = client.collections.use(mt_collection_name) - -tenants_to_activate = [ - Tenant(name=user, activity_status=TenantActivityStatus.ACTIVE) - for user in tenant_names_to_activate -] - -mt_collection.tenants.update(tenants_to_activate) -# END ActivateMultipleTenants - -# ================================================================================ -# Remove tenants -# ================================================================================ - -# RemoveTenants -from weaviate.classes.tenants import Tenant - -mt_collection = client.collections.use(mt_collection_name) - -# Caution - this will remove all of the associated data for the tenants -mt_collection.tenants.remove([ - "depardieu10", - "travis1989", -]) -# END RemoveTenants - -# ================================================================================ -# Misc methods -# ================================================================================ - -# -mt_collection = client.collections.use(mt_collection_name) - -all_tenants = mt_collection.tenants.get() -for k, v in all_tenants.items(): - print(k, v) - 
-tenants = mt_collection.tenants.get_by_names(["bob1", "alice1"]) -for k, v in tenants.items(): - print(k, v) - -tenant = mt_collection.tenants.get_by_name("bob1") -print(tenant) - -print(mt_collection.tenants.exists("etienne1")) - - - - - - - - - -client.close() diff --git a/docs/academy/py/multitenancy/img/multi-tenancy-dark.png b/docs/academy/py/multitenancy/img/multi-tenancy-dark.png deleted file mode 100644 index 88449c466..000000000 Binary files a/docs/academy/py/multitenancy/img/multi-tenancy-dark.png and /dev/null differ diff --git a/docs/academy/py/multitenancy/img/multi-tenancy-light.png b/docs/academy/py/multitenancy/img/multi-tenancy-light.png deleted file mode 100644 index e2b5036dc..000000000 Binary files a/docs/academy/py/multitenancy/img/multi-tenancy-light.png and /dev/null differ diff --git a/docs/academy/py/multitenancy/index.md b/docs/academy/py/multitenancy/index.md deleted file mode 100644 index b5c80721b..000000000 --- a/docs/academy/py/multitenancy/index.md +++ /dev/null @@ -1,29 +0,0 @@ ---- -title: "280 Multi-tenancy (MT)" -description: "Implement multitenancy in Weaviate for isolated data environments in Python applications." -sidebar_position: 280 ---- - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; -import CourseUnits from '/src/components/Academy/courseUnits.jsx'; -import { courseData } from '/src/components/Academy/courseData.js' - -## Course overview - -:::info Pre-requisites -This course is self-contained. However, we recommend that you go through one of the 101-level courses, such as that for working with [text](../starter_text_data/index.md), [your own vectors](../starter_custom_vectors/index.md), or [multimodal data](../starter_multimodal_data/index.md). -::: - -Multi-tenancy allows you to create a Weaviate collection containing a high number of lightweight "tenants". - -Tenants are designed to house isolated, identical data structures. 
This is suitable for use cases such as software-as-a-service (SaaS) type applications, where each end user's data can be backed by a tenant. Tenants can be managed independently, and their data can be offloaded to cold storage to reduce memory and disk usage. - -This course introduces you to multi-tenancy. It teaches you how to enable and configure a multi-tenant collection in Weaviate, as well as how to work with tenants and tenant data. - -## Learning objectives - - - -## Units - - diff --git a/docs/academy/py/named_vectors/101_nv_preparation/index.mdx b/docs/academy/py/named_vectors/101_nv_preparation/index.mdx deleted file mode 100644 index 1a2a16705..000000000 --- a/docs/academy/py/named_vectors/101_nv_preparation/index.mdx +++ /dev/null @@ -1,164 +0,0 @@ ---- -title: Preparation ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/101_connect.py'; - -:::info Pre-requisites -This course is self-contained. However, we recommend that you go through one of the 101-level courses, such as that for working with [text](../../starter_text_data/index.md), [your own vectors](../../starter_custom_vectors/index.md), or [multimodal data](../../starter_multimodal_data/index.md). -::: - -This page briefly covers the required resources and setup, including the Weaviate Python client library, and a Weaviate instance with the multi-modal vectorizer. - -## Weaviate Python client library - -Install the latest (`v4`, e.g. `4.5.0`) Weaviate Python client library with: - -```bash -pip install -U weaviate-client -``` - -## Set up Weaviate - -Install Docker on your machine. We recommend following the [official Docker installation guide](https://docs.docker.com/get-docker/). - -Create a new directory and navigate to it in your terminal. 
Then, create a new file called `docker-compose.yml` and add the following content: - -```yaml ---- -services: - weaviate: - command: - - --host - - 0.0.0.0 - - --port - - '8080' - - --scheme - - http - image: cr.weaviate.io/semitechnologies/weaviate:||site.weaviate_version|| - ports: - - 8080:8080 - - 50051:50051 - volumes: - - weaviate_data:/var/lib/weaviate - restart: on-failure:0 - environment: - CLIP_INFERENCE_API: 'http://multi2vec-clip:8080' - QUERY_DEFAULTS_LIMIT: 25 - AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' - PERSISTENCE_DATA_PATH: '/var/lib/weaviate' - ENABLE_MODULES: 'multi2vec-clip' - ENABLE_API_BASED_MODULES: 'true' - CLUSTER_HOSTNAME: 'node1' - multi2vec-clip: - image: cr.weaviate.io/semitechnologies/multi2vec-clip:sentence-transformers-clip-ViT-B-32-multilingual-v1 - environment: - ENABLE_CUDA: '0' -volumes: - weaviate_data: -... - -``` - -### Create a Weaviate instance - -Run the following command to start Weaviate: - -```bash -docker compose up -``` - -### Your Weaviate instance details - -Once the instance is created, you can access it at `http://localhost:8080`. - -## Work with Weaviate - -### Connect to your Weaviate instance - -To connect to the Weaviate instance, use the `connect_to_local` function. We also provide API keys here for any inference APIs (e.g. OpenAI, Cohere, Google, AWS etc.) that Weaviate may use. - - - -### Check Weaviate status - -You can check whether the Weaviate instance is up using the `is_live` function. - - - -### Retrieve server meta information - -You can retrieve meta information about the Weaviate instance using the `meta` function. - - - -This will print the server meta information to the console. The output will look similar to the following: - -
- Example get_meta output - -Note that this output is a little longer due to the additional details from the CLIP models. - - -
- -### Close the connection - -After you have finished using the Weaviate client, you should close the connection. This frees up resources and ensures that the connection is properly closed. - -We suggest using a `try`-`finally` block as a best practice. For brevity, we will not include the `try`-`finally` blocks in the remaining code snippets. - - - -## Source data - -We are going to use a movie dataset sourced from [TMDB](https://www.themoviedb.org/). The dataset can be found in this [GitHub repository](https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024.json), and it contains bibliographic information on ~700 movies released between 1990 and 2024. - -As a multimodal project, we'll also use [corresponding posters for each movie](https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024_posters.zip), which are available in the same repository. - -
- See sample text data - -| | backdrop_path | genre_ids | id | original_language | original_title | overview | popularity | poster_path | release_date | title | video | vote_average | vote_count | -|---:|:---------------------------------|:----------------|-----:|:--------------------|:----------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------:|:---------------------------------|:---------------|:----------------------------|:--------|---------------:|-------------:| -| 0 | /3Nn5BOM1EVw1IYrv6MsbOS6N1Ol.jpg | [14, 18, 10749] | 162 | en | Edward Scissorhands | A small suburban town receives a visit from a castaway unfinished science experiment named Edward. | 45.694 | /1RFIbuW9Z3eN9Oxw2KaQG5DfLmD.jpg | 1990-12-07 | Edward Scissorhands | False | 7.7 | 12305 | -| 1 | /sw7mordbZxgITU877yTpZCud90M.jpg | [18, 80] | 769 | en | GoodFellas | The true story of Henry Hill, a half-Irish, half-Sicilian Brooklyn kid who is adopted by neighbourhood gangsters at an early age and climbs the ranks of a Mafia family under the guidance of Jimmy Conway. | 57.228 | /aKuFiU82s5ISJpGZp7YkIr3kCUd.jpg | 1990-09-12 | GoodFellas | False | 8.5 | 12106 | -| 2 | /6uLhSLXzB1ooJ3522ydrBZ2Hh0W.jpg | [35, 10751] | 771 | en | Home Alone | Eight-year-old Kevin McCallister makes the most of the situation after his family unwittingly leaves him behind when they go on Christmas vacation. But when a pair of bungling burglars set their sights on Kevin's house, the plucky kid stands ready to defend his territory. 
By planting booby traps galore, adorably mischievous Kevin stands his ground as his frantic mother attempts to race home before Christmas Day. | 3.538 | /onTSipZ8R3bliBdKfPtsDuHTdlL.jpg | 1990-11-16 | Home Alone | False | 7.4 | 10599 | -| 3 | /vKp3NvqBkcjHkCHSGi6EbcP7g4J.jpg | [12, 35, 878] | 196 | en | Back to the Future Part III | The final installment of the Back to the Future trilogy finds Marty digging the trusty DeLorean out of a mineshaft and looking for Doc in the Wild West of 1885. But when their time machine breaks down, the travelers are stranded in a land of spurs. More problems arise when Doc falls for pretty schoolteacher Clara Clayton, and Marty tangles with Buford Tannen. | 28.896 | /crzoVQnMzIrRfHtQw0tLBirNfVg.jpg | 1990-05-25 | Back to the Future Part III | False | 7.5 | 9918 | -| 4 | /3tuWpnCTe14zZZPt6sI1W9ByOXx.jpg | [35, 10749] | 114 | en | Pretty Woman | When a millionaire wheeler-dealer enters a business contract with a Hollywood hooker Vivian Ward, he loses his heart in the bargain. | 97.953 | /hVHUfT801LQATGd26VPzhorIYza.jpg | 1990-03-23 | Pretty Woman | False | 7.5 | 7671 | - -
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/named_vectors/102_nv_collections/20_create_collection.mdx b/docs/academy/py/named_vectors/102_nv_collections/20_create_collection.mdx deleted file mode 100644 index 49b0853fe..000000000 --- a/docs/academy/py/named_vectors/102_nv_collections/20_create_collection.mdx +++ /dev/null @@ -1,64 +0,0 @@ ---- -title: Create a collection ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/102_collection.py'; - -To use named vectors, your collection must be configured with named vector definitions. - -## Code - -This example creates a collection for the movie data, including multiple named vector definitions: - - - -## Explain the code - -The key difference here is the use of the `NamedVectors` class to define vectorizer configurations. Let's review the code in further detail: - -:::tip Revision -This code builds on the [multimodal](../../starter_multimodal_data/102_mm_collections/20_create_collection.mdx) example. Review that example for further explanations. -::: - -### Named vector configuration - -This definition allows each object to be represented by three vectors, named `title`, `overview` and `poster_title`. - - - -#### `title` - -This vector representation is generated from the `title` property (`source_properties`). The `text2vec-openai` module is used for vectorization. - -You could use this to search for movies by similarities to their titles. - -#### `overview` - -This vector representation is based on the `overview` property. As such, you could use this to search for movies by similarities to their plot or key ideas. - -#### `poster_title` - -This vector representation is generated from a combination of the `title` and `poster` properties. The `multi2vec-clip` module is used for vectorization. 
- -Note that the majority of the vector weight is given to the `poster` property (90%), and the rest to the `title` property (10%). This means that the vector representation will be more influenced by the poster than the title. - -As this uses a multimodal vectorizer, you could use this to search for movies using any image, or text, by their similarity to the title or poster. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/named_vectors/102_nv_collections/30_import_data.mdx b/docs/academy/py/named_vectors/102_nv_collections/30_import_data.mdx deleted file mode 100644 index 9cab12c38..000000000 --- a/docs/academy/py/named_vectors/102_nv_collections/30_import_data.mdx +++ /dev/null @@ -1,52 +0,0 @@ ---- -title: Import data -description: "Importing Data into Named Vectors Collections" ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/102_collection.py'; - -## Code - -This example imports the movie data into our collection. - -Amazingly, the exact same code that we used for single vector configuration [in the multimodal course](../../starter_multimodal_data/102_mm_collections/20_create_collection.mdx) can be used here. This is because the named vector configuration is set up in the collection definition, and Weaviate handles the rest. - - - -The code: -- Loads the source text and image data -- Gets the collection -- Enters a context manager with a batcher (`batch`) object -- Loops through the data and: - - Finds corresponding image to the text - - Converts the image to base64 - - Adds objects to the batcher -- Prints out any import errors - -We won't repeat the explanation of the code here, as it is the same as in the multimodal course. 
If you would like a refresher, please review the [multimodal course](../../starter_multimodal_data/102_mm_collections/20_create_collection.mdx). - -## Where do the vectors come from? - -When the batcher sends the queue to Weaviate, the objects are added to the collection. In our case, the movie collection. - -In this case, recall that we have three named vectors for each object - `title`, `overview` and `poster_title`. The vectors are generated by the vectorizers that we set up in the collection definition. - -- The `title` vector is generated by the `text2vec-openai` vectorizer -- The `overview` vector is generated by the `text2vec-openai` vectorizer -- The `poster_title` vector is generated by the `multi2vec-clip` vectorizer - -Next, we will explore how these named vectors provide flexibility in searching for our data. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/named_vectors/102_nv_collections/index.mdx b/docs/academy/py/named_vectors/102_nv_collections/index.mdx deleted file mode 100644 index a39db460d..000000000 --- a/docs/academy/py/named_vectors/102_nv_collections/index.mdx +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: Populate the database -description: Organize named vector collections for optimized search in Weaviate. 
---- - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/named_vectors/103_nv_queries/10_searches.mdx b/docs/academy/py/named_vectors/103_nv_queries/10_searches.mdx deleted file mode 100644 index 00c15f87e..000000000 --- a/docs/academy/py/named_vectors/103_nv_queries/10_searches.mdx +++ /dev/null @@ -1,183 +0,0 @@ ---- -title: Searches ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/103_searches.py'; - -As collections with named vectors can include multiple vector embeddings, any vector or similarity search must specify a "target" vector. - -This applies for `near_text` and `near_vector` searches, as well as multimodal searches such as `near_image` and so on. Let's explore a few examples here. - -## Text searches - -### Code - -Here, we look for entries in "MovieNVDemo" based on their similarity to the phrase `"A joyful holiday film"`. Note, however, that we show multiple versions of the same query, each with a different `target_vector` parameter: - - - - - - - - - - - - - -### Explain the code - -Each named vector here is based on a different property of the movie data. - -The first search compares the meaning of the movie **title** with the query, the second search compares the **entire summary** (overview) with the query, and the third compares the **poster (and the title)** with the query. - -Weaviate also allows each named vector to be set with a different vectorizer. You will recall that the `poster_title` vector is created by the CLIP models, while the `title` and `overview` properties are created by the OpenAI model. 
- -As a result, each named vector can be further specialized by using the right model for the right property. - -### Explain the results - -The results of each search are different, as they are based on different properties of the movie data. - -#### `title` vs `overview` - -Note that the search with the `overview` target vector includes titles like "Home Alone" and "Home Alone 2: Lost in New York", which are not included in the other searches. - -This is because the plots of these movies are holiday-themed, even though the titles are not obviously joyful or holiday-related. - -#### `poster` - -The search with `poster_title` target vector interestingly includes "Misery" - the Stephen King horror movie! This is very likely because the poster of the movie is a snowy scene. And since the CLIP vectorizer is trained to identify elements of images, it identifies this terrifying film as a result of the search. - -Given the imagery of the poster only and no other context, you would have to say that the search isn't wrong, even though anyone who's read the book or watched the movie would agree. - - - - -```text -How the Grinch Stole Christmas 2000 8871 -Distance to query: 0.162 - -The Nightmare Before Christmas 1993 9479 -Distance to query: 0.177 - -The Pursuit of Happyness 2006 1402 -Distance to query: 0.182 - -Jingle All the Way 1996 9279 -Distance to query: 0.184 - -Mrs. 
Doubtfire 1993 788 -Distance to query: 0.189 -``` - - - - -```text -How the Grinch Stole Christmas 2000 8871 -Distance to query: 0.148 - -Home Alone 1990 771 -Distance to query: 0.164 - -Onward 2020 508439 -Distance to query: 0.172 - -Home Alone 2: Lost in New York 1992 772 -Distance to query: 0.175 - -Little Miss Sunshine 2006 773 -Distance to query: 0.176 -``` - - - - -Posters for the top 5 matches: -Life Is Beautiful -Groundhog Day -Jingle All the Way -Training Day -Misery - -```text -Life Is Beautiful 1997 637 -Distance to query: 0.621 - -Groundhog Day 1993 137 -Distance to query: 0.623 - -Jingle All the Way 1996 9279 -Distance to query: 0.625 - -Training Day 2001 2034 -Distance to query: 0.627 - -Misery 1990 1700 -Distance to query: 0.632 -``` - - - - -## Hybrid search - -### Code - -This example finds entries in "MovieNVDemo" with the highest hybrid search scores for the term "history", and prints out the title and release year of the top 5 matches. - - - -### Explain the code - -Hybrid search with named vectors works the same way as other vector searches with named vectors. You must provide a `target_vector` parameter to specify the named vector for the vector search component of the hybrid search. - -## Keyword search - -As named vectors affect the vector representations of objects, they do not affect keyword searches. You can perform keyword searches on named vector collections using the same syntax as you would for any other collections. - -## Named vectors in search - -The use of named vectors enables flexible search options that can be tailored to your needs. - -Each object can have as many named vectors as you would like, with any combinations of properties and vectorizers, or even multiple custom vectors provided by your own models. - -This flexibility allows you to create databases with vector representations that are tailored to your specific use case, and to search for similar items based on any combination of properties. - -## What about RAG? 
- -RAG, or retrieval augmented generation, queries with named vectors work the same way as with other vector searches with named vectors. You must provide a `target_vector` parameter to specify the named vector for the vector search component of the RAG query. - -This, in turn, can improve the quality of the generation. Let's explore a few examples in the [next section](./20_use_cases.mdx). - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/named_vectors/103_nv_queries/20_use_cases.mdx b/docs/academy/py/named_vectors/103_nv_queries/20_use_cases.mdx deleted file mode 100644 index f874f291b..000000000 --- a/docs/academy/py/named_vectors/103_nv_queries/20_use_cases.mdx +++ /dev/null @@ -1,132 +0,0 @@ ---- -title: Use cases ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/104_usecase.py'; - -RAG, or retrieval augmented generation, is a powerful feature that combines the strengths of both vector search and language generation. Named vectors can be used in RAG queries to improve workflow and results. - -In this section, we'll explore a few examples of how named vectors allow different users to search and generate results based on their specific needs, using the same collection. - -## Design agency: evaluating a poster design - -Imagine a design agency (*Aesthetico*) that is contracted to work on the poster design for a new movie. - -Aesthetico's designers have arrived at [this film poster design](https://upload.wikimedia.org/wikipedia/commons/7/78/1927_Boris_Bilinski_%281900-1948%29_Plakat_f%C3%BCr_den_Film_Metropolis%2C_Staatliche_Museen_zu_Berlin.jpg). They would now like to see how their poster compares with other movie posters in existence, and what types of movies those posters are for. 
- -Metropolis poster - -Luckily for them, the `MovieNVDemo` collection has `poster_title` named vectors which is primarily based on the poster design. So Aesthetico's designers can search against the `poster_title` named vector and find movies that are similar to their poster design. And, they can then perform RAG to summarize the movies that are found. - -### Code - -This query will find similar movies to the input image, and then provide insights using RAG. - - - -### Output - -This is an example output of the RAG query: - -> These movies can be categorized as action, science fiction, thriller, and drama. The audience for these movies would likely be fans of action-packed films with elements of suspense, mystery, and fantastical creatures or scenarios. These movies may appeal to a wide range of viewers, including fans of superhero movies, science fiction enthusiasts, and those who enjoy intense and thrilling storylines. - -The designers at Aesthetico could use this to understand the types of movies that are similar to their poster design, and further inform their own design choices. - -
- Search results - -Predator 2
-Inception
-Mission: Impossible
-The Dark Knight
-Lost in Translation
-Independence Day
-Godzilla vs. Kong
-Fargo
-The Amazing Spider-Man
-Godzilla
- -
- -## Film writers: evaluating ideas - -Now, in another project, a set of writers at *ScriptManiacs* are working on a movie script for a science fiction film. They are working on a few ideas for the movie title, and they want to see what kinds of imagery and themes are associated with each title. - -They could also use the same collection to do what they want to do. In fact, they could run multiple queries against the same collection, each with a different `target_vector` parameter. - -The ScriptManiacs writers can: -- Search against the `title` named vector to find movies with *similar titles*; -- Search against the `overview` named vector to find movies whose *plots are similar* to their title idea; and - -Let's see how they could do it for a title - "Chrono Tides: The Anomaly Rift". - -### Code - -This example finds entries in "MovieNVDemo" based on their similarity to "Chrono Tides: The Anomaly Rift", then instructs the large language model to find commonalities between them. - -Note the `for tgt_vector` loop, which allows the writers to run the same query against different named vectors. - - - -### Output - -The two queries produced quite different outputs to each other. When we search for titles most similar to "Chrono Tides: The Anomaly Rift", the results skew towards action/adventure films, while the results of searching for an overview most similar to "Chrono Tides: The Anomaly Rift" include science fiction and adventure films. - -Given the different results, the writers at ScriptManiacs could use this to understand the different themes and genres that are associated with their title idea, and further inform their own writing choices. - -For example, if "Chrono Tides: The Anomaly Rift" is intended to be a science fiction film with an action/adventure skew, the title may be a good one. On the other hand, if the writers are looking for a more dramatic or romantic theme, they may need to reconsider the title. 
- -The results of the overview search include multiple science-fiction and adventure films, indicating that the writers are on the right path of naming a science fiction movie as such. - -#### Similar titles - -According to our RAG query, movies with similar titles to "Chrono Tides: The Anomaly Rift" have the following commonalities: - -> These movies are **action/adventure films** that are likely aimed at a wide audience, including fans of fantasy, adventure, and romance genres. The Pirates of the Caribbean and Lara Croft movies are targeted towards fans of swashbuckling adventures and treasure hunting, while The Croods appeals to families and fans of animated films. The Twilight Saga targets fans of supernatural romance, and Meg 2: The Trench is aimed at fans of underwater thrillers and action movies. Overall, these movies cater to audiences who enjoy high-stakes adventures, fantastical elements, and dramatic storylines. - -
- Search results - -Pirates of the Caribbean: On Stranger Tides
-Lara Croft: Tomb Raider
-The Croods: A New Age
-The Twilight Saga: Breaking Dawn - Part 1
-Meg 2: The Trench
- -
- -#### Similar overviews - -While movies with overviews that are most similar to the search "Chrono Tides: The Anomaly Rift" have the following commonalities: - -> These movies can be categorized as **science fiction and adventure films**. They are aimed at audiences who enjoy stories about space exploration, ancient civilizations, dinosaurs, natural disasters, and mythical adventures. The target audience may include fans of action-packed and visually stunning movies with elements of fantasy and suspense. - -
- Search results - -Stargate
-Interstellar
-Jurassic Park III
-2012
-Moana
- -
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/named_vectors/103_nv_queries/index.mdx b/docs/academy/py/named_vectors/103_nv_queries/index.mdx deleted file mode 100644 index c4ff28fa8..000000000 --- a/docs/academy/py/named_vectors/103_nv_queries/index.mdx +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: Perform queries -description: Query and organize named vectors seamlessly in Weaviate. ---- - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/named_vectors/900_next_steps.mdx b/docs/academy/py/named_vectors/900_next_steps.mdx deleted file mode 100644 index 7abd9b9b0..000000000 --- a/docs/academy/py/named_vectors/900_next_steps.mdx +++ /dev/null @@ -1,25 +0,0 @@ ---- -title: Next steps ---- - -Congratulations! You have completed this course on named vectors. We hope you found it helpful and informative. - -There are many more resources available to help you continue your learning journey with named vectors. - -## Documentation - -- [Collection definition: named vectors](/weaviate/config-refs/collections#named-vectors): References on how to define named vectors in the collection. -- The [How-to: Manage collections](/weaviate/manage-collections/index.mdx) and [How-to: Manage objects](/weaviate/manage-objects/index.mdx) guides show how to perform data operations (i.e. create, read, update, delete collections and objects within them), including those with named vectors. -- [How-to: search](/weaviate/search/index.mdx): Code examples for all types of search operations, including those with named vectors. - -import CTASocials from '../_snippets/cta_socials.mdx'; - - - -See you soon! 
👋 - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/named_vectors/_snippets/101_connect.py b/docs/academy/py/named_vectors/_snippets/101_connect.py deleted file mode 100644 index c2d1018de..000000000 --- a/docs/academy/py/named_vectors/_snippets/101_connect.py +++ /dev/null @@ -1,531 +0,0 @@ -# DockerInstantiation -import weaviate - -client = weaviate.connect_to_local() -# END DockerInstantiation - -client.close() - -# DockerAPIKeyInstantiation -import weaviate -import os - -headers = { - "X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY") -} # Replace with your own API keys - -client = weaviate.connect_to_local(headers=headers) -# END DockerAPIKeyInstantiation - -# PollLiveness -assert client.is_live() # This will raise an exception if the client is not live -# END PollLiveness - - -# GetMeta -import json - -metainfo = client.get_meta() -print(json.dumps(metainfo, indent=2)) # Print the meta information in a readable format -# END GetMeta - - -""" -# OutputGetMeta -{ - "hostname": "http://[::]:8080", - "modules": { - "generative-openai": { - "documentationHref": "https://platform.openai.com/docs/api-reference/completions", - "name": "Generative Search - OpenAI" - }, - "multi2vec-clip": { - "clip_model": { - "_commit_hash": null, - "_name_or_path": "/root/.cache/torch/sentence_transformers/sentence-transformers_clip-ViT-B-32/0_CLIPModel", - "add_cross_attention": false, - "architectures": [ - "CLIPModel" - ], - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0, - "do_sample": false, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "id2label": { - "0": "LABEL_0", - "1": 
"LABEL_1" - }, - "initializer_factor": 1, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "length_penalty": 1, - "logit_scale_init_value": 2.6592, - "max_length": 20, - "min_length": 0, - "model_type": "clip", - "no_repeat_ngram_size": 0, - "num_beam_groups": 1, - "num_beams": 1, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "prefix": null, - "problem_type": null, - "projection_dim": 512, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1, - "text_config": { - "_name_or_path": "", - "add_cross_attention": false, - "architectures": null, - "attention_dropout": 0, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": 0, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0, - "do_sample": false, - "dropout": 0, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": 2, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "gradient_checkpointing": false, - "hidden_act": "quick_gelu", - "hidden_size": 512, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "initializer_factor": 1, - "initializer_range": 0.02, - "intermediate_size": 2048, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "layer_norm_eps": 1e-05, - "length_penalty": 1, - "max_length": 20, - "max_position_embeddings": 77, - "min_length": 0, - "model_type": "clip_text_model", - "no_repeat_ngram_size": 0, - "num_attention_heads": 8, - "num_beam_groups": 1, - "num_beams": 1, - "num_hidden_layers": 
12, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": 1, - "prefix": null, - "problem_type": null, - "projection_dim": 512, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1, - "torch_dtype": null, - "torchscript": false, - "transformers_version": "4.30.2", - "typical_p": 1, - "use_bfloat16": false, - "vocab_size": 49408 - }, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1, - "torch_dtype": "torch.float32", - "torchscript": false, - "transformers_version": null, - "typical_p": 1, - "use_bfloat16": false, - "vision_config": { - "_name_or_path": "", - "add_cross_attention": false, - "architectures": null, - "attention_dropout": 0, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0, - "do_sample": false, - "dropout": 0, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "gradient_checkpointing": false, - "hidden_act": "quick_gelu", - "hidden_size": 768, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "image_size": 224, - "initializer_factor": 1, - "initializer_range": 0.02, - "intermediate_size": 3072, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - 
"layer_norm_eps": 1e-05, - "length_penalty": 1, - "max_length": 20, - "min_length": 0, - "model_type": "clip_vision_model", - "no_repeat_ngram_size": 0, - "num_attention_heads": 12, - "num_beam_groups": 1, - "num_beams": 1, - "num_channels": 3, - "num_hidden_layers": 12, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "patch_size": 32, - "prefix": null, - "problem_type": null, - "projection_dim": 512, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1, - "torch_dtype": null, - "torchscript": false, - "transformers_version": "4.30.2", - "typical_p": 1, - "use_bfloat16": false - } - }, - "text_model": { - "_commit_hash": null, - "_name_or_path": "./models/text/0_CLIPModel", - "add_cross_attention": false, - "architectures": [ - "CLIPModel" - ], - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0, - "do_sample": false, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "initializer_factor": 1, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "length_penalty": 1, - "logit_scale_init_value": 2.6592, - "max_length": 20, - "min_length": 0, - "model_type": "clip", - "no_repeat_ngram_size": 0, - 
"num_beam_groups": 1, - "num_beams": 1, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "prefix": null, - "problem_type": null, - "projection_dim": 512, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1, - "text_config": { - "_name_or_path": "", - "add_cross_attention": false, - "architectures": null, - "attention_dropout": 0, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": 0, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0, - "do_sample": false, - "dropout": 0, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": 2, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "gradient_checkpointing": false, - "hidden_act": "quick_gelu", - "hidden_size": 512, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "initializer_factor": 1, - "initializer_range": 0.02, - "intermediate_size": 2048, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "layer_norm_eps": 1e-05, - "length_penalty": 1, - "max_length": 20, - "max_position_embeddings": 77, - "min_length": 0, - "model_type": "clip_text_model", - "no_repeat_ngram_size": 0, - "num_attention_heads": 8, - "num_beam_groups": 1, - "num_beams": 1, - "num_hidden_layers": 12, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": 1, - "prefix": null, - "problem_type": null, - "projection_dim": 512, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1, - 
"return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1, - "torch_dtype": null, - "torchscript": false, - "transformers_version": "4.30.2", - "typical_p": 1, - "use_bfloat16": false, - "vocab_size": 49408 - }, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1, - "torch_dtype": "torch.float32", - "torchscript": false, - "transformers_version": null, - "typical_p": 1, - "use_bfloat16": false, - "vision_config": { - "_name_or_path": "", - "add_cross_attention": false, - "architectures": null, - "attention_dropout": 0, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0, - "do_sample": false, - "dropout": 0, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "gradient_checkpointing": false, - "hidden_act": "quick_gelu", - "hidden_size": 768, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "image_size": 224, - "initializer_factor": 1, - "initializer_range": 0.02, - "intermediate_size": 3072, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "layer_norm_eps": 1e-05, - "length_penalty": 1, - "max_length": 20, - "min_length": 0, - "model_type": "clip_vision_model", - "no_repeat_ngram_size": 0, - "num_attention_heads": 12, - "num_beam_groups": 1, - "num_beams": 1, - "num_channels": 3, - "num_hidden_layers": 12, - "num_return_sequences": 1, - 
"output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "patch_size": 32, - "prefix": null, - "problem_type": null, - "projection_dim": 512, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1, - "torch_dtype": null, - "torchscript": false, - "transformers_version": "4.30.2", - "typical_p": 1, - "use_bfloat16": false - } - } - }, - "text2vec-openai": { - "documentationHref": "https://platform.openai.com/docs/guides/embeddings/what-are-embeddings", - "name": "OpenAI Module" - }, - }, - "version": "1.23.9" -} -# END OutputGetMeta -""" - - -client.close() - - -# TryFinallyCloseDemo -import weaviate -import os - -# END TryFinallyCloseDemo - -client = weaviate.connect_to_local() - -# TryFinallyCloseDemo -# Instantiate your client (not shown). 
e.g.: -# client = weaviate.connect_to_local() - -try: - # Work with the client here - e.g.: - assert client.is_live() - pass - -finally: # This will always be executed, even if an exception is raised - client.close() # Close the connection & release resources -# END TryFinallyCloseDemo diff --git a/docs/academy/py/named_vectors/_snippets/102_collection.py b/docs/academy/py/named_vectors/_snippets/102_collection.py deleted file mode 100644 index 63eddb9bd..000000000 --- a/docs/academy/py/named_vectors/_snippets/102_collection.py +++ /dev/null @@ -1,155 +0,0 @@ -# CreateMovieCollection -import weaviate - -# CreateMovieCollection # SubmoduleImport -import weaviate.classes.config as wc - -# CreateMovieCollection # END SubmoduleImport - -# END CreateMovieCollection -client = weaviate.connect_to_local(port=8280, grpc_port=50251) - -# CreateMovieCollection -# Instantiate your client (not shown). e.g.: -# headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")} # Replace with your OpenAI API key -# client = weaviate.connect_to_local(headers=headers) - -# END CreateMovieCollection - -# Actual instantiation - -client.collections.delete("MovieNVDemo") - -# CreateMovieCollection -client.collections.create( - name="MovieNVDemo", # The name of the collection ('NV' for named vectors) - properties=[ - wc.Property(name="title", data_type=wc.DataType.TEXT), - wc.Property(name="overview", data_type=wc.DataType.TEXT), - wc.Property(name="vote_average", data_type=wc.DataType.NUMBER), - wc.Property(name="genre_ids", data_type=wc.DataType.INT_ARRAY), - wc.Property(name="release_date", data_type=wc.DataType.DATE), - wc.Property(name="tmdb_id", data_type=wc.DataType.INT), - wc.Property(name="poster", data_type=wc.DataType.BLOB), - ], - # Define & configure the vectorizer module - vector_config=[ - # NamedVectorConfig # CreateMovieCollection - # Vectorize the movie title - wc.Configure.Vectors.text2vec_openai( - name="title", source_properties=["title"] - ), - # Vectorize the movie 
overview (summary) - wc.Configure.Vectors.text2vec_openai( - name="overview", source_properties=["overview"] - ), - # Vectorize the movie poster & title - wc.Configure.Vectors.multi2vec_clip( - name="poster_title", - image_fields=[ - wc.Multi2VecField(name="poster", weight=0.9) - ], # 90% of the vector is from the poster - text_fields=[ - wc.Multi2VecField(name="title", weight=0.1) - ], # 10% of the vector is from the title - ), - # END NamedVectorConfig # CreateMovieCollection - ], - # Define the generative module - generative_config=wc.Configure.Generative.openai(), -) - -client.close() -# END CreateMovieCollection - - -# BatchImportData -import weaviate -import pandas as pd -import requests -from datetime import datetime, timezone -import json -from weaviate.util import generate_uuid5 -from tqdm import tqdm -import os -import zipfile -from pathlib import Path -import base64 - -# END BatchImportData -headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")} -client = weaviate.connect_to_local(port=8280, grpc_port=50251, headers=headers) - -# BatchImportData -# Instantiate your client (not shown). 
e.g.: -# client = weaviate.connect_to_local() - -# END BatchImportData - -# BatchImportData -data_url = "https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024.json" -resp = requests.get(data_url) -df = pd.DataFrame(resp.json()) - -# Create a directory for the images -img_dir = Path("scratch/imgs") -img_dir.mkdir(parents=True, exist_ok=True) - -# Download images -posters_url = "https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024_posters.zip" -posters_path = img_dir / "movies_data_1990_2024_posters.zip" -posters_path.write_bytes(requests.get(posters_url).content) - -# Unzip the images -with zipfile.ZipFile(posters_path, "r") as zip_ref: - zip_ref.extractall(img_dir) - -# Get the collection -movies = client.collections.use("MovieNVDemo") - -# END BatchImportData - -# df = df[:50] # Limit to 50 for testing purposes - -# BatchImportData -# Enter context manager -with movies.batch.fixed_size(50) as batch: - # Loop through the data - for i, movie in tqdm(df.iterrows()): - # Convert data types - # Convert a JSON date to `datetime` and add time zone information - release_date = datetime.strptime(movie["release_date"], "%Y-%m-%d").replace( - tzinfo=timezone.utc - ) - # Convert a JSON array to a list of integers - genre_ids = json.loads(movie["genre_ids"]) - # Convert image to base64 - img_path = img_dir / f"{movie['id']}_poster.jpg" - with open(img_path, "rb") as file: - poster_b64 = base64.b64encode(file.read()).decode("utf-8") - - # Build the object payload - movie_obj = { - "title": movie["title"], - "overview": movie["overview"], - "vote_average": movie["vote_average"], - "genre_ids": genre_ids, - "release_date": release_date, - "tmdb_id": movie["id"], - "poster": poster_b64, - } - - # Add object to batch queue - batch.add_object( - properties=movie_obj, - uuid=generate_uuid5(movie["id"]), - ) - # Batcher automatically sends batches - -# Check for failed objects -if 
len(movies.batch.failed_objects) > 0: - print(f"Failed to import {len(movies.batch.failed_objects)} objects") - for failed in movies.batch.failed_objects: - print(f"e.g. Failed to import object with error: {failed.message}") - -client.close() diff --git a/docs/academy/py/named_vectors/_snippets/103_searches.py b/docs/academy/py/named_vectors/_snippets/103_searches.py deleted file mode 100644 index 8c959955f..000000000 --- a/docs/academy/py/named_vectors/_snippets/103_searches.py +++ /dev/null @@ -1,204 +0,0 @@ -# START-ANY -import weaviate -import weaviate.classes.query as wq -import os - -# END-ANY - -# FilteredSemanticSearch -from datetime import datetime - -# END FilteredSemanticSearch - -# START-ANY - -# END-ANY - -headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")} -client = weaviate.connect_to_local( - port=8280, - grpc_port=50251, - headers=headers -) - -# START-ANY -# Instantiate your client (not shown). e.g.: -# headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")} # Replace with your OpenAI API key -# client = weaviate.connect_to_local(headers=headers) - -# END-ANY - - -# NVTitleSearch -# Get the collection -movies = client.collections.use("MovieNVDemo") - -# Perform query -response = movies.query.near_text( - query="A joyful holiday film", - # highlight-start - target_vector="title", # The target vector to search against - # highlight-end - limit=5, - return_metadata=wq.MetadataQuery(distance=True), - return_properties=["title", "release_date", "tmdb_id", "poster"] -) - -# Inspect the response -for o in response.objects: - print( - o.properties["title"], o.properties["release_date"].year, o.properties["tmdb_id"] - ) # Print the title and release year (note the release date is a datetime object) - print( - f"Distance to query: {o.metadata.distance:.3f}\n" - ) # Print the distance of the object from the query - -client.close() -# END NVTitleSearch - - -print("\n\n") - -client.connect() - - -# NVOverviewSearch -# Get the collection -movies = 
client.collections.use("MovieNVDemo") - -# Perform query -response = movies.query.near_text( - query="A joyful holiday film", - # highlight-start - target_vector="overview", # The target vector to search against - # highlight-end - limit=5, - return_metadata=wq.MetadataQuery(distance=True), - return_properties=["title", "release_date", "tmdb_id", "poster"] -) - -# Inspect the response -for o in response.objects: - print( - o.properties["title"], o.properties["release_date"].year, o.properties["tmdb_id"] - ) # Print the title and release year (note the release date is a datetime object) - print( - f"Distance to query: {o.metadata.distance:.3f}\n" - ) # Print the distance of the object from the query - -client.close() -# END NVOverviewSearch - - -print("\n\n") - -client.connect() - - -# NVPosterSearch -# Get the collection -movies = client.collections.use("MovieNVDemo") - -# Perform query -response = movies.query.near_text( - query="A joyful holiday film", - # highlight-start - target_vector="poster_title", # The target vector to search against - # highlight-end - limit=5, - return_metadata=wq.MetadataQuery(distance=True), - return_properties=["title", "release_date", "tmdb_id", "poster"] -) - -# Inspect the response -for o in response.objects: - print( - o.properties["title"], o.properties["release_date"].year, o.properties["tmdb_id"] - ) # Print the title and release year (note the release date is a datetime object) - print( - f"Distance to query: {o.metadata.distance:.3f}\n" - ) # Print the distance of the object from the query - -client.close() -# END NVPosterSearch - - -print("\n\n") - -client.connect() - - -# MetadataMultimodalSearch - -def url_to_base64(url): - import requests - import base64 - - image_response = requests.get(url) - content = image_response.content - return base64.b64encode(content).decode("utf-8") - - -# Get the collection -movies = client.collections.use("MovieNVDemo") - -# Perform query -src_img_path = 
"https://github.com/weaviate-tutorials/edu-datasets/blob/main/img/International_Space_Station_after_undocking_of_STS-132.jpg?raw=true" -query_b64 = url_to_base64(src_img_path) - -response = movies.query.near_image( - near_image=query_b64, - limit=5, - target_vector="poster_title", # The target vector to search against - return_metadata=wq.MetadataQuery(distance=True), - return_properties=["title", "release_date", "tmdb_id", "poster"] # To include the poster property in the response (`blob` properties are not returned by default) -) - -# Inspect the response -for o in response.objects: - print( - o.properties["title"], o.properties["release_date"].year, o.properties["tmdb_id"] - ) # Print the title and release year (note the release date is a datetime object) - print( - f"Distance to query: {o.metadata.distance:.3f}\n" - ) # Print the distance of the object from the query - -client.close() -# END MetadataMultimodalSearch - - -print("\n\n") - -client.connect() - - -# MetadataHybridSearch -# Get the collection -movies = client.collections.use("MovieNVDemo") - -# Perform query -response = movies.query.hybrid( - query="history", - # highlight-start - target_vector="overview", # The target vector to search against - # highlight-end - limit=5, - return_metadata=wq.MetadataQuery(score=True) -) - -# Inspect the response -for o in response.objects: - print( - o.properties["title"], o.properties["release_date"].year - ) # Print the title and release year (note the release date is a datetime object) - print( - f"Hybrid score: {o.metadata.score:.3f}\n" - ) # Print the hybrid search score of the object from the query - -client.close() -# END MetadataHybridSearch - - -print("\n\n") - -client.connect() diff --git a/docs/academy/py/named_vectors/_snippets/104_usecase.py b/docs/academy/py/named_vectors/_snippets/104_usecase.py deleted file mode 100644 index 99605709d..000000000 --- a/docs/academy/py/named_vectors/_snippets/104_usecase.py +++ /dev/null @@ -1,91 +0,0 @@ -# START-ANY 
-import os -import weaviate -import os - -# END-ANY - -headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")} -client = weaviate.connect_to_local( - port=8280, - grpc_port=50251, - headers=headers -) - -# START-ANY -# Instantiate your client (not shown). e.g.: -# headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")} # Replace with your OpenAI API key -# client = weaviate.connect_to_local(headers=headers) - -# END-ANY - -# PosterSearchGeneration - -def url_to_base64(url): - import requests - import base64 - - image_response = requests.get(url) - content = image_response.content - return base64.b64encode(content).decode("utf-8") - - -# Get the collection -movies = client.collections.use("MovieNVDemo") - -# Perform query -src_img_path = "https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/img/1927_Boris_Bilinski_(1900-1948)_Plakat_f%C3%BCr_den_Film_Metropolis%2C_Staatliche_Museen_zu_Berlin.jpg" -query_b64 = url_to_base64(src_img_path) - -# response = movies.generate.near_text( -# query="Science fiction film set in space", -response = movies.generate.near_image( - near_image=query_b64, - limit=10, - # highlight-start - target_vector="poster_title", # The target vector to search against - grouped_task="What types of movies are these, and what kinds of audience might this set of movies be aimed at overall?", - grouped_properties=["title", "overview"] # Optional parameter; for reducing prompt length - # highlight-end -) - -# Inspect the response -for o in response.objects: - print(o.properties["title"]) # Print the title -print(response.generative.text) # Print the generated text (the commonalities between them) - -client.close() -# END PosterSearchGeneration - - -print("\n\n") - -client.connect() - - -# TitleSerachLoop -# Get the collection -movies = client.collections.use("MovieNVDemo") - -# Perform query -# highlight-start -# Loop through the target vectors -for tgt_vector in ["title", "overview"]: -# highlight-end - response = 
movies.generate.near_text( - query="Chrono Tides: The Anomaly Rift", - limit=5, - # highlight-start - target_vector=tgt_vector, # The target vector to search against - grouped_task="What types of movies are these, and what kinds of audience might this set of movies be aimed at overall?", - grouped_properties=["title", "overview"] # Optional parameter; for reducing prompt length - # highlight-end - ) - - # Inspect the response - for o in response.objects: - print(o.properties["title"]) # Print the title - print(response.generative.text) # Print the generated text (the commonalities between them) - -client.close() -# END TitleSerachLoop diff --git a/docs/academy/py/named_vectors/index.md b/docs/academy/py/named_vectors/index.md deleted file mode 100644 index 9b42080f0..000000000 --- a/docs/academy/py/named_vectors/index.md +++ /dev/null @@ -1,27 +0,0 @@ ---- -title: "220 Named vectors" -description: "Understand the use of named vectors in Weaviate for enhanced query relevance in Python." -sidebar_position: 220 ---- - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; -import CourseUnits from '/src/components/Academy/courseUnits.jsx'; -import { courseData } from '/src/components/Academy/courseData.js' - -## Course overview - -Sometimes, you might wish to provide multiple ways to represent the same data. For example, you might want to represent an article using its body, its title, or both. - -Named vectors enable this capability. With named vectors, you can store multiple vector embeddings per one object, then search for the object using any of the vector spaces. This provides a great deal of flexibility in how you can represent and search for your data. - -This course will teach you how to use named vectors through the lens of multimodality. It will show you how to use named vectors to represent and search for movies, using their text properties such as the title or the summary, or their visual properties such as the poster. 
- -If you do not wish to use multimodal data, that's okay! The concepts you learn in this course can be applied to any kind of data, or any kind of vectorizer. - -## Learning objectives - - - -## Units - - diff --git a/docs/academy/py/standalone/_202_vectorizer_selection/10_vectorization_in_weaviate.mdx b/docs/academy/py/standalone/_202_vectorizer_selection/10_vectorization_in_weaviate.mdx deleted file mode 100644 index fa181dc34..000000000 --- a/docs/academy/py/standalone/_202_vectorizer_selection/10_vectorization_in_weaviate.mdx +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: Vectorization in Weaviate ---- - -- How Weaviate obtains vectors -- Weaviate modules - - Modules & models -- Local vs. API-based vectorization diff --git a/docs/academy/py/standalone/_202_vectorizer_selection/20_requirements.mdx b/docs/academy/py/standalone/_202_vectorizer_selection/20_requirements.mdx deleted file mode 100644 index 5c2cb7d7a..000000000 --- a/docs/academy/py/standalone/_202_vectorizer_selection/20_requirements.mdx +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: Vectorizer requirements ---- - -- Vectorizer capabilities -- Vectorizer cost -- Vectorizer availability -- Vectorizer location (data residency) diff --git a/docs/academy/py/standalone/_202_vectorizer_selection/30_performance.mdx b/docs/academy/py/standalone/_202_vectorizer_selection/30_performance.mdx deleted file mode 100644 index 2bb33d81a..000000000 --- a/docs/academy/py/standalone/_202_vectorizer_selection/30_performance.mdx +++ /dev/null @@ -1,10 +0,0 @@ ---- -title: Vectorizer performance ---- - -- Inference speed -- Search quality - - Benchmarks -- Resource demands - - During inference - - On Weaviate diff --git a/docs/academy/py/standalone/_202_vectorizer_selection/40_compression.mdx b/docs/academy/py/standalone/_202_vectorizer_selection/40_compression.mdx deleted file mode 100644 index 8127b9f24..000000000 --- a/docs/academy/py/standalone/_202_vectorizer_selection/40_compression.mdx +++ /dev/null @@ -1,8 +0,0 @@ ---- 
-title: Vector (index) compression ---- - -- What is vector compression -- Vector compression in Weaviate - - Impact of vector compression -- Vectorizer performance with compression diff --git a/docs/academy/py/standalone/_202_vectorizer_selection/50_use_in_weaviate.mdx b/docs/academy/py/standalone/_202_vectorizer_selection/50_use_in_weaviate.mdx deleted file mode 100644 index 4e26312b6..000000000 --- a/docs/academy/py/standalone/_202_vectorizer_selection/50_use_in_weaviate.mdx +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: Vectorizer selection in Weaviate ---- - -- Where to find the vectorizer configuration -- How to set the vectorizer -- How to change the vectorizer diff --git a/docs/academy/py/standalone/_202_vectorizer_selection/index.mdx b/docs/academy/py/standalone/_202_vectorizer_selection/index.mdx deleted file mode 100644 index 6a34e0070..000000000 --- a/docs/academy/py/standalone/_202_vectorizer_selection/index.mdx +++ /dev/null @@ -1,47 +0,0 @@ ---- -title: 202 Vectorizer selection -sidebar_position: 101 ---- - -## Unit overview - - - -:::warning TODO -Intro video here -::: - - - -Weaviate integrates with a variety of vectorizer models, each designed to create vector representations, or embeddings, of data. - -These vectorizers differ in capabilities, use cases, resource demands, and costs, among other factors. Selecting the most appropriate vectorizer for your specific needs is crucial. - -In this unit, you will explore the range of vectorizers available in Weaviate and learn how to choose the best-suited vectorizer for your application. - - -### Prerequisites - -- A foundational understanding of vector representations. -- Familiarity with Weaviate's vector search capabilities. -- Intermediate proficiency in Python programming. 
- - -## Learning objectives - -import LearningGoalsExp from '/src/components/Academy/learningGoalsExp.mdx'; - - - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/standalone/_203_indexing/_60_indexing_options.mdx b/docs/academy/py/standalone/_203_indexing/_60_indexing_options.mdx deleted file mode 100644 index ed82df045..000000000 --- a/docs/academy/py/standalone/_203_indexing/_60_indexing_options.mdx +++ /dev/null @@ -1,33 +0,0 @@ ---- -title: Indexing options ---- - -### Tokenization options - -The inverted index is built using a tokenization method, which determines how the text is split into tokens. They are: - -| Tokenization Method | Explanation | Example Input | Indexed Tokens | -|---------------------|-------------|-------------- |----------------| -| `word` (default) | Keep alpha-numeric characters, lowercase them, and split by whitespace. | `Conference (Chicago, IL)` | `conference`, `chicago`, `il` | -| `whitespace` | Split the text on whitespace. | `Conference (Chicago, IL)` | `Conference`, `(Chicago,`, `IL)` | -| `lowercase` | Lowercase the text and split on whitespace. | `Conference (Chicago, IL)` | `conference`, `(chicago,`, `il)` | -| `field` | Index the whole field after trimming whitespace characters. | `Conference (Chicago, IL)` | `Conference (Chicago, IL)` | - - -So, a filter for matching (i.e. `Equal`) the term "conference" will return the object ID if the `word` tokenization is used, but not if the `whitespace` tokenization is used (as the case would not match). 
- -## Review - -### Review exercise - -:::caution TODO -Add review exercises -::: - -### Key takeaways - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - \ No newline at end of file diff --git a/docs/academy/py/standalone/_203_indexing/index.mdx b/docs/academy/py/standalone/_203_indexing/index.mdx deleted file mode 100644 index ca127bb8b..000000000 --- a/docs/academy/py/standalone/_203_indexing/index.mdx +++ /dev/null @@ -1,45 +0,0 @@ ---- -title: 202 Vector, keyword, or hybrid search? -sidebar_position: 101 ---- - -## Unit overview - - - -:::warning TODO -Intro video here -::: - - - -Welcome! - -This is the beginning of your journey through the world of vectors with Weaviate. This unit will provide you with an overview of the fundamentals of Weaviate. - -You'll first gain an understanding of what Weaviate is, and what it can do. You will then learn about what vector database and vector search are before going on to run Weaviate and perform vector searches yourself. - -By the end of this unit, you will have a strong foundation of knowledge that will help you to effectively navigate the rest of the course, and for using Weaviate in your own projects. 
- -### Prerequisites - -- None - -## Learning objectives - -import LearningGoalsExp from '/src/components/Academy/learningGoalsExp.mdx'; - - - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/standalone/_others/_inside_weaviate.mdx b/docs/academy/py/standalone/_others/_inside_weaviate.mdx deleted file mode 100644 index 6788f20da..000000000 --- a/docs/academy/py/standalone/_others/_inside_weaviate.mdx +++ /dev/null @@ -1,318 +0,0 @@ ---- -title: A peek inside Weaviate -sidebar_position: 50 ---- - -## Introduction - -:::warning TODO -Intro video here -::: - -So far, you have been introduced to the concepts of semantic and vector search, as well as have gained hands-on experience with Weaviate. - -Now, it's time to dive deeper into the inner workings of Weaviate. In this section, we will explore the following topics in high-level detail: - -- How Weaviate performs vector searches -- The storage of data in Weaviate -- The source of vectors used by Weaviate - -This information will help you understand the mechanisms behind the powerful search capabilities of Weaviate, enabling you to make the most of this tool in your development projects. - - -#### ML models and vectorization - -A machine learning model is what will turn data, like a piece of text, an image, an audio clip, and more into a vector. This type of models are often also referred to as "vectorizers". And measures of similarity will depend on the particular machine learning model used. - -As a result, it is paramount that the right model for the job is chosen. Or, at least, that a wrong model is not chosen. So throughout our courses, you will see many different models and learn about model selection both as rules of thumb and detailed discussions. 
- -Weaviate does not itself vectorize data, but it is built to be compatible with a huge number of vectorization models and services. Our courses will also cover different ways to configure Weaviate to vectorize your data in the way that suits your goals and needs. - -### Fast data retrieval - -![](../../tmp_images/orgainized_filing_cabinets_with_many_folders_1.jpg) - -To retrieve data quickly and efficiently, Weaviate uses a vector index in addition to an object index. The vector index allows for fast information retrieval, while still maintaining high recall performance. However, as the size of the dataset increases, the memory requirements and retrieval times must be balanced. - -Weaviate uses an "approximate nearest neighbor" (ANN) algorithm for its indexing, which enables it to scale at a sublinear complexity. This means that the search time increases at a slower rate than the size of the dataset, making Weaviate highly scalable. - -Throughout the course, you'll learn about indexing in Weaviate and the HNSW index, which is Weaviate's primary vector index type. - -### Going to production - -![](../../tmp_images/a_machine_with_a_conveyer_belt_producing_boxes_of_documents.jpg) - -Vector databases often need to scale horizontally and vertically. The scale of vector databases can expand into billions of data objects, thus requiring indexing and retrieval of billions of vectors to be retrievable by many concurrent users. - -We appreciate that this presents additional and very different challenges to the data science aspect of vector search. To help you navigate these challenges, we'll cover all the key aspects of taking Weaviate to production in dedicated units of the course. - -All things related to taking Weaviate to production, such as authentication and authorization, backups, monitoring, replication and more will be covered in these units. 
- - - - - -## Data structure - -:::warning TODO -Intro video here -::: - -### Objects - -![](../../tmp_images/a_beautiful_document_page_with_a_long_number_on_top_and_lots_of_text_underneath_1.jpg) - -Each object in Weaviate is stored as a JSON document within its document store, with a unique [UUID](https://en.wikipedia.org/wiki/Universally_unique_identifier) serving as the key. An object can be structured like this: - -:::note TODO -Replace object with one from actual edu database -::: - -```json -{ - "id": "unique-uuid", // e.g. "779c8970-0594-301c-bff5-d12907414002" - "class": "Name of the class the object belongs to", // e.g. "Question" - "properties": { - "property_name_1": "property_value_1", // e.g. "answer": "Jupiter" - "property_name_2": "property_value_1", // e.g. "question": "This is the largest planet in the solar system." - ... - }, - "vector": "Vector representation of the object if available" // e.g. [-0.16147631, -0.065765485, ... ,-0.06546908] -} -``` - -In Weaviate, objects can be linked to one another using cross-references, similar to foreign keys in relational databases. The overall structure of data in Weaviate is defined by the `schema`. - -### Schema - -![](../../tmp_images/a_complex_architectural_blueprint_laid_out_on_the_table_0.jpg) - -A `schema` in Weaviate is the blueprint that defines its data structure. Here is an example schema structure: - -
- Example schema - TODO: Add actual example schema -
- -The schema specifies for each class the metadata such as its name and description as well as the same for properties of each class. Additionally, it defines any cross-references between classes and settings for indexes including the object index and the vector index. - -It's important to note that the schema holds information about the vectorization process, including which vectorization model to use, which properties to include or exclude, and whether to include property headings or not. - -:::info Auto-schema -Any missing information required for schema definition will be automatically inferred by Weaviate based on default values and the imported data. -::: - -#### Data types - -Each `class` definition will include one or more properties, which must have a data type. Currently, Weaviate data type support includes the following types: - -import DataTypes from '/_includes/datatypes.mdx'; - -
- Available data types in Weaviate - -
- -Note that most data types can include one such instance, or a list of instances, such as `string` or `string[]`. - -We will look at schema definition including data types in more detail in another unit. - - - -### Data indexing - -![](../../tmp_images/robot_arm_picking_up_a_box_from_many_many_orgainized_boxes_with_labels_0.jpg) - -To enable efficient search, Weaviate uses a number of indexes to store and retrieve data. - -#### Object index - -The object index allows for filtering based on specific properties. From version 1.18 onwards, Weaviate has added a Roaring Bitmap-based index to its object index, in addition to its existing inverted index. This significantly enhances retrieval speed, particularly for large datasets or result sets. - -#### Vector index - -Objects in Weaviate can be associated with vectors, which are collected and stored in the vector index. The vector index enables searches based on vector similarity and is built using an ANN (Approximate Nearest Neighbor) algorithm. - -A traditional k-nearest neighbor (kNN) similarity search can be slow and inefficient for larger datasets. In contrast, ANN-based indexes allow Weaviate to perform vector searches quickly while maintaining high recall. Weaviate currently uses an HNSW-based ANN index. - - - -## A closer look at the vector search process - -:::warning TODO -Intro video here -::: - -### Order of search sub-processes - -![](../../tmp_images/manufacturing_line_with_multiple_factory_robots_for_different_tasks_0.jpg) - -In Weaviate, vector search can be combined with an algebraic filter for more precise results. For example, you could search for quiz questions similar to a specific vector, but only within the "World History" category. - -The various components of Weaviate come together to make this type of search possible. - -When a vector search query is sent to Weaviate, it uses the object index to create an "allow list" of object IDs. 
This allow list is then used in conjunction with the vector search, so that the result set is the intersection of the vector search results and the allow list.
This is sometimes referred to as the "Bring Your Own Vector" option in our documentation, and may be a good choice for users who prefer to generate vectors outside of Weaviate or who have already vectorized their data. - -:::note Upload vectors and use a vectorizer -It is possible to both upload your own vectors and specify a vectorizer for Weaviate. For example, if you are importing a large dataset and have vectorized the data using a vectorizer that is also available through Weaviate, this may be a useful approach. This will allow you to use Weaviate to vectorize any updates to the dataset, as well as to vectorize queries as necessary. -::: - -We will explore these options in more detail in another unit. - - - -## Review - -:::warning TODO -Video here -::: - -### Review exercise - -Can you describe, in your own sentence, XXX? - -:::warning TODO -Input box for user to put answer in and get back a similarity score & our definition? -?? -::: - -### Key takeaways - -:::info -Add summary -::: - -import Quiz from '/src/components/Academy/quiz.js' -const vectorizationAndML = [ - { - questionText: 'What does Weaviate use to obtain a vector corresponding to a data object?', - answerOptions: [ - { answerText: 'Weaviate is capable of creating a vector from data objects.', isCorrect: false, feedback: 'Weaviate is not itself a vectorizer.'}, - { answerText: 'Weaviate uses modules to obtain vectors from machine learning models.', isCorrect: true, feedback: 'You will learn about vectorizer models and corresponding Weaviate modules later on.'}, - { answerText: 'Weaviate requires a vector to be uploaded at import time.', isCorrect: false, feedback: 'While you can upload a vector, this is not required.'}, - ], - }, -]; -const weaviateIndexing = [ - { - questionText: 'What types of indexes does Weaviate use?', - answerOptions: [ - { answerText: 'Weaviate does not rely on indexing as it relies on a kNN vector search.', isCorrect: false, feedback: 'Weaviate does not use a kNN search 
as it is computationally very expensive.'}, - { answerText: 'Weaviate uses a vector index.', isCorrect: false, feedback: 'This is only partially true.'}, - { answerText: 'Weaviate uses an inverted index.', isCorrect: false, feedback: 'This is only partially true.'}, - { answerText: 'Weaviate uses a vector index and an inverted index.', isCorrect: true, feedback: 'Both indexes are used for different, and complementary reasons.'}, - ], - }, -]; -const dataStorageComponents = [ - { - questionText: 'What are the key components of data storage and retrieval system in Weaviate?', - answerOptions: [ - { answerText: 'The object store, object index, and vector index.', isCorrect: true, feedback: 'Great job!'}, - { answerText: 'The object index and vector index.', isCorrect: false, feedback: 'That is partially correct, but missing a component.'}, - { answerText: 'The object store, vector store, and vector index.', isCorrect: false, feedback: 'There is no separate "vector store" with the "vector index".'}, - ], - }, -]; -const objectIndex = [ - { - questionText: 'How does Weaviate use the object index when a vector search query with a filter is sent?', - answerOptions: [ - { - answerText: 'It uses the object index to compare the query vector to objects.', - isCorrect: false, - feedback: 'Weaviate uses the vector index for this purpose.', - }, - { - answerText: 'It uses the object index to create an "allow list" of object IDs.', - isCorrect: true, - feedback: 'The allow list is then combined with results from the vector search', - }, - { - answerText: 'It uses the object index to build an ANN-based index.', - isCorrect: false, - feedback: 'The ANN-based (i.e. 
vector) index is built already, and it is not based on the object index.', - }, - ] - } -]; -const autoSchema = [ - { - questionText: 'What happens if required information is unspecified in the user-defined schema?', - answerOptions: [ - { - answerText: 'Weaviate will throw an error at import.', - isCorrect: false, - feedback: 'That is not true. Try again.', - }, - { - answerText: 'Data will not be imported due to a lack of schema definition.', - isCorrect: false, - feedback: 'That is not true. Try again.', - }, - { - answerText: 'Weaviate will implicitly use its default values.', - isCorrect: false, - feedback: 'Weaviate does have default values for it to use in this situation, but it will not be implicit. Please try again..', - }, - { - answerText: 'Weaviate will infer required information using the auto-schema function.', - isCorrect: true, - feedback: 'The inference will be based on the defaults as well as the data being imported.', - }, - ] - } -]; -const vectorizeObject = [{ - questionText: 'Which of the following is NOT a way to associate vector representations with objects in Weaviate?', - answerOptions: [ - { - answerText: 'Upload vectors to Weaviate', - isCorrect: false, - feedback: 'You can upload vectors to Weaviate.', - }, - { - answerText: 'Use an inference API such as OpenAI, Cohere or Hugging Face.', - isCorrect: false, - feedback: 'You can use Weaviate modules to use these inference APIs.', - }, - { - answerText: 'Rely on Weaviate Database to generate vectors.', - isCorrect: true, - feedback: 'Although it can use modules to do so, Weaviate itself cannot generate vectors.', - }, - { - answerText: 'Use a local vectorizer microservice module.', - isCorrect: false, - feedback: 'Weaviate modules such as text2vec-transformers are examples of local vectorizer microservices that can generate vectors.', - }, - ] -}]; diff --git a/docs/academy/py/standalone/_others/_vectorizer_intro.md b/docs/academy/py/standalone/_others/_vectorizer_intro.md deleted file 
mode 100644 index 522e8a7f1..000000000 --- a/docs/academy/py/standalone/_others/_vectorizer_intro.md +++ /dev/null @@ -1,39 +0,0 @@ ---- -title: Introduction to vectorizers -sidebar_position: 40 ---- - -## Background (Why this module?) - -This unit aims to provide you with tools to help you make good vectorizer selections. - -In the earlier unit on vector search essentials [INSERT LINK HERE], you learned that vector databases can store each object with an associated vector. -But choosing a vectorizer model can be a daunting task for anyone, including for data scientists. Here are some reasons why: - -### Too many models - -One problem is that there are just so many models available. - -To give you an idea, the Hugging Face Hub contains over 60,000 models (as of January 2023)! - -![placeholder image for confusion](https://images.unsplash.com/photo-1499334758287-dc8133b315e9?ixlib=rb-4.0.3&ixid=MnwxMjA3fDB8MHxwaG90by1wYWdlfHx8fGVufDB8fHx8&auto=format&fit=crop&w=500&q=80) - -### Variable performance - -Another problem is that there is no "best" model for everything. - -Each model can only vectorize certain types of asset(s) such as text, image or audio, and even within the same asset type, it is common for a model to perform better at certain tasks than others. - -### Rate of progress - -Lastly, new models are being developed and released every day. Such is the rate of development in language models that what was state-of-the-art a year ago, or even a month ago, may be considered "old news" in some circles. - -All of this can be a recipe for decision paralysis. - -## Don't panic! - -The good news, however, is that you do not need to select the "perfect model" for your application to work well. - -Vectorizer models have progressed to a point where many models perform admirably well in a variety of tasks. 
In the next section, we'll learn about commonly used types of vectorizer models available.
This would be good for finding specific concepts or information conveyed by the writer. But it might not work so well for finding broader information, such as the idea conveyed by a book, or even a chapter. - -Whether to choose one or the other approach, or a third approach in-between, depends on your use case. And we'll talk more about some rules of thumb and key considerations later on. - -But the key point for now is that chunking defines the unit of information that is stored in the database, and therefore each unit of information to be retrieved. - -As we will see later on, this has implications not only search, but also retrieval augmented generation (RAG) use case downstream. - -### To meet model requirements - -Another reason to chunk data is to help meet the requirements of the language model used. - -These models typically have a finite "window" of text input lengths, and source texts often exceed this length. Remember that the Lord of the Rings, for example, is over 500,000 words long! - -A typical "context window" of these models are in the order of thousands of "tokens" (words, parts of words, punctuation, etc.). This means that the text input to the model must be split into chunks of this size or smaller. - -### For optimal retrieval augmented generation (RAG) - -Chunking is also important for optimizing retrieval augmented generation, or RAG. (If you need a refresher on what RAG is, you can review [this module](../../zero_to_mvp/104_queries_2/30_generative.mdx).) - -In short, RAG allows you to ground large language models (LLMs) by providing retrieved data from a database along with a prompt. This in turn can prevent the model from generating factually incorrect information due to outdated or missing data. - -So why does chunking affect RAG? This is because LLMs currently have a finite maximum input size, called the context window. As a result, the chunk size defines how many chunks can be included in the context window. 
This in turn defines how many different places the LLM can retrieve information from, and how much information is in each object. - -Let's consider what happens when you use a chunk size that is too small or too large. - -#### (Too) Small chunks - -Using short chunks, you can add information from more chunks to the LLM. However, it may lead to insufficient contextual information being passed on in each result to the LLM. - -Imagine passing the following as a chunk to the LLM: `In the dense areas, most of the concentration is via medium- and high-rise buildings.`. This tells a lot about the nature of this area, but without further context it's not useful to the LLM. Where is this sentence about? Why are we talking about the density anyway? As a human reader, it's not clear at all to us, and it would be the same for the LLM, which would have to guess at these answers. - -Contrast that with instead passing: `Wikipedia: London: Architecture: In the dense areas, most of the concentration is via medium- and high-rise buildings. London's skyscrapers, such as 30 St Mary Axe (dubbed "The Gherkin"), Tower 42, the Broadgate Tower and One Canada Square, are mostly in the two financial districts, the City of London and Canary Wharf.` as a chunk. - -This includes more contextual information to the LLM, such as the source, article title, section title, and additional sentences. It is far clearer to us, and would be to the LLM as well. - -#### (Too) Large chunks - -Of course, on the other hand, using larger chunks would mean fewer chunks would fit into the context window of the LLM, or incur additional costs. And it may increase the amount of irrelevant information in the data. - -Taking this to the logical conclusion, imagine you could only pass one contiguous passage of text to the LLM. It would be like being asked to write an essay based only on one section of a book. - -Either extremes are not ideal, and the trick is to find a balance that works for you. 
- -## Chunk size selection - -As you can start to see, there are multiple factors at play to help you choose the right chunk size. - -Unfortunately, there isn't a chunk size or chunking technique that works for everybody. The trick here is to find a size that works for *you* - one that isn't too small or too large, and also chunked with a method that suits you. - -In the next unit, we'll begin to review these ideas, starting with some common chunking techniques. - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - - \ No newline at end of file diff --git a/docs/academy/py/standalone/chunking/20_how_1.mdx b/docs/academy/py/standalone/chunking/20_how_1.mdx deleted file mode 100644 index 356cac79f..000000000 --- a/docs/academy/py/standalone/chunking/20_how_1.mdx +++ /dev/null @@ -1,199 +0,0 @@ ---- -title: Chunking techniques - 1 -description: Explore step one in implementing chunking techniques with Weaviate. ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import CodeFixedSizeChunking from '!!raw-loader!./_snippets/20_chunking_methods.1.fixed.size.py'; -import CodeVariableSizeChunking from '!!raw-loader!./_snippets/20_chunking_methods.2.variable.size.py'; -import CodeMixedStrategyChunking from '!!raw-loader!./_snippets/20_chunking_methods.3.mixed.strategy.py'; - - - -import PreviewUnit from '../../../_snippets/preview.mdx' - - - -## Overview - -Now that you've learned about what chunking is, and why it is important, you are ready to start looking at practical chunking techniques. Here, we start by looking at **fixed-size** chunking techniques, including some example implementations. - -## Fixed-size chunking - -As the name suggests, fixed-size chunking refers to the process of splitting texts into chunks of a fixed size, or at least based on size. 
Using fixed size chunking, you might split an article into a set of chunks of 100 words each, or a set of 200 characters each. - -This may be the most common chunking technique due to its simplicity and effectiveness. - -### Implementations - -Fixed-size chunking is implemented by splitting texts into chunks of a fixed number of units. The units may be composed of words, characters, or even *tokens*, and the number of units per chunk is fixed (to a maximum), with an optional overlap. - -:::tip What is a token? -A "token" in this context is a unit of text that will be processed by a model by being substituted with a number. In modern tranformer models, a token is commonly a "subword" unit composed of a few characters. -::: - -One pseudocode implementation of fixed-size chunking is: - -```python -# Given a text of length L -# Split the text into chunks of size N units (e.g. tokens, characters, words) -# Optionally, add an overlap of M units at the beginning or end of each chunk (from the previous or next chunk) -# This should typically result in a list of chunks of length L // N + 1 -``` - -And implementing in Python, it may look like: - - - - - - - -Which can be modified to include an overlap (in this case, at the beginning of each chunk): - - - - - - - -This is far from the only way to implement fixed-size chunking, but it is one possible, relatively simple, implementation. - -:::note Exercise -Consider how *you* might implement fixed-size chunking. What would your pseudocode (or code) look like? -::: - -### Examples - -We are ready to look at some concrete examples of fixed-size chunking. Let's take a look at three examples, with a chunk size of 5 words, 25 words and 100 words, respectively. - -We'll use an excerpt from the [Pro Git book](https://git-scm.com/book/en/v2)*. More specifically, we'll use text of the [What is Git?](https://github.com/progit/progit2/blob/main/book/01-introduction/sections/what-is-git.asc) chapter. 
- -Here is one example using our chunking function from above: - - - - - - - -This will result in outputs like these. Take a look at the first few chunks at each size - what do you notice? - -:::note Exercise -Consider which of these chunk sizes would be most appropriate for search. Why do you think so? What are the tradeoffs? -::: - - - - - - - - - - - - - - -Hopefully, these concrete examples start to illustrate some of the ideas that we discussed above. - -Immediately, it strikes me that the smaller chunks are very granular, to the point where they may not contain enough information to be useful for search. On the other hand, the larger chunks begin to retain more information as they get to lengths that are similar to a typical paragraph. - -Now imagine these chunks becoming even longer. As chunks become longer, the corresponding vector embeddings would start to become more general. This would eventually reach a point where they cease to be useful in terms of searching for information. - -:::note What about character or sub-word tokenization? -At these sizes, you typically will not need to employ character-based or sub-word token-based chunking, as splitting words at these boundaries in a group of words will not typically be meaningful. -::: - -:::tip Where to start? -For search with fixed-size chunks, if you don't have any other factors, try a size of around 100-200 words, and a 20% overlap. -::: - -## Notes - -:::info Pro Git by Scott Chacon and Ben Straub - Book License - -*Available through the Creative Commons Attribution-Non Commercial-Share Alike 3.0 license. 
- -::: - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - - \ No newline at end of file diff --git a/docs/academy/py/standalone/chunking/25_how_2.mdx b/docs/academy/py/standalone/chunking/25_how_2.mdx deleted file mode 100644 index a591a3754..000000000 --- a/docs/academy/py/standalone/chunking/25_how_2.mdx +++ /dev/null @@ -1,202 +0,0 @@ ---- -title: Chunking techniques - 2 -description: Master advanced methods for chunking large data with Weaviate. ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import CodeFixedSizeChunking from '!!raw-loader!./_snippets/20_chunking_methods.1.fixed.size.py'; -import CodeVariableSizeChunking from '!!raw-loader!./_snippets/20_chunking_methods.2.variable.size.py'; -import CodeMixedStrategyChunking from '!!raw-loader!./_snippets/20_chunking_methods.3.mixed.strategy.py'; - - - -import PreviewUnit from '../../../_snippets/preview.mdx' - - - -## Overview - -We continue our discussion of chunking techniques by taking you through **variable-size** chunking techniques, including some example implementations. - -## Variable-size chunking - -Now let's look at variable-size chunking. Unlike fixed-size chunking, the chunk size here is an *outcome*, rather than an *input parameter*. - -In variable-size chunking, some marker is used to split the text. The marker may be syntactic, such as a sentence or paragraph marker, or even structural such as a markdown header. - -### Implementations - -A pseudocode implementation of variable-size chunking could look like this: - -```python -# Given a text of length L -# Identify a marker (e.g. 
full-stop, paragraph marker (two newlines), or a Markdown header -# Split the text into chunks at each marker -``` - -Which could be implemented in Python as below: - - - - - - - -Or, we could use special markers - such as Markdown headers - to split the text. - -(Since the *Pro Git* book is written in [Asciidoc](https://asciidoc.org/), we'll use Asciidoc headers instead - they all start with new lines followed by `==`.) - - - - - - - -Again, let's now apply these to concrete examples. - -### Examples - -We can apply these splitters to the same text as before. - - - - - - - -And the outputs look like this. Now, what do you observe? - - - - - - - - - - -One thing that might stand out immediately is that both of our very simple marker-based chunker ends up extracting the heading as one chunk, which may not be desirable. - -In reality, you may employ a mixed strategy where very short chunks like this may be appended to the next chunk, assuming that it is likely to be something like a title, or a section heading. - -Let's take a look at such a strategy. - - -## Mixed strategy - -You could use a mix of fixed-size chunking and variable-size chunking to get the best of both worlds. For example, you could use a variable-size chunker to split the chunks at paragraph markers, but apply a fixed-size filter. - -More specifically, any chunks that are too small could be merged with the next chunk, and/or any chunks that are too large could be split at the middle, or at another marker within the chunk. - -### Examples - -One implementation may look as follows: - -```python -# Given a text of length L -# Identify a marker (e.g. full-stop, paragraph marker (two newlines), or a Markdown header -# Split the text into chunks at each marker -# If any of the chunks are too small, merge them with the next chunk -# If any of the chunks are too large, split them - e.g. 
at the middle or using another marker within the chunk -``` - -Which could be implemented in Python like this: - - - - - - - -Producing these chunks. - - - -This strategy will not produce chunks that are too small, while still basing them based on a syntactic marker, respecting the boundary of a heading. - -Since we've seen chunking strategies in action on a single text, let's now look at how they may work on a larger set of texts. We'll also take a look at what retrieval results may look like, using different chunking strategies. - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - - diff --git a/docs/academy/py/standalone/chunking/30_example_chunking.mdx b/docs/academy/py/standalone/chunking/30_example_chunking.mdx deleted file mode 100644 index 6a13977b4..000000000 --- a/docs/academy/py/standalone/chunking/30_example_chunking.mdx +++ /dev/null @@ -1,236 +0,0 @@ ---- -title: Example part 1 - Chunking -description: View practical examples of chunking applied in Weaviate. ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import CodePracticalExample from '!!raw-loader!./_snippets/30_example.py'; - - - -import PreviewUnit from '../../../_snippets/preview.mdx' - - - -## Overview - -In the preceding sections, you've learned [about chunking](./10_introduction.mdx), and how to do it, using [fixed size chunks](./20_how_1.mdx) or [variable size chunks](./25_how_2.mdx). - -In this section and next, we'll show you holistic examples of chunking in action. We'll ingest multiple sections of a book using different chunking methods before comparing how search performs. - -## Chunking real data - -You've already seen examples of chunking with a section of the [Pro Git book](https://git-scm.com/book/en/v2)*. In this section, we'll use multiple entire chapters of the book, in total containing 14 sections. 
- -### Source data - -Here, we retrieve the source data from the Pro Git book, using the `requests` library. We will save each section as a separate object, containing some metadata (chapter title and filename) as well as the text. - -:::tip This could be any data -The details here aren't important, as the source could be anything, such as a database, PDFs, or even videos. The important thing is to obtain a set of texts for us to try various chunking strategies. -::: - - - - - - - -This process yields 14 text objects - one for each section. Each one contains a chapter title, filename, and the text of the section. - -
- Take a look at some sample sections - -``` -{ - "body": "=== About Version Control\n\n(((version control)))\nWhat is \"`version control`\", and why should you care?\nVersion control is a system that records changes to a file or set of files over time so that you can recall specific versions later.\nFor the examples in this book, you will use software source code as the files being version controlled, though in reality you can do this with nearly any type of file on a computer.\n\nIf you are a graphic or web designer and want to keep every version of an image or layout (which you would most certainly want to), a Version Control System (VCS) is a very wise thing to use.\nIt allows you to revert selected files back to a previous state, revert the entire project back to a previous state, compare changes over time, see who last modified something that might be causing a problem, who introduced an issue and when, and more.\nUsing a VCS also generally means that if you screw things up or lose files, you can easily recover.\nIn addition, you get all this for very little overhead.\n\n==== Local Version Control Systems\n\n(((version control,local)))\nMany people's version-control method of choice is to copy files into another directory (perhaps a time-stamped directory, if they're clever).\nThis approach is very common because it is so simple, but it is also incredibly error prone.\nIt is easy to forget which directory you're in and accidentally write to the wrong file or copy over files you don't mean to.\n\nTo deal with this issue, programmers long ago developed local VCSs that had a simple database that kept all the changes to files under revision control.\n\n.Local version control diagram\nimage::images/local.png[Local version control diagram]\n\nOne of the most popular VCS tools was a system called RCS, which is still distributed with many computers today.\nhttps://www.gnu.org/software/rcs/[RCS^] works by keeping patch sets (that is, the differences between files) in 
a special format on disk; it can then re-create what any file looked like at any point in time by adding up all the patches.\n\n==== Centralized Version Control Systems\n\n(((version control,centralized)))\nThe next major issue that people encounter is that they need to collaborate with developers on other systems.\nTo deal with this problem, Centralized Version Control Systems (CVCSs) were developed.\nThese systems (such as CVS, Subversion, and Perforce) have a single server that contains all the versioned files, and a number of clients that check out files from that central place.(((CVS)))(((Subversion)))(((Perforce)))\nFor many years, this has been the standard for version control.\n\n.Centralized version control diagram\nimage::images/centralized.png[Centralized version control diagram]\n\nThis setup offers many advantages, especially over local VCSs.\nFor example, everyone knows to a certain degree what everyone else on the project is doing.\nAdministrators have fine-grained control over who can do what, and it's far easier to administer a CVCS than it is to deal with local databases on every client.\n\nHowever, this setup also has some serious downsides.\nThe most obvious is the single point of failure that the centralized server represents.\nIf that server goes down for an hour, then during that hour nobody can collaborate at all or save versioned changes to anything they're working on.\nIf the hard disk the central database is on becomes corrupted, and proper backups haven't been kept, you lose absolutely everything -- the entire history of the project except whatever single snapshots people happen to have on their local machines.\nLocal VCSs suffer from this same problem -- whenever you have the entire history of the project in a single place, you risk losing everything.\n\n==== Distributed Version Control Systems\n\n(((version control,distributed)))\nThis is where Distributed Version Control Systems (DVCSs) step in.\nIn a DVCS (such as Git, Mercurial, 
Bazaar or Darcs), clients don't just check out the latest snapshot of the files; rather, they fully mirror the repository, including its full history.\nThus, if any server dies, and these systems were collaborating via that server, any of the client repositories can be copied back up to the server to restore it.\nEvery clone is really a full backup of all the data.\n\n.Distributed version control diagram\nimage::images/distributed.png[Distributed version control diagram]\n\nFurthermore, many of these systems deal pretty well with having several remote repositories they can work with, so you can collaborate with different groups of people in different ways simultaneously within the same project.\nThis allows you to set up several types of workflows that aren't possible in centralized systems, such as hierarchical models.\n", - "chapter_title": "01-introduction", - "filename": "about-version-control.asc" -} -``` - -
- -### Chunking - -Now, let's apply the following chunking methods on each section: - -- Fixed-length chunks (with 20% overlap) - - With 25 words per chunk, and - - With 100 words per chunk -- Variable-length chunks, using paragraph markers, and -- Mixed-strategy chunks, using paragraph markers and a minimum chunk length of 25 words. - -We'll also add metadata to the chunk, such as the filename and the chapter name, as well as the chunk number. - -One implementation is shown below: - -
- Helper functions - - - - - - - -
- - - - - - - -We now have four sets of chunks, according to the four chunking methods we used. Take a look: - - - - - - - - - - - - - - - - -### Import - -We can now import the chunks into Weaviate. To allow for easy comparisons, we'll import each set of chunks into the same collection, while adding a `chunking_method` property to each chunk for easy filtering. - - - - - - - -### Inspection - -Before we move on, let's check that the chunks were imported correctly. We'll retrieve the total count of objects, as well as object counts according to each chunking strategy. - - - - - - - -This should produce an output like this: - - - -These counts match the number of chunks we created, so we can be confident that the import was successful. - -In the next section, we'll try out some searches on these chunks. - -## Notes - -:::info Pro Git by Scott Chacon and Ben Straub - Book License - -*Available through the Creative Commons Attribution-Non Commercial-Share Alike 3.0 license. - -::: - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - - \ No newline at end of file diff --git a/docs/academy/py/standalone/chunking/40_example_search.mdx b/docs/academy/py/standalone/chunking/40_example_search.mdx deleted file mode 100644 index d5e454f18..000000000 --- a/docs/academy/py/standalone/chunking/40_example_search.mdx +++ /dev/null @@ -1,325 +0,0 @@ ---- -title: Example part 2 - Search -description: Discover how to search chunked data in Weaviate efficiently. ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import CodePracticalExample from '!!raw-loader!./_snippets/30_example.py'; - - - -import PreviewUnit from '../../../_snippets/preview.mdx' - - - -## Overview - -In the [preceding section](./30_example_chunking.mdx), we imported multiple chapters of a book into Weaviate using different chunking techniques. 
They were: - -- Fixed-length chunks (and 20% overlap) - - With 25 words per chunk, and - - With 100 words per chunk -- Variable-length chunks, using paragraph markers, and -- Mixed-strategy chunks, using paragraph markers and a minimum chunk length of 25 words. - -Now, we will use Weaviate to search through the book and evaluate the impact of the chunking techniques. - -Since the data comes from the first two chapters of a book about Git, let's search for various git-related concepts and see how the different chunking strategies perform. - - -## Search / recall - -First of all, we'll retrieve information from our Weaviate instance using various search terms. We'll use a semantic search (`nearText`) to aim to retrieve the most relevant chunks. - -### Search syntax - -The search is carried out as follows, looping through each chunking strategy by filtering our dataset. We'll obtain a couple of top results for each search term. - - - - - - - -Using these search terms: -- `"history of git"` -- `"how to add the url of a remote repository"` - -### Results & discussions - -We get the following results: - -#### Example 1 - -:::info Results for a search for `"history of git"`. -::: - - - - - - - - - - - - - - - - -The query in this example is a broad one on the `history of git`. The result is that here, the longer chunks seem to perform better. - -Inspecting the result, we see that while the 25-word chunks may be semantically similar to the query `history of git`, they do not contain enough contextual information to enhance the readers' understanding of the topic. - -On the other hand, the paragraph chunks retrieved - especially those with a minimum length of 25 words - contain a good amount of holistic information that will teach the reader about the history of git. - -#### Example 2 - -:::info Results for a search for `"how to add the url of a remote repository"`. 
-::: - - - - - - - - - - - - - - - - -The query in this example was a more specific one, for example one that might be run by a user looking to identify how to add the url of a remote repository. - -In contrast to the first scenario, the 25-word chunks are more useful here. Because the question was very specific, Weaviate was able to identify the chunk containing the most suitable passage - how to add a remote repository (`git remote add `). - -While the other result sets also contain some of this information, it may be worth considering how the result may be used and displayed. The longer the result, the more cognitive effort it may take the user to identify the relevant information. - - -## Retrieval augmented generation (RAG) - -Next, let's take a look at the impact of chunking on RAG. - -We [discussed the relationship between chunk size and RAG earlier](./10_introduction.mdx#-for-optimal-retrieval-augmented-generation-rag). Using shorter chunks will allow you to include information from a wider range of source objects than longer chunks, but each object will not include as much contextual information. On the other hand, using longer chunks means each chunk will include more contextual information, but you will be limited to fewer source objects. - -Let's try a few RAG examples to see how this manifests itself. - -### Query syntax - -The query syntax is shown below. The syntax is largely the same as above, except for two aspects. - -One is that to account for varying chunk sizes, we will retrieve more chunks where the chunk size is smaller. - -The other is that the query has been modified to perform RAG, rather than a simple retrieval. The query asks the target LLM to summarize the results into point form. - - - - - - - -### Results & discussions - -#### Example 1 - -:::info Results for a search for `"history of git"`. -::: - - - - - - - - - - - - - - - - -The findings here are similar to the semantic search results. 
The longer chunks contain more information, and are more useful for a broad topic like the history of git. - -#### Example 2 - -:::info Results for a search for `"available git remote commands"`. -::: - - - - - - - - - - - - - - - - -The results of the generative search here for `available git remote commands` are perhaps even more illustrative than before. - -Here, the shortest chunks were able to retrieve the highest number of `git remote` commands from the book. This is because we were able to retrieve more chunks from various locations throughout the corpus (book). - -Contrast this result to the one where longer chunks are used. Here, using longer chunks, we were only able to retrieve one `git remote` command, because we retrieved fewer chunks than before. - -#### Discussions - -You see here the trade-off between using shorter and longer chunks. - -Using shorter chunks allows you to retrieve more information from more objects, but each object will contain less contextual information. On the other hand, using longer chunks allows you to retrieve less information from fewer objects, but each object will contain more contextual information. - -Even when using LLMs with very large context windows, this is something to keep in mind. Longer input texts mean higher fees for the API use, or inference time. In other words, there are costs associated with using longer chunks. - -Often, this is *the* trade-off that you will need to consider when deciding on the chunking strategy for a RAG use-case. 
- - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - - diff --git a/docs/academy/py/standalone/chunking/50_considerations.mdx b/docs/academy/py/standalone/chunking/50_considerations.mdx deleted file mode 100644 index ee2786163..000000000 --- a/docs/academy/py/standalone/chunking/50_considerations.mdx +++ /dev/null @@ -1,171 +0,0 @@ ---- -title: Considerations & suggestions -description: Learn key considerations for chunking large datasets effectively. ---- - - - -import PreviewUnit from '../../../_snippets/preview.mdx' - - - -## Overview - -We have covered a lot of ground on chunking in this unit already. - -You saw what chunking is and learned about different chunking methods, and dived into example implementations so that you can see their impact. - -In this section, we will take a step back out from the detailed, micro view to the high level, macro view, while using all that we've learned in context. More specifically, we will take a look at some considerations of what to think about when chunking data, and what it means for your Weaviate implementation. - -## Considerations - -As you have seen, there are many different ways to chunk data. But which one is right for you? - -The answer is, as always, "it depends". But here are some things to consider when choosing a chunking method: - -#### Text per search result - -How much text should each "hit" in your search results contain? Is it a sentence, or a paragraph, or something else? - -A natural fit would be to chunk the data into the same size as the desired search result. - -#### Input query length - -Consider what a typical input query might look like. Will it be short search strings, or longer texts, such as those extracted from a document? - -Keep in mind that the vector of the query will be compared to the vector of the chunks. So, it may be helpful to have shorter chunks for shorter queries, and longer chunks for longer queries. 
- -In cases where shorter chunks are used but further context would be beneficial, you could structure your app so that you return the chunk that contains the search term, and the surrounding chunks. - -#### Database size - -The larger the chunks, the fewer chunks there will be, and the smaller the database will be. This may be important if you are working with a large dataset. - -#### Model requirements - -You will need to ensure that the chunk size is within the model's maximum allowed size (context window). This goes for generating embeddings, as well as for RAG. - -#### RAG workflows - -As discussed earlier, shorter chunks will make it easier to include many chunks from a variety of sources, but may not provide enough context. Longer chunks will provide more context, but may not be able to include as many chunks. - -### Rule of thumb - -Having said all that, it may be helpful to have a rule of thumb to start with. We suggest starting with a chunk size of 100-150 words and going from there. - -Then, you can modify the chunk size based on the considerations above, and your observations on your app's performance. - -## Data modelling - -By definition, chunking your source data will mean creating multiple objects out of one source. - -Accordingly, you should consider how to model your data to capture the relationships between the chunks and the source data, as well as between chunks. This may help you to efficiently retrieve what you need, such as the metadata relating to the source, or surrounding chunks. - -### Collection definition examples - -Consider a Weaviate database designed to store data from a library of reference books. - -Storing each book as a vector may still be too large, so you may want to chunk the books into paragraphs. Having done so, you may want to create a `Book` collection, and a `Paragraph` collection, with the `Paragraph` collection having the cross-reference property `fromBook`. 
This will allow you to retrieve the book metadata from the `Book` collection, and the surrounding paragraphs from the `Paragraph` collection. - -So, for example, you may build a `Book` collection like this: - -```json -{ - "class": "Book", - "properties": [ - ... // other class properties - // highlight-start - { - "name": "title", - "dataType": ["text"], - }, - { - "name": "text", - "dataType": ["text"], - }, - // highlight-end - ], - "vectorIndexConfig": { - "skip": true - } - ... // other class attributes -} -``` - -And add a `Paragraph` collection like this, that references the `Book` collection: - -```json -{ - "class": "Paragraph", - "properties": [ - ... // other class properties - // highlight-start - { - "name": "body", - "dataType": ["Text"] - }, - { - "name": "chunk_number", - "dataType": ["int"] - }, - { - "name": "fromBook", - "dataType": ["Book"] - }, - // highlight-end - ], - ... // other class attributes (e.g. vectorizer) -} -``` - -Note that in this configuration, the `Book` collection is not vectorized, but the `Paragraph` collection is. This will allow the `Book` collection to be used for storage and retrieval of metadata, while the `Paragraph` collection is used for search. - -This is just one example of how you could model your data. You may want to experiment with different configurations to see what works best for your use case. 
- - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - - \ No newline at end of file diff --git a/docs/academy/py/standalone/chunking/90_wrap_up.mdx b/docs/academy/py/standalone/chunking/90_wrap_up.mdx deleted file mode 100644 index db0ae991d..000000000 --- a/docs/academy/py/standalone/chunking/90_wrap_up.mdx +++ /dev/null @@ -1,34 +0,0 @@ ---- -title: Wrap-up -description: Chunking Wrap-up and Summary ---- - -import PreviewUnit from '../../../_snippets/preview.mdx' - - - -## Unit review - -In this unit, you have learned about chunking, which is a technique of splitting up longer texts into smaller pieces of text, or "chunks". - -We covered how it can impact information retrieval using vector databases, and how it can affect the performance of retrieval augmented generation. - -Then, we then moved on to cover various chunking techniques including fixed-size chunking, variable-size chunking, and hybrid chunking. We also discussed key considerations when deciding on a chunking strategy, as well as some suggested starting points. - -The unit was rounded off with a discussion of some points of consideration when chunking data. These included the length of text per search result, the input query length, the size of the database, the requirements of the language model, and the RAG workflow. - -We hope that you now have a good understanding of chunking in general, and are able to implement some solid chunking strategies based on your actual needs. 
- -### Learning outcomes - -Having finished this unit, you should be able to: -- Describe what chunking is at a high level -- Explain the impact of chunking in vector search and generative search -- Implement various chunking methods and know where to explore others, and -- Evaluate chunking strategies based on your needs - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/standalone/chunking/_snippets/20_chunking_methods.1.fixed.size.py b/docs/academy/py/standalone/chunking/_snippets/20_chunking_methods.1.fixed.size.py deleted file mode 100644 index f58a67c08..000000000 --- a/docs/academy/py/standalone/chunking/_snippets/20_chunking_methods.1.fixed.size.py +++ /dev/null @@ -1,84 +0,0 @@ -# START Fixed size chunker with overlap # START Vanilla fixed size chunker # START Get fixed-size chunks examples -from typing import List - -# END Fixed size chunker with overlap # END Vanilla fixed size chunker # END Get fixed-size chunks examples - -# ======================================== -# ========== FIXED-SIZE CHUNKING ========= -# ======================================== - -# START Vanilla fixed size chunker -# Split the text into units (words, in this case) -def word_splitter(source_text: str) -> List[str]: - import re - source_text = re.sub("\s+", " ", source_text) # Replace multiple whitespces - return re.split("\s", source_text) # Split by single whitespace - -def get_chunks_fixed_size(text: str, chunk_size: int) -> List[str]: - text_words = word_splitter(text) - chunks = [] - for i in range(0, len(text_words), chunk_size): - chunk_words = text_words[i: i + chunk_size] - chunk = " ".join(chunk_words) - chunks.append(chunk) - return chunks -# END Vanilla fixed size chunker - -# START Fixed size chunker with overlap -# Split the text into units (words, in this case) -def word_splitter(source_text: str) -> List[str]: - import re - source_text = re.sub("\s+", " ", source_text) # Replace multiple 
whitespces - return re.split("\s", source_text) # Split by single whitespace - -def get_chunks_fixed_size_with_overlap(text: str, chunk_size: int, overlap_fraction: float) -> List[str]: - text_words = word_splitter(text) - overlap_int = int(chunk_size * overlap_fraction) - chunks = [] - for i in range(0, len(text_words), chunk_size): - chunk_words = text_words[max(i - overlap_int, 0): i + chunk_size] - chunk = " ".join(chunk_words) - chunks.append(chunk) - return chunks -# END Fixed size chunker with overlap - -# START Get fixed-size chunks examples -# Get source data -import requests - -url = "https://raw.githubusercontent.com/progit/progit2/main/book/01-introduction/sections/what-is-git.asc" -source_text = requests.get(url).text - -# Chunk text by number of words -for chosen_size in [5, 25, 100]: - chunks = get_chunks_fixed_size_with_overlap(source_text, chosen_size, overlap_fraction=0.2) - # Print outputs to screen - print(f"\nSize {chosen_size} - {len(chunks)} chunks returned.") - for i in range(3): - print(f"Chunk {i+1}: {chunks[i]}") -# END Get fixed-size chunks examples - - -""" -# START Chunking by 5 words - outputs -Size 5 - 281 chunks returned. -Chunk 1: [[what_is_git_section]] === What is Git? -Chunk 2: Git? So, what is Git in -Chunk 3: in a nutshell? This is an -# END Chunking by 5 words - outputs - -# START Chunking by 25 words - outputs -Size 25 - 57 chunks returned. -Chunk 1: [[what_is_git_section]] === What is Git? So, what is Git in a nutshell? This is an important section to absorb, because if you understand what Git -Chunk 2: if you understand what Git is and the fundamentals of how it works, then using Git effectively will probably be much easier for you. 
As you learn Git, try to -Chunk 3: you learn Git, try to clear your mind of the things you may know about other VCSs, such as CVS, Subversion or Perforce -- doing so will help you avoid -# END Chunking by 25 words - outputs - -# START Chunking by 100 words - outputs -Size 100 - 15 chunks returned. -Chunk 1: [[what_is_git_section]] === What is Git? So, what is Git in a nutshell? This is an important section to absorb, because if you understand what Git is and the fundamentals of how it works, then using Git effectively will probably be much easier for you. As you learn Git, try to clear your mind of the things you may know about other VCSs, such as CVS, Subversion or Perforce -- doing so will help you avoid subtle confusion when using the tool. Even though Git's user interface is fairly similar to these other VCSs, Git stores and thinks about information in -Chunk 2: tool. Even though Git's user interface is fairly similar to these other VCSs, Git stores and thinks about information in a very different way, and understanding these differences will help you avoid becoming confused while using it.(((Subversion)))(((Perforce))) ==== Snapshots, Not Differences The major difference between Git and any other VCS (Subversion and friends included) is the way Git thinks about its data. Conceptually, most other systems store information as a list of file-based changes. These other systems (CVS, Subversion, Perforce, Bazaar, and so on) think of the information they store as a set of files and the changes made to each file over time (this is commonly described as _delta-based_ version control). .Storing data as changes to a base version -Chunk 3: each file over time (this is commonly described as _delta-based_ version control). .Storing data as changes to a base version of each file image::images/deltas.png[Storing data as changes to a base version of each file] Git doesn't think of or store its data this way. 
Instead, Git thinks of its data more like a series of snapshots of a miniature filesystem. With Git, every time you commit, or save the state of your project, Git basically takes a picture of what all your files look like at that moment and stores a reference to that snapshot. To be efficient, if files have not changed, Git doesn't store the file again, just a link to the previous identical file it has already -# END Chunking by 100 words - outputs -""" - diff --git a/docs/academy/py/standalone/chunking/_snippets/20_chunking_methods.2.variable.size.py b/docs/academy/py/standalone/chunking/_snippets/20_chunking_methods.2.variable.size.py deleted file mode 100644 index 778d730a7..000000000 --- a/docs/academy/py/standalone/chunking/_snippets/20_chunking_methods.2.variable.size.py +++ /dev/null @@ -1,66 +0,0 @@ -# START Paragraph variable size chunker # START Asciidoc section variable size chunker -from typing import List - -# END Paragraph variable size chunker # END Asciidoc section variable size chunker - - -# ======================================== -# ======= VARIABLE-SIZE CHUNKING ========= -# ======================================== - -# START Paragraph variable size chunker -# Split the text into paragraphs -def get_chunks_by_paragraph(source_text: str) -> List[str]: - return source_text.split("\n\n") -# END Paragraph variable size chunker - -# START Asciidoc section variable size chunker -# Split the text by Asciidoc section markers -def get_chunks_by_asciidoc_sections(source_text: str) -> List[str]: - return source_text.split("\n==") -# END Asciidoc section variable size chunker - -# START Header variable size chunker -# Get source data -import requests - -url = "https://raw.githubusercontent.com/progit/progit2/main/book/01-introduction/sections/what-is-git.asc" -source_text = requests.get(url).text - -# Split the text into paragraphs -chunks = source_text.split("\n====") -# END Header variable size chunker - -# START Get variable-size chunks examples -# Get 
source data -import requests - -url = "https://raw.githubusercontent.com/progit/progit2/main/book/01-introduction/sections/what-is-git.asc" -source_text = requests.get(url).text - -# Chunk text by particular marker -for marker in ["\n\n", "\n=="]: - chunks = source_text.split(marker) - # Print outputs to screen - print(f"\nUsing the marker: {repr(marker)} - {len(chunks)} chunks returned.") - for i in range(3): - print(f"Chunk {i+1}: {repr(chunks[i])}") -# END Get variable-size chunks examples - - -""" -# START Chunking by paragraph - outputs -Using the marker: '\n\n' - 31 chunks returned. -Chunk 1: '[[what_is_git_section]]\n=== What is Git?' -Chunk 2: "So, what is Git in a nutshell?\nThis is an important section to absorb, because if you understand what Git is and the fundamentals of how it works, then using Git effectively will probably be much easier for you.\nAs you learn Git, try to clear your mind of the things you may know about other VCSs, such as CVS, Subversion or Perforce -- doing so will help you avoid subtle confusion when using the tool.\nEven though Git's user interface is fairly similar to these other VCSs, Git stores and thinks about information in a very different way, and understanding these differences will help you avoid becoming confused while using it.(((Subversion)))(((Perforce)))" -Chunk 3: '==== Snapshots, Not Differences' -# END Chunking by paragraph - outputs - -# START Chunking by header - outputs -Using the marker: '\n==' - 7 chunks returned. 
-Chunk 1: '[[what_is_git_section]]' -Chunk 2: "= What is Git?\n\nSo, what is Git in a nutshell?\nThis is an important section to absorb, because if you understand what Git is and the fundamentals of how it works, then using Git effectively will probably be much easier for you.\nAs you learn Git, try to clear your mind of the things you may know about other VCSs, such as CVS, Subversion or Perforce -- doing so will help you avoid subtle confusion when using the tool.\nEven though Git's user interface is fairly similar to these other VCSs, Git stores and thinks about information in a very different way, and understanding these differences will help you avoid becoming confused while using it.(((Subversion)))(((Perforce)))\n" -Chunk 3: "== Snapshots, Not Differences\n\nThe major difference between Git and any other VCS (Subversion and friends included) is the way Git thinks about its data.\nConceptually, most other systems store information as a list of file-based changes.\nThese other systems (CVS, Subversion, Perforce, Bazaar, and so on) think of the information they store as a set of files and the changes made to each file over time (this is commonly described as _delta-based_ version control).\n\n.Storing data as changes to a base version of each file\nimage::images/deltas.png[Storing data as changes to a base version of each file]\n\nGit doesn't think of or store its data this way.\nInstead, Git thinks of its data more like a series of snapshots of a miniature filesystem.\nWith Git, every time you commit, or save the state of your project, Git basically takes a picture of what all your files look like at that moment and stores a reference to that snapshot.\nTo be efficient, if files have not changed, Git doesn't store the file again, just a link to the previous identical file it has already stored.\nGit thinks about its data more like a *stream of snapshots*.\n\n.Storing data as snapshots of the project over time\nimage::images/snapshots.png[Git stores data as 
snapshots of the project over time]\n\nThis is an important distinction between Git and nearly all other VCSs.\nIt makes Git reconsider almost every aspect of version control that most other systems copied from the previous generation.\nThis makes Git more like a mini filesystem with some incredibly powerful tools built on top of it, rather than simply a VCS.\nWe'll explore some of the benefits you gain by thinking of your data this way when we cover Git branching in <>.\n" -# END Chunking by header - outputs -""" - diff --git a/docs/academy/py/standalone/chunking/_snippets/20_chunking_methods.3.mixed.strategy.py b/docs/academy/py/standalone/chunking/_snippets/20_chunking_methods.3.mixed.strategy.py deleted file mode 100644 index b7959d04e..000000000 --- a/docs/academy/py/standalone/chunking/_snippets/20_chunking_methods.3.mixed.strategy.py +++ /dev/null @@ -1,50 +0,0 @@ -# START Asciidoc and size based chunking -from typing import List - -# END Asciidoc and size based chunking - - -# ======================================== -# ======= MIXED-STRATEGY CHUNKING ========= -# ======================================== - -# START Asciidoc and size based chunking -# Get source data -import requests - -url = "https://raw.githubusercontent.com/progit/progit2/main/book/01-introduction/sections/what-is-git.asc" -source_text = requests.get(url).text - -# Split the text by Asciidoc marker -chunks = source_text.split("\n==") - -# Chunking -new_chunks = list() -chunk_buffer = "" -min_length = 25 - -for chunk in chunks: - new_buffer = chunk_buffer + chunk # Create new buffer - new_buffer_words = new_buffer.split(" ") # Split into words - if len(new_buffer_words) < min_length: # Check whether buffer length too small - chunk_buffer = new_buffer # Carry over to the next chunk - else: - new_chunks.append(new_buffer) # Add to chunks - chunk_buffer = "" - -if len(chunk_buffer) > 0: - new_chunks.append(chunk_buffer) # Add last chunk, if necessary - -# Print outputs to screen -for i in 
range(3): - print(f"Chunk {i+1}: {repr(new_chunks[i])}") -# END Asciidoc and size based chunking - - -""" -# START Mixed-strategy chunking output -Chunk 1: "[[what_is_git_section]]= What is Git?\n\nSo, what is Git in a nutshell?\nThis is an important section to absorb, because if you understand what Git is and the fundamentals of how it works, then using Git effectively will probably be much easier for you.\nAs you learn Git, try to clear your mind of the things you may know about other VCSs, such as CVS, Subversion or Perforce -- doing so will help you avoid subtle confusion when using the tool.\nEven though Git's user interface is fairly similar to these other VCSs, Git stores and thinks about information in a very different way, and understanding these differences will help you avoid becoming confused while using it.(((Subversion)))(((Perforce)))\n" -Chunk 2: "== Snapshots, Not Differences\n\nThe major difference between Git and any other VCS (Subversion and friends included) is the way Git thinks about its data.\nConceptually, most other systems store information as a list of file-based changes.\nThese other systems (CVS, Subversion, Perforce, Bazaar, and so on) think of the information they store as a set of files and the changes made to each file over time (this is commonly described as _delta-based_ version control).\n\n.Storing data as changes to a base version of each file\nimage::images/deltas.png[Storing data as changes to a base version of each file]\n\nGit doesn't think of or store its data this way.\nInstead, Git thinks of its data more like a series of snapshots of a miniature filesystem.\nWith Git, every time you commit, or save the state of your project, Git basically takes a picture of what all your files look like at that moment and stores a reference to that snapshot.\nTo be efficient, if files have not changed, Git doesn't store the file again, just a link to the previous identical file it has already stored.\nGit thinks about its data more 
like a *stream of snapshots*.\n\n.Storing data as snapshots of the project over time\nimage::images/snapshots.png[Git stores data as snapshots of the project over time]\n\nThis is an important distinction between Git and nearly all other VCSs.\nIt makes Git reconsider almost every aspect of version control that most other systems copied from the previous generation.\nThis makes Git more like a mini filesystem with some incredibly powerful tools built on top of it, rather than simply a VCS.\nWe'll explore some of the benefits you gain by thinking of your data this way when we cover Git branching in <>.\n" -Chunk 3: "== Nearly Every Operation Is Local\n\nMost operations in Git need only local files and resources to operate -- generally no information is needed from another computer on your network.\nIf you're used to a CVCS where most operations have that network latency overhead, this aspect of Git will make you think that the gods of speed have blessed Git with unworldly powers.\nBecause you have the entire history of the project right there on your local disk, most operations seem almost instantaneous.\n\nFor example, to browse the history of the project, Git doesn't need to go out to the server to get the history and display it for you -- it simply reads it directly from your local database.\nThis means you see the project history almost instantly.\nIf you want to see the changes introduced between the current version of a file and the file a month ago, Git can look up the file a month ago and do a local difference calculation, instead of having to either ask a remote server to do it or pull an older version of the file from the remote server to do it locally.\n\nThis also means that there is very little you can't do if you're offline or off VPN.\nIf you get on an airplane or a train and want to do a little work, you can commit happily (to your _local_ copy, remember?) 
until you get to a network connection to upload.\nIf you go home and can't get your VPN client working properly, you can still work.\nIn many other systems, doing so is either impossible or painful.\nIn Perforce, for example, you can't do much when you aren't connected to the server; in Subversion and CVS, you can edit files, but you can't commit changes to your database (because your database is offline).\nThis may not seem like a huge deal, but you may be surprised what a big difference it can make.\n" -# END Mixed-strategy chunking output -""" \ No newline at end of file diff --git a/docs/academy/py/standalone/chunking/_snippets/30_example.py b/docs/academy/py/standalone/chunking/_snippets/30_example.py deleted file mode 100644 index 075d18b75..000000000 --- a/docs/academy/py/standalone/chunking/_snippets/30_example.py +++ /dev/null @@ -1,571 +0,0 @@ -# ============================ -# ======= CHUNK DATA ========= -# ============================ - -# START Get text objects from Pro Git book -def get_book_text_objects(): - import requests - - # Source location - text_objs = list() - api_base_url = 'https://api.github.com/repos/progit/progit2/contents/book' # Book base URL - chapter_urls = ['/01-introduction/sections', '/02-git-basics/sections'] # List of section URLs - - # Loop through book chapters - for chapter_url in chapter_urls: - response = requests.get(api_base_url + chapter_url) # Get the JSON data for the section files in the chapter - - # Loop through inner files (sections) - for file_info in response.json(): - if file_info['type'] == 'file': # Only process files (not directories) - file_response = requests.get(file_info['download_url']) - - # Build objects including metadata - chapter_title = file_info['download_url'].split('/')[-3] - filename = file_info['download_url'].split('/')[-1] - text_obj = { - "body": file_response.text, - "chapter_title": chapter_title, - "filename": filename - } - text_objs.append(text_obj) - return text_objs -# END Get text 
# START Get chunks - helper functions
from typing import List

def word_splitter(source_text: str) -> List[str]:
    """Split ``source_text`` into a list of words.

    Any run of whitespace (spaces, tabs, newlines) is first collapsed to a
    single space, so the subsequent split produces no empty tokens between
    words.
    """
    import re
    # Raw strings (r"...") avoid invalid-escape-sequence warnings on Python 3.12+
    source_text = re.sub(r"\s+", " ", source_text)  # Replace multiple whitespaces
    return re.split(r"\s", source_text)  # Split by single whitespace

def get_chunks_fixed_size_with_overlap(text: str, chunk_size: int, overlap_fraction: float) -> List[str]:
    """Chunk ``text`` into windows of ``chunk_size`` words.

    Each chunk after the first is prefixed with the final
    ``int(chunk_size * overlap_fraction)`` words of the preceding window, so
    consecutive chunks share context.

    :param text: source text to chunk.
    :param chunk_size: window size, in words.
    :param overlap_fraction: fraction of ``chunk_size`` to repeat from the
        previous window (truncated to an int).
    :returns: list of chunk strings.
    """
    text_words = word_splitter(text)
    overlap_int = int(chunk_size * overlap_fraction)
    chunks = []
    for i in range(0, len(text_words), chunk_size):
        # max(..., 0) guards the first window, which has no words to overlap
        chunk = " ".join(text_words[max(i - overlap_int, 0): i + chunk_size])
        chunks.append(chunk)
    return chunks

def get_chunks_by_paragraph(source_text: str) -> List[str]:
    """Chunk ``source_text`` by paragraph (blank-line separated)."""
    return source_text.split("\n\n")

def get_chunks_by_paragraph_and_min_length(source_text: str, min_length: int = 25) -> List[str]:
    """Chunk ``source_text`` at asciidoc section markers (lines beginning
    with ``==``), merging consecutive pieces until each chunk holds at least
    ``min_length`` words.

    :param source_text: source text to chunk.
    :param min_length: minimum chunk length in words; defaults to 25, the
        value previously hard-coded here, so existing callers are unaffected.
    :returns: list of merged chunk strings.
    """
    chunks = source_text.split("\n==")

    # Chunking: carry short pieces forward until the buffer is long enough
    new_chunks = list()
    chunk_buffer = ""

    for chunk in chunks:
        new_buffer = chunk_buffer + chunk  # Create new buffer
        new_buffer_words = new_buffer.split(" ")  # Split into words
        if len(new_buffer_words) < min_length:  # Check whether buffer length too small
            chunk_buffer = new_buffer  # Carry over to the next chunk
        else:
            new_chunks.append(new_buffer)  # Long enough - emit as a chunk
            chunk_buffer = ""

    if len(chunk_buffer) > 0:
        new_chunks.append(chunk_buffer)  # Flush any trailing short remainder
    return new_chunks

def build_chunk_objs(book_text_obj, chunks):
    """Wrap raw ``chunks`` into dicts carrying the source object's metadata.

    :param book_text_obj: dict with at least ``chapter_title`` and ``filename``.
    :param chunks: list of chunk strings produced by one chunking strategy.
    :returns: list of dicts ready for import, each tagged with its
        ``chunk_index`` within the source object.
    """
    chunk_objs = list()
    for i, c in enumerate(chunks):
        chunk_obj = {
            "chapter_title": book_text_obj["chapter_title"],
            "filename": book_text_obj["filename"],
            "chunk": c,
            "chunk_index": i,
        }
        chunk_objs.append(chunk_obj)
    return chunk_objs
# END Get chunks - helper functions
book_text_obj["body"] # Get the object's text body - - # Loop through chunking strategies: - for strategy_name, chunks in [ - ["fixed_size_25", get_chunks_fixed_size_with_overlap(text, 25, 0.2)], - ["fixed_size_100", get_chunks_fixed_size_with_overlap(text, 100, 0.2)], - ["para_chunks", get_chunks_by_paragraph(text)], - ["para_chunks_min_25", get_chunks_by_paragraph_and_min_length(text)] - ]: - chunk_objs = build_chunk_objs(book_text_obj, chunks) - - if strategy_name not in chunk_obj_sets.keys(): - chunk_obj_sets[strategy_name] = list() - - chunk_obj_sets[strategy_name] += chunk_objs -# END Get chunks - main body - -""" -# START fixed_size_25 chunks -fixed_size_25 -655 chunks in total -Chunk 0: '=== About Version Control\n\n(((version control)))\nWhat is "`version control`", and why should you care?\nVersion control is a system that records changes to a file or set' -Chunk 1: 'to a file or set of files over time so that you can recall specific versions later.\nFor the examples in this book, you will use software source code as the' -Chunk 2: 'software source code as the files being version controlled, though in reality you can do this with nearly any type of file on a computer.\n\nIf you are a graphic or' -# END fixed_size_25 chunks -# START fixed_size_100 chunks -fixed_size_100 -170 chunks in total -Chunk 0: '=== About Version Control\n\n(((version control)))\nWhat is "`version control`", and why should you care?\nVersion control is a system that records changes to a file or set of files over time so that you can recall specific versions later.\nFor the examples in this book, you will use software source code as the files being version controlled, though in reality you can do this with nearly any type of file on a computer.\n\nIf you are a graphic or web designer and want to keep every version of an image or layout (which you would most certainly want to), a Version Control System (VCS)' -Chunk 1: "keep every version of an image or layout (which you would most 
certainly want to), a Version Control System (VCS) is a very wise thing to use.\nIt allows you to revert selected files back to a previous state, revert the entire project back to a previous state, compare changes over time, see who last modified something that might be causing a problem, who introduced an issue and when, and more.\nUsing a VCS also generally means that if you screw things up or lose files, you can easily recover.\nIn addition, you get all this for very little overhead.\n\n==== Local Version Control Systems\n\n(((version control,local)))\nMany people's version-control method of choice is to copy files into another directory (perhaps a time-stamped directory, if they're" -Chunk 2: "Systems\n\n(((version control,local)))\nMany people's version-control method of choice is to copy files into another directory (perhaps a time-stamped directory, if they're clever).\nThis approach is very common because it is so simple, but it is also incredibly error prone.\nIt is easy to forget which directory you're in and accidentally write to the wrong file or copy over files you don't mean to.\n\nTo deal with this issue, programmers long ago developed local VCSs that had a simple database that kept all the changes to files under revision control.\n\n.Local version control diagram\nimage::images/local.png[Local version control diagram]\n\nOne of the most popular VCS tools was a system called RCS, which is still distributed with many computers today.\nhttps://www.gnu.org/software/rcs/[RCS^] works by keeping patch sets (that is, the differences between" -# END fixed_size_100 chunks -# START para_chunks chunks -para_chunks -549 chunks in total -Chunk 0: '=== About Version Control' -Chunk 1: '(((version control)))\nWhat is "`version control`", and why should you care?\nVersion control is a system that records changes to a file or set of files over time so that you can recall specific versions later.\nFor the examples in this book, you will use software source code as the 
files being version controlled, though in reality you can do this with nearly any type of file on a computer.' -Chunk 2: 'If you are a graphic or web designer and want to keep every version of an image or layout (which you would most certainly want to), a Version Control System (VCS) is a very wise thing to use.\nIt allows you to revert selected files back to a previous state, revert the entire project back to a previous state, compare changes over time, see who last modified something that might be causing a problem, who introduced an issue and when, and more.\nUsing a VCS also generally means that if you screw things up or lose files, you can easily recover.\nIn addition, you get all this for very little overhead.' -# END para_chunks chunks -# START para_chunks_min_25 chunks -para_chunks_min_25 -93 chunks in total -Chunk 0: '=== About Version Control\n\n(((version control)))\nWhat is "`version control`", and why should you care?\nVersion control is a system that records changes to a file or set of files over time so that you can recall specific versions later.\nFor the examples in this book, you will use software source code as the files being version controlled, though in reality you can do this with nearly any type of file on a computer.\n\nIf you are a graphic or web designer and want to keep every version of an image or layout (which you would most certainly want to), a Version Control System (VCS) is a very wise thing to use.\nIt allows you to revert selected files back to a previous state, revert the entire project back to a previous state, compare changes over time, see who last modified something that might be causing a problem, who introduced an issue and when, and more.\nUsing a VCS also generally means that if you screw things up or lose files, you can easily recover.\nIn addition, you get all this for very little overhead.\n' -Chunk 1: "== Local Version Control Systems\n\n(((version control,local)))\nMany people's version-control method of choice is 
to copy files into another directory (perhaps a time-stamped directory, if they're clever).\nThis approach is very common because it is so simple, but it is also incredibly error prone.\nIt is easy to forget which directory you're in and accidentally write to the wrong file or copy over files you don't mean to.\n\nTo deal with this issue, programmers long ago developed local VCSs that had a simple database that kept all the changes to files under revision control.\n\n.Local version control diagram\nimage::images/local.png[Local version control diagram]\n\nOne of the most popular VCS tools was a system called RCS, which is still distributed with many computers today.\nhttps://www.gnu.org/software/rcs/[RCS^] works by keeping patch sets (that is, the differences between files) in a special format on disk; it can then re-create what any file looked like at any point in time by adding up all the patches.\n" -Chunk 2: "== Centralized Version Control Systems\n\n(((version control,centralized)))\nThe next major issue that people encounter is that they need to collaborate with developers on other systems.\nTo deal with this problem, Centralized Version Control Systems (CVCSs) were developed.\nThese systems (such as CVS, Subversion, and Perforce) have a single server that contains all the versioned files, and a number of clients that check out files from that central place.(((CVS)))(((Subversion)))(((Perforce)))\nFor many years, this has been the standard for version control.\n\n.Centralized version control diagram\nimage::images/centralized.png[Centralized version control diagram]\n\nThis setup offers many advantages, especially over local VCSs.\nFor example, everyone knows to a certain degree what everyone else on the project is doing.\nAdministrators have fine-grained control over who can do what, and it's far easier to administer a CVCS than it is to deal with local databases on every client.\n\nHowever, this setup also has some serious downsides.\nThe most obvious is 
the single point of failure that the centralized server represents.\nIf that server goes down for an hour, then during that hour nobody can collaborate at all or save versioned changes to anything they're working on.\nIf the hard disk the central database is on becomes corrupted, and proper backups haven't been kept, you lose absolutely everything -- the entire history of the project except whatever single snapshots people happen to have on their local machines.\nLocal VCSs suffer from this same problem -- whenever you have the entire history of the project in a single place, you risk losing everything.\n" -# END para_chunks_min_25 chunks -""" - -# ===================================== -# ======= IMPORT CHUNKED DATA ========= -# ===================================== - - -# START import chunks -import weaviate -import os -from weaviate.util import generate_uuid5 - -client = weaviate.Client( - "http://localhost:8080", - additional_headers={ - "X-OpenAI-Api-Key": os.environ["OPENAI_APIKEY"], - } -) - -chunk_collection_definition = { - "class": "Chunk", - "vectorizer": "text2vec-openai", - "moduleConfig": { - "generative-openai": {} - }, - "properties": [ - { - "name": "chunk", - "dataType": ["text"], - }, - { - "name": "chapter_title", - "dataType": ["text"], - }, - { - "name": "filename", - "dataType": ["text"], - }, - { - "name": "chunking_strategy", - "dataType": ["text"], - "tokenization": "field", - } - ] -} -client.schema.create_class(chunk_collection_definition) - -with client.batch as batch: - for chunking_strategy, chunk_objects in chunk_obj_sets.items(): - for chunk_obj in chunk_objects: - chunk_obj["chunking_strategy"] = chunking_strategy - batch.add_data_object( - data_object=chunk_obj, - class_name="Chunk", - uuid=generate_uuid5(chunk_obj) - ) -# END import chunks - -# START inspection -print("Total count:") -print(client.query.aggregate("Chunk").with_meta_count().do()) # Get a total count -for chunking_strategy in chunk_obj_sets.keys(): - where_filter = 
{ - "path": ["chunking_strategy"], - "operator": "Equal", - "valueText": chunking_strategy - } - print(f"Object count for {chunking_strategy}") - strategy_count = ( - client.query.aggregate("Chunk") - .with_where(where_filter) - .with_meta_count().do() - ) - print(strategy_count) # Get a count for each strategy -# END inspection - -""" -# START Inspection output -Total count: -{'data': {'Aggregate': {'Chunk': [{'meta': {'count': 1487}}]}}} -Object count for fixed_size_25 -{'data': {'Aggregate': {'Chunk': [{'meta': {'count': 672}}]}}} -Object count for fixed_size_100 -{'data': {'Aggregate': {'Chunk': [{'meta': {'count': 173}}]}}} -Object count for para_chunks -{'data': {'Aggregate': {'Chunk': [{'meta': {'count': 549}}]}}} -Object count for para_chunks_min_25 -{'data': {'Aggregate': {'Chunk': [{'meta': {'count': 93}}]}}} -# END Inspection output -""" - - -# =============================== -# ======= VECTOR SEARCH ========= -# =============================== - -def parse_result(response_object): - return response_object["data"]["Get"]["Chunk"] - -# START vector_search -search_string = "history of git" # Or "available git remote commands" - -for chunking_strategy in chunk_obj_sets.keys(): - where_filter = { - "path": ["chunking_strategy"], - "operator": "Equal", - "valueText": chunking_strategy - } - # END vector_search - print(f'\n{"="*20}') - print(f"Retrieved objects for {chunking_strategy}") - # START vector_search - response = ( - client.query.get("Chunk", ["chunk"]) - .with_near_text({"concepts": [search_string]}) - .with_where(where_filter) - .with_limit(2) - .do() - ) - # END vector_search - for i, chunk_obj in enumerate(parse_result(response)): - print(f'{"="*5} Object {i} {"="*5}') - print(chunk_obj["chunk"]) -# END vector_search - - - -""" -# START fixed_size_25 vector_search_history -==================== -Retrieved objects for fixed_size_25 -===== Object 0 ===== -=== A Short History of Git As with many great things in life, Git began with a bit of creative 
destruction and fiery controversy. The -===== Object 1 ===== -kernel efficiently (speed and data size) Since its birth in 2005, Git has evolved and matured to be easy to use and yet retain these initial qualities. It's amazingly fast, -# END fixed_size_25 vector_search_history - -# START fixed_size_100 vector_search_history -==================== -Retrieved objects for fixed_size_100 -===== Object 0 ===== -=== A Short History of Git As with many great things in life, Git began with a bit of creative destruction and fiery controversy. The Linux kernel is an open source software project of fairly large scope.(((Linux))) During the early years of the Linux kernel maintenance (1991–2002), changes to the software were passed around as patches and archived files. In 2002, the Linux kernel project began using a proprietary DVCS called BitKeeper.(((BitKeeper))) In 2005, the relationship between the community that developed the Linux kernel and the commercial company that developed BitKeeper broke down, and the tool's free-of-charge status was revoked. -===== Object 1 ===== -2005, Git has evolved and matured to be easy to use and yet retain these initial qualities. It's amazingly fast, it's very efficient with large projects, and it has an incredible branching system for non-linear development (see <>). -# END fixed_size_100 vector_search_history - -# START para_chunks vector_search_history -==================== -Retrieved objects for para_chunks -===== Object 0 ===== -Since its birth in 2005, Git has evolved and matured to be easy to use and yet retain these initial qualities. -It's amazingly fast, it's very efficient with large projects, and it has an incredible branching system for non-linear development (see <>). - -===== Object 1 ===== -As with many great things in life, Git began with a bit of creative destruction and fiery controversy. 
-# END para_chunks vector_search_history - -# START para_chunks_min_25 vector_search_history -==================== -Retrieved objects for para_chunks_min_25 -===== Object 0 ===== -=== A Short History of Git - -As with many great things in life, Git began with a bit of creative destruction and fiery controversy. - -The Linux kernel is an open source software project of fairly large scope.(((Linux))) -During the early years of the Linux kernel maintenance (1991–2002), changes to the software were passed around as patches and archived files. -In 2002, the Linux kernel project began using a proprietary DVCS called BitKeeper.(((BitKeeper))) - -In 2005, the relationship between the community that developed the Linux kernel and the commercial company that developed BitKeeper broke down, and the tool's free-of-charge status was revoked. -This prompted the Linux development community (and in particular Linus Torvalds, the creator of Linux) to develop their own tool based on some of the lessons they learned while using BitKeeper.(((Linus Torvalds))) -Some of the goals of the new system were as follows: - -* Speed -* Simple design -* Strong support for non-linear development (thousands of parallel branches) -* Fully distributed -* Able to handle large projects like the Linux kernel efficiently (speed and data size) - -Since its birth in 2005, Git has evolved and matured to be easy to use and yet retain these initial qualities. -It's amazingly fast, it's very efficient with large projects, and it has an incredible branching system for non-linear development (see <>). - -===== Object 1 ===== -== Nearly Every Operation Is Local - -Most operations in Git need only local files and resources to operate -- generally no information is needed from another computer on your network. -If you're used to a CVCS where most operations have that network latency overhead, this aspect of Git will make you think that the gods of speed have blessed Git with unworldly powers. 
-Because you have the entire history of the project right there on your local disk, most operations seem almost instantaneous. - -For example, to browse the history of the project, Git doesn't need to go out to the server to get the history and display it for you -- it simply reads it directly from your local database. -This means you see the project history almost instantly. -If you want to see the changes introduced between the current version of a file and the file a month ago, Git can look up the file a month ago and do a local difference calculation, instead of having to either ask a remote server to do it or pull an older version of the file from the remote server to do it locally. - -This also means that there is very little you can't do if you're offline or off VPN. -If you get on an airplane or a train and want to do a little work, you can commit happily (to your _local_ copy, remember?) until you get to a network connection to upload. -If you go home and can't get your VPN client working properly, you can still work. -In many other systems, doing so is either impossible or painful. -In Perforce, for example, you can't do much when you aren't connected to the server; in Subversion and CVS, you can edit files, but you can't commit changes to your database (because your database is offline). -This may not seem like a huge deal, but you may be surprised what a big difference it can make. -# END para_chunks_min_25 vector_search_history -""" - -""" -# START fixed_size_25 vector_search_remote_repo -==================== -Retrieved objects for fixed_size_25 -===== Object 0 ===== -remote))) To add a new remote Git repository as a shortname you can reference easily, run `git remote add `: [source,console] ---- $ git remote origin $ git remote -===== Object 1 ===== -to and from them when you need to share work. 
Managing remote repositories includes knowing how to add remote repositories, remove remotes that are no longer valid, manage various remote -# END fixed_size_25 vector_search_remote_repo - -# START fixed_size_100 vector_search_remote_repo -==================== -Retrieved objects for fixed_size_100 -===== Object 0 ===== -adds the `origin` remote for you. Here's how to add a new remote explicitly.(((git commands, remote))) To add a new remote Git repository as a shortname you can reference easily, run `git remote add `: [source,console] ---- $ git remote origin $ git remote add pb https://github.com/paulboone/ticgit $ git remote -v origin https://github.com/schacon/ticgit (fetch) origin https://github.com/schacon/ticgit (push) pb https://github.com/paulboone/ticgit (fetch) pb https://github.com/paulboone/ticgit (push) ---- Now you can use the string `pb` on the command line in lieu of the whole URL. For example, if you want to fetch all the information that Paul has but that you don't yet have in your repository, you can run `git fetch pb`: [source,console] ---- $ git fetch pb remote: Counting objects: 43, -===== Object 1 ===== -Managing remote repositories includes knowing how to add remote repositories, remove remotes that are no longer valid, manage various remote branches and define them as being tracked or not, and more. In this section, we'll cover some of these remote-management skills. [NOTE] .Remote repositories can be on your local machine. ==== It is entirely possible that you can be working with a "`remote`" repository that is, in fact, on the same host you are. The word "`remote`" does not necessarily imply that the repository is somewhere else on the network or Internet, only that it is elsewhere. Working with such a remote repository would still involve all the standard pushing, pulling and fetching operations as with any other remote. 
==== -# END fixed_size_100 vector_search_remote_repo - -# START para_chunks vector_search_remote_repo -==================== -Retrieved objects for para_chunks -===== Object 0 ===== -We've mentioned and given some demonstrations of how the `git clone` command implicitly adds the `origin` remote for you. -Here's how to add a new remote explicitly.(((git commands, remote))) -To add a new remote Git repository as a shortname you can reference easily, run `git remote add `: -===== Object 1 ===== -To be able to collaborate on any Git project, you need to know how to manage your remote repositories. -Remote repositories are versions of your project that are hosted on the Internet or network somewhere. -You can have several of them, each of which generally is either read-only or read/write for you. -Collaborating with others involves managing these remote repositories and pushing and pulling data to and from them when you need to share work. -Managing remote repositories includes knowing how to add remote repositories, remove remotes that are no longer valid, manage various remote branches and define them as being tracked or not, and more. -In this section, we'll cover some of these remote-management skills. -# END para_chunks vector_search_remote_repo - -# START para_chunks_min_25 vector_search_remote_repo -==================== -Retrieved objects for para_chunks_min_25 -===== Object 0 ===== -== Adding Remote Repositories - -We've mentioned and given some demonstrations of how the `git clone` command implicitly adds the `origin` remote for you. 
-Here's how to add a new remote explicitly.(((git commands, remote))) -To add a new remote Git repository as a shortname you can reference easily, run `git remote add `: - -[source,console] ----- -$ git remote -origin -$ git remote add pb https://github.com/paulboone/ticgit -$ git remote -v -origin https://github.com/schacon/ticgit (fetch) -origin https://github.com/schacon/ticgit (push) -pb https://github.com/paulboone/ticgit (fetch) -pb https://github.com/paulboone/ticgit (push) ----- - -Now you can use the string `pb` on the command line in lieu of the whole URL. -For example, if you want to fetch all the information that Paul has but that you don't yet have in your repository, you can run `git fetch pb`: - -[source,console] ----- -$ git fetch pb -remote: Counting objects: 43, done. -remote: Compressing objects: 100% (36/36), done. -remote: Total 43 (delta 10), reused 31 (delta 5) -Unpacking objects: 100% (43/43), done. -From https://github.com/paulboone/ticgit - * [new branch] master -> pb/master - * [new branch] ticgit -> pb/ticgit ----- - -Paul's `master` branch is now accessible locally as `pb/master` -- you can merge it into one of your branches, or you can check out a local branch at that point if you want to inspect it. -We'll go over what branches are and how to use them in much more detail in <>. - -[[_fetching_and_pulling]] -===== Object 1 ===== -[[_remote_repos]]= Working with Remotes - -To be able to collaborate on any Git project, you need to know how to manage your remote repositories. -Remote repositories are versions of your project that are hosted on the Internet or network somewhere. -You can have several of them, each of which generally is either read-only or read/write for you. -Collaborating with others involves managing these remote repositories and pushing and pulling data to and from them when you need to share work. 
-Managing remote repositories includes knowing how to add remote repositories, remove remotes that are no longer valid, manage various remote branches and define them as being tracked or not, and more. -In this section, we'll cover some of these remote-management skills. - -[NOTE] -.Remote repositories can be on your local machine. -# END para_chunks_min_25 vector_search_remote_repo -""" - - -# =================================== -# ======= Retreval augmented generation ========= -# =================================== - -# START generative_search -# Set number of chunks to retrieve to compensate for different chunk sizes -n_chunks_by_strat = dict() -# Grab more of shorter chunks -n_chunks_by_strat['fixed_size_25'] = 8 -n_chunks_by_strat['para_chunks'] = 8 -# Grab fewer of longer chunks -n_chunks_by_strat['fixed_size_100'] = 2 -n_chunks_by_strat['para_chunks_min_25'] = 2 - -# Perform Retreval augmented generation -# highlight-start -search_string = "history of git" # Or "available git remote commands" -# highlight-end - -for chunking_strategy in chunk_obj_sets.keys(): - where_filter = { - "path": ["chunking_strategy"], - "operator": "Equal", - "valueText": chunking_strategy - } - # END generative_search - print(f'\n{"="*20}') - print(f"Generated text for {chunking_strategy}") - # START generative_search - response = ( - client.query.get("Chunk", ["chunk"]) - .with_near_text({"concepts": [search_string]}) - # highlight-start - .with_generate( - grouped_task=f"Using this information, please explain {search_string} in a few short points" - ) - # highlight-end - .with_where(where_filter) - # highlight-start - .with_limit(n_chunks_by_strat[chunking_strategy]) # Variable number of chunks retrieved - # highlight-end - .do() - ) - # END generative_search - print(response["data"]["Get"]["Chunk"][0]["_additional"]["generate"]["groupedResult"]) -# END generative_search - - -""" -==================== -# START fixed_size_25 generative_search_git_history -Generated text for 
fixed_size_25 -- Git was created in 2005 as a result of creative destruction and controversy. -- It was designed to handle the Linux kernel efficiently in terms of speed and data size. -- Over time, Git has evolved to be easy to use while retaining its initial qualities. -- Git reconsiders many aspects of version control, making it more like a mini filesystem with powerful tools. -- Git stores the entire history of a project locally, allowing for fast and instantaneous operations. -# END fixed_size_25 generative_search_git_history - -==================== -# START fixed_size_100 generative_search_git_history -Generated text for fixed_size_100 -- In the early years of the Linux kernel maintenance (1991-2002), changes to the software were passed around as patches and archived files. -- In 2002, the Linux kernel project started using a proprietary DVCS called BitKeeper. -- In 2005, the relationship between the Linux kernel community and the company behind BitKeeper broke down, leading to the revocation of the tool's free-of-charge status. -- Since then, Git has evolved and matured, becoming easy to use while retaining its initial qualities. It is known for its speed, efficiency with large projects, and its powerful branching system for non-linear development. -# END fixed_size_100 generative_search_git_history - -==================== -# START para_chunks generative_search_git_history -Generated text for para_chunks -- Git was created in 2005 and has since evolved and matured to be easy to use and efficient with large projects. -- Git has an incredibly fast performance and a powerful branching system for non-linear development. -- Git began with controversy and creative destruction. -- Git is fundamentally different from other version control systems (VCS) in the way it thinks about and stores data. -- Git operates mostly on local files and resources, making operations fast and efficient. -- Git has integrity and ensures the integrity of its data. 
-- Git is more like a mini filesystem with powerful tools built on top of it, rather than just a VCS. -# END para_chunks generative_search_git_history -==================== - -# START para_chunks_min_25 generative_search_git_history -Generated text for para_chunks_min_25 -- Git was created in 2005 by the Linux development community, led by Linus Torvalds, after the breakdown of their relationship with the proprietary DVCS called BitKeeper. -- The goals of Git were to be fast, have a simple design, support non-linear development with thousands of parallel branches, be fully distributed, and handle large projects efficiently. -- Git has evolved and matured since its creation, becoming easy to use while retaining its initial qualities. -- One of the key advantages of Git is that nearly every operation is local, meaning that most operations can be performed without needing information from another computer on the network. -- This local nature of Git allows for fast and instantaneous operations, such as browsing the project history or comparing file versions. -- Being able to work offline or off VPN is also a significant advantage of Git, as it allows users to continue working and committing changes to their local copy until they have a network connection to upload. -# END para_chunks_min_25 generative_search_git_history -""" - - - -""" -==================== -# START fixed_size_25 generative_search_git_remote -Generated text for fixed_size_25 -- `git fetch `: This command retrieves data from the remote repository specified by ``. -- `git remote show `: Running this command with a specific shortname, such as `origin`, displays information about the remote repository, including its branches and configuration. -- `git remote`: This command lists all the remote servers that have been configured for the repository. -- `git remote -v`: Similar to `git remote`, this command lists all the remote servers along with their URLs for fetching and pushing. 
-- `git clone`: This command is used to create a local copy of a remote repository. By default, it sets up the local `master` branch to track the remote repository's `master` branch. -- `git remote add `: This command adds a new remote repository with the specified `` and ``. This allows you to easily fetch and push changes to and from the remote repository. -- `git remote remove `: This command removes the remote repository with the specified `` from the local repository. -# END fixed_size_25 generative_search_git_remote - -==================== -# START fixed_size_100 generative_search_git_remote -Generated text for fixed_size_100 -- The `git remote` command is used to see which remote servers are configured for the repository. It lists the shortnames of each remote handle that has been specified. -- The `git remote -v` command can be used to display more detailed information about the remote repositories, including the URLs for fetching and pushing. -- The `git clone` command automatically adds the `origin` remote when cloning a repository. -- To add a new remote explicitly, the `git remote add ` command can be used. This allows for pulling and pushing to the specified remote repository. -# END fixed_size_100 generative_search_git_remote - -==================== -# START para_chunks generative_search_git_remote -Generated text for para_chunks -- The `git remote` command lists the shortnames of each remote handle that you have configured. -- The `git remote show ` command provides more information about a particular remote. -- The `git remote -v` command shows the URLs associated with each remote. -- The `git remote add ` command adds a new remote Git repository with a specified shortname and URL. -- The `git remote` command can be used to show all the remotes associated with a repository. 
-# END para_chunks generative_search_git_remote -==================== - -# START para_chunks_min_25 generative_search_git_remote -Generated text for para_chunks_min_25 -- The `git remote` command is used to see which remote servers you have configured. It lists the shortnames of each remote handle you've specified. -- The `git remote -v` command shows the URLs that Git has stored for the shortname to be used when reading and writing to that remote. -- The `git remote show ` command provides more information about a particular remote, including the URL for the remote repository, tracking branch information, and details about branches that can be automatically merged or pushed to. -# END para_chunks_min_25 generative_search_git_remote -""" - -# TODO - needs tests diff --git a/docs/academy/py/standalone/chunking/_snippets/what-is-git.asc b/docs/academy/py/standalone/chunking/_snippets/what-is-git.asc deleted file mode 100644 index 8953b9d48..000000000 --- a/docs/academy/py/standalone/chunking/_snippets/what-is-git.asc +++ /dev/null @@ -1,109 +0,0 @@ -[[what_is_git_section]] -=== What is Git? - -So, what is Git in a nutshell? -This is an important section to absorb, because if you understand what Git is and the fundamentals of how it works, then using Git effectively will probably be much easier for you. -As you learn Git, try to clear your mind of the things you may know about other VCSs, such as CVS, Subversion or Perforce -- doing so will help you avoid subtle confusion when using the tool. -Even though Git's user interface is fairly similar to these other VCSs, Git stores and thinks about information in a very different way, and understanding these differences will help you avoid becoming confused while using it.(((Subversion)))(((Perforce))) - -==== Snapshots, Not Differences - -The major difference between Git and any other VCS (Subversion and friends included) is the way Git thinks about its data. 
-Conceptually, most other systems store information as a list of file-based changes. -These other systems (CVS, Subversion, Perforce, Bazaar, and so on) think of the information they store as a set of files and the changes made to each file over time (this is commonly described as _delta-based_ version control). - -.Storing data as changes to a base version of each file -image::images/deltas.png[Storing data as changes to a base version of each file] - -Git doesn't think of or store its data this way. -Instead, Git thinks of its data more like a series of snapshots of a miniature filesystem. -With Git, every time you commit, or save the state of your project, Git basically takes a picture of what all your files look like at that moment and stores a reference to that snapshot. -To be efficient, if files have not changed, Git doesn't store the file again, just a link to the previous identical file it has already stored. -Git thinks about its data more like a *stream of snapshots*. - -.Storing data as snapshots of the project over time -image::images/snapshots.png[Git stores data as snapshots of the project over time] - -This is an important distinction between Git and nearly all other VCSs. -It makes Git reconsider almost every aspect of version control that most other systems copied from the previous generation. -This makes Git more like a mini filesystem with some incredibly powerful tools built on top of it, rather than simply a VCS. -We'll explore some of the benefits you gain by thinking of your data this way when we cover Git branching in <>. - -==== Nearly Every Operation Is Local - -Most operations in Git need only local files and resources to operate -- generally no information is needed from another computer on your network. -If you're used to a CVCS where most operations have that network latency overhead, this aspect of Git will make you think that the gods of speed have blessed Git with unworldly powers. 
-Because you have the entire history of the project right there on your local disk, most operations seem almost instantaneous. - -For example, to browse the history of the project, Git doesn't need to go out to the server to get the history and display it for you -- it simply reads it directly from your local database. -This means you see the project history almost instantly. -If you want to see the changes introduced between the current version of a file and the file a month ago, Git can look up the file a month ago and do a local difference calculation, instead of having to either ask a remote server to do it or pull an older version of the file from the remote server to do it locally. - -This also means that there is very little you can't do if you're offline or off VPN. -If you get on an airplane or a train and want to do a little work, you can commit happily (to your _local_ copy, remember?) until you get to a network connection to upload. -If you go home and can't get your VPN client working properly, you can still work. -In many other systems, doing so is either impossible or painful. -In Perforce, for example, you can't do much when you aren't connected to the server; in Subversion and CVS, you can edit files, but you can't commit changes to your database (because your database is offline). -This may not seem like a huge deal, but you may be surprised what a big difference it can make. - -==== Git Has Integrity - -Everything in Git is checksummed before it is stored and is then referred to by that checksum. -This means it's impossible to change the contents of any file or directory without Git knowing about it. -This functionality is built into Git at the lowest levels and is integral to its philosophy. -You can't lose information in transit or get file corruption without Git being able to detect it. 
- -The mechanism that Git uses for this checksumming is called a SHA-1 hash.(((SHA-1))) -This is a 40-character string composed of hexadecimal characters (0–9 and a–f) and calculated based on the contents of a file or directory structure in Git. -A SHA-1 hash looks something like this: - -[source] ----- -24b9da6552252987aa493b52f8696cd6d3b00373 ----- - -You will see these hash values all over the place in Git because it uses them so much. -In fact, Git stores everything in its database not by file name but by the hash value of its contents. - -==== Git Generally Only Adds Data - -When you do actions in Git, nearly all of them only _add_ data to the Git database. -It is hard to get the system to do anything that is not undoable or to make it erase data in any way. -As with any VCS, you can lose or mess up changes you haven't committed yet, but after you commit a snapshot into Git, it is very difficult to lose, especially if you regularly push your database to another repository. - -This makes using Git a joy because we know we can experiment without the danger of severely screwing things up. -For a more in-depth look at how Git stores its data and how you can recover data that seems lost, see <>. - -==== The Three States - -Pay attention now -- here is the main thing to remember about Git if you want the rest of your learning process to go smoothly. -Git has three main states that your files can reside in: _modified_, _staged_, and _committed_: - -* Modified means that you have changed the file but have not committed it to your database yet. -* Staged means that you have marked a modified file in its current version to go into your next commit snapshot. -* Committed means that the data is safely stored in your local database. - -This leads us to the three main sections of a Git project: the working tree, the staging area, and the Git directory. 
- -.Working tree, staging area, and Git directory -image::images/areas.png["Working tree, staging area, and Git directory"] - -The working tree is a single checkout of one version of the project. -These files are pulled out of the compressed database in the Git directory and placed on disk for you to use or modify. - -The staging area is a file, generally contained in your Git directory, that stores information about what will go into your next commit. -Its technical name in Git parlance is the "`index`", but the phrase "`staging area`" works just as well. - -The Git directory is where Git stores the metadata and object database for your project. -This is the most important part of Git, and it is what is copied when you _clone_ a repository from another computer. - -The basic Git workflow goes something like this: - -1. You modify files in your working tree. -2. You selectively stage just those changes you want to be part of your next commit, which adds _only_ those changes to the staging area. -3. You do a commit, which takes the files as they are in the staging area and stores that snapshot permanently to your Git directory. - -If a particular version of a file is in the Git directory, it's considered _committed_. -If it has been modified and was added to the staging area, it is _staged_. -And if it was changed since it was checked out but has not been staged, it is _modified_. -In <>, you'll learn more about these states and how you can either take advantage of them or skip the staged part entirely. diff --git a/docs/academy/py/standalone/chunking/index.mdx b/docs/academy/py/standalone/chunking/index.mdx deleted file mode 100644 index 0d796c879..000000000 --- a/docs/academy/py/standalone/chunking/index.mdx +++ /dev/null @@ -1,58 +0,0 @@ ---- -title: Chunking long texts -description: "Explore data chunking in Weaviate standalone for optimized storage and performance in Python." -sidebar_position: 101 # Like a subject number (e.g. 
CS101) ---- - -## Unit overview - -import PreviewUnit from '../../../_snippets/preview.mdx' - - - -import ReactPlayer from 'react-player/lazy' - - -
- - - - - -Chunking is an important concept in the world of vector databases and language models. Although we've looked at relatively small pieces of text in previous units, real-world text data can be much longer. - -Think about lengths of articles, transcripts, or even books. Instead of a few words, these texts can be thousands, or tens of thousands of words long if not longer. *The Lord of the Rings*, for example, is over 500,000 words long! - -Chunking splits texts like these into smaller pieces of texts, i.e. "chunks", before storing them in a vector database, or passing them to a language model. - -This can seem relatively innocuous at first, like deciding where to split a sentence or a paragraph into two. But chunking decisions can significantly impact the search performance and behavior of vector databases as well as the output from a language model. This unit covers this seemingly simple, nuanced topic from the perspective of a user. - -We will begin by covering what chunking is, and why it is used. Then, we will move on to cover various chunking methods before discussing key considerations when deciding on a chunking strategy, as well as some suggested starting points. - -By the end of this unit, you will have a good understanding of chunking in general, and be able to implement some solid chunking strategies based on your actual needs. - -### Prerequisites - -- (**Required**) A Python (3) environment with `weaviate-client` installed. 
-- (**Required**) Complete [101A Weaviate Academy Preparation](../../zero_to_mvp/setup.mdx) -- (*Recommended*) Complete [Hello, Weaviate](../../zero_to_mvp/101_hello_weaviate/index.mdx) -- (*Recommended*) Complete [Queries 1](../../zero_to_mvp/102_queries_1/index.mdx) -- (*Recommended*) Complete [Schema and Imports](../../zero_to_mvp/103_schema_and_imports/index.mdx) - -## Learning objectives - -import LearningGoalsExp from '/src/components/Academy/learningGoalsExp.mdx'; - - - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/standalone/index.md b/docs/academy/py/standalone/index.md deleted file mode 100644 index dc4ca059e..000000000 --- a/docs/academy/py/standalone/index.md +++ /dev/null @@ -1,17 +0,0 @@ ---- -title: Short units -description: "Set up Weaviate in standalone mode for streamlined local or small-scale applications." -sidebar_position: 900 ---- - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; -import CourseUnits from '/src/components/Academy/courseUnits.jsx'; -import { courseData } from '/src/components/Academy/courseData.js' - -## Overview - -This section hosts short, standalone units that you can read independently of any other topics. - -## Units - - diff --git a/docs/academy/py/standalone/which_search/05_review.mdx b/docs/academy/py/standalone/which_search/05_review.mdx deleted file mode 100644 index 39b0ed9a9..000000000 --- a/docs/academy/py/standalone/which_search/05_review.mdx +++ /dev/null @@ -1,119 +0,0 @@ ---- -title: Review of search types -description: A review of search strategies in Weaviate to refine your data queries. 
---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!./_snippets/05_review.py'; - -## Overview - -Weaviate offers three primary search types - namely **vector**, **keyword**, and **hybrid** searches. Let's briefly recap what they are, and how they work. - -### Code examples - -These code examples are runnable, with the [`v4` Weaviate Python client](/weaviate/client-libraries/python/index.mdx). Connect to the pre-configured demo instance of Weaviate with the following code, and try the examples below. - - - - - - - - - -## Vector search - -A vector search finds objects with the most similar vectors to the query vector. - -Because each vector is a numerical representation of the underlying object, a vector similarity can be thought of as a similarity in meaning. Therefore a vector search is also called "semantic search". - -In Weaviate, you can search for objects with similar vectors in any of the following ways: - -With a source medium (e.g. text or image): - - - - - - - - - -With a vector: - - - - - - - - - -With an existing Weaviate object: - - - - - - - - - -## Keyword search - -A keyword search finds objects whose keywords (i.e. tokens) are the most relevant to the keywords (i.e. tokens) of the query. The relevance is determined by the [BM25F algorithm](https://en.wikipedia.org/wiki/Okapi_BM25). - -Intuitively, the BM25F algorithm determines "relevance" by considering how often a keyword appears in each field of the object, relative to how commonly the keyword appears in the entire dataset. - - - - - - - - - -## Hybrid search - -A hybrid search combines the results of a vector search and a keyword search. This is done by performing both searches, and them combining the two search results with a "fusion" algorithm. 
- - - - - - - - diff --git a/docs/academy/py/standalone/which_search/10_strengths.mdx b/docs/academy/py/standalone/which_search/10_strengths.mdx deleted file mode 100644 index cd5532bc4..000000000 --- a/docs/academy/py/standalone/which_search/10_strengths.mdx +++ /dev/null @@ -1,204 +0,0 @@ ---- -title: Strengths of each search type -description: Discover the strengths of each Weaviate search approach for specific use cases. ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!./_snippets/10_strengths.py'; - - -## Overview - -These different search types are offered because they each have different characteristics, and therefore different strengths. - -Let's explore the relative strengths of each search type. - -## Strengths of vector search - -### Robustness - -A vector is a numerical representation of the underlying object's meaning. As a result, a vector search is robust to any changes that don't affect the meaning of the object. - -More concretely, a vector of "cat", for example, will be similar to a vector of "kitten", "feline", and "pet", even though their spellings are very different. - -See this in action below, where we search for "cat" and "kitten" using vector search. - - - - - - - - - -You see that the results for "cat" and "kitten" are very similar. - -In other words, the vectors for "cat" and "kitten" are similar in meaning, because the model can "understand" meaning. - - - -Similarly, a vector of "cat" is similar to the vector of "cat" with a spelling mistake, such as "caat", or "catt". - - - - - - - - - -Here, the results are basically identical. - - - -This robustness is a key strength of vector search, as it means that the searcher does not need to know the exact words used in the dataset. 
This is particularly useful when the concepts being searched for are not well-defined, or when the searcher is not familiar with the dataset. - -### Versatility - -A vector search is also versatile. It can be used to search multiple data modalities (e.g. text, images, audio, etc.), and across multiple languages. - -This is because the vector is a numerical representation of the underlying object's meaning, and therefore the same vector can be used to represent the same meaning, regardless of the data modality or language. - -In fact, some models are capable of search across multiple data modalities, or multiple languages (or both!). This is made possible by using a model that can convert the data into comparable vectors, regardless of the data modality or language. - -![Vectorization across multiple modalities](./_img/multimodal_example.png) - -## Strengths of keyword search - -### Exactitude - -Keyword search is ideal for situations where locating precise matches are required. This is particularly useful in scenarios where there exist exact terms to search for, such as academic research, searches through domain-specific data or technical troubleshooting. - -The ability to return results that precisely match the search terms ensures that users receive the most relevant information for their specific queries. - -More concretely, take a look at the example below, where we search for "imaging". - - - - - - - - - -And when we inspect the results: - - - -A search for "imaging" using a keyword search returns the one result that contains that specific word. - - -## Strengths of hybrid search - -A key strength of hybrid search is its resiliency. Let's explore this in more detail. - -### Resiliency - -A hybrid search is resilient as it combines top results from both vector and keyword search. This helps to mitigate either search's shortcomings. - -Take a look at the hybrid search example below. 
- - - - - - - - - -We can inspect the results here: - - - -You can see that as well as the keyword search result above (for "imaging"), we get a semantically relevant result (for "X-rays"). - -Because hybrid search combines the results of both vector and keyword search, it will find objects that score well on at least one of the search types. This approach has the effect of complementing each search type. - - - diff --git a/docs/academy/py/standalone/which_search/20_selection.mdx b/docs/academy/py/standalone/which_search/20_selection.mdx deleted file mode 100644 index aa2d4749d..000000000 --- a/docs/academy/py/standalone/which_search/20_selection.mdx +++ /dev/null @@ -1,105 +0,0 @@ ---- -title: Selecting the right search type -description: Select the best Weaviate search strategy to optimize results. ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!./_snippets/20_selection.py'; - - -## Overview - -Selecting the right search type is key for effective search tasks. Let's explore how to select the right search type for your needs. - - -## Rules of thumb - - -### When to use vector search - -Vector search is the most robust and versatile search type. As such, it is well-suited for situations where the the meaning, or the vector representation, is of the highest importance. - -In cross-modal, object-based or multi-lingual searches, vector search may be the only viable option. - -Start with vector search for: - -- **Non-text, or cross-modal searches**: Essential for searching across different types of media, like finding images using text descriptions or vice versa. -- **Object-based searches**: For finding similar objects to an extracted text chunk, image, or video, vector search is likely the only viable solution. 
-- **Multi-lingual contexts**: The go-to choice for handling searches in multiple languages, where traditional keyword-based search may fall short. -- **Complex query understanding**: Vector search excels in interpreting and responding to complex queries that require understanding context or nuances in language. - -### When to use keyword search - -Keyword search is useful when there is an expectation or requirement to match the exact search terms. This can be the case for specific domains such as legal, medical or technical areas where the exact terminology is important. - -Keyword search is also useful when the user is unlikely to make mistakes in inputs and is inputting a predictable set of terms, such as through a sanitized form or a drop-down menu. - -In summary, start with keyword search for: - -**Exact term matching**: Ideal in domains like legal, medical, or technical fields where specific terminology is crucial. -**Predictable user inputs**: Works well when users are expected to input a defined set of terms, like through forms or drop-down menus. -**Simple and direct queries**: Effective for straightforward search needs where the complexity of natural language processing is not required. -**Fast and specific results**: Suitable for quick retrieval of information based on specific keywords or phrases. - -### When to use hybrid search - -Hybrid search is a great choice for "messy" situations. - -Because hybrid search combines results sets from both vector and keyword searches, it is able to provide a good balance between the robustness of vector search and the exactitude of keyword search. - -As a result, hybrid search is a generally good choice for most search needs that do not fall into the specific use cases of vector or keyword search. - -In summary, consider hybrid search for: - -- **Broad topic ranges**: Effective in scenarios where the target corpus covers a wide array of subjects, requiring a versatile search approach. 
-- **Versatile search scenarios**: Useful for real-life scenarios that often require a combination of results from both vector and keyword searches. -- **Unpredictable user inputs**: Ideal for many real-life scenarios where the user has free reign over the query. Some user queries may be aimed at direct matches while others' queries may be more about the overall meaning. - - - diff --git a/docs/academy/py/standalone/which_search/30_strategies.mdx b/docs/academy/py/standalone/which_search/30_strategies.mdx deleted file mode 100644 index c091c1df3..000000000 --- a/docs/academy/py/standalone/which_search/30_strategies.mdx +++ /dev/null @@ -1,210 +0,0 @@ ---- -title: Strategies to improve search results -description: Explore various Weaviate search strategies to enhance data retrieval. ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!./_snippets/30_strategies.py'; - - -## Overview - -In addition to selecting the right search types, there are also strategies you can employ to improve the quality of your search results. - -Let's explore some of these strategies. - -## Improve vector search - -The key to improving vector search is to make sure that the vector representation of the object is fit for purpose, so as to suit the search needs. - -### Vectorizer selection - -Unless you are inserting data with your own vectors, you will be using a Weaviate vectorizer module, and a model within that module, to generate vectors for your data. - -The choice of vectorizer module and model is important, as it will determine what aspects of the data are captured in the vector representation, and how well the model is able to "understand" the data. - -First and foremost, you should select a vectorizer module that is best suited for your data type. 
For example, if you are working with text data, you should use the `text2vec` module, and if you are using image or multi-modal data, you should likely use the `multi2vec` module. - -We will cover vectorizer selection in another unit. But, if you are not sure where to start, try: -- `text2vec-cohere`, or `text2vec-openai` for text data (API-based) - - Cohere offers a multi-lingual model that can be used with over 100 languages. -- `multi2vec-clip` for image or image and text data. - -If you are working with text and prefer to run a local inference container, try `text2vec-transformers`, with a popular model such as `sentence-transformers/all-MiniLM-L12-v2`. - -### Try a re-ranker - -Re-ranker modules are a great way to improve the quality of your search results. - -A re-ranker module is a module that takes in the results of a vector search, and re-ranks the results based on additional criteria, or a different model. This allows a higher-quality (but slower) model to be used for re-ranking, while still benefiting from the fast first stage search. - -For example, you can use the `text2vec-cohere` module to perform a vector search, and then use the `reranker-cohere` module to re-rank the results using a different model. - -### Property selection - -Vectorization captures the "meaning" of the object. Accordingly, if a property is not relevant to the criteria to be applied for search, it should be excluded from the vectorization process. - -As an example, if a product object includes metadata such as its manufacturing process or location, and the vector search is intended to be based on the product's features, then the properties for manufacturing process and location should be excluded from the vectorization process. - -You can do this by specifying whether to skip a property during vectorization, as shown below. Note that you can do the same with the collection name, and the property name. 
- - - - - - - - - -### Chunking - -Chunking refers to the process of splitting a text into smaller chunks, and vectorizing each chunk separately. This is very important, as it defines how much information each vector contains. - -As a rule of thumb, the more granular the search needs, the smaller the chunk size should be. For example, if you are searching for specific concepts and ideas, you should chunk data into smaller units such as sentences or small windows of text. Alternatively, if you are searching for broader concepts, such as finding relevant chapters or books, you might chunk text accordingly. - -Read more about it in the [chunking unit](../chunking/index.mdx) of Weaviate Academy. - -## Improve keyword search - -### Tokenization - -Although we refer to BM25 search as a "keyword" search, in reality the exact matches are for "tokens", rather than words. This is a different tokenization process to that used for generating vector embeddings, but instead, it is used to build the inverted index for BM25 searches and filtering. - -Accordingly, the tokenization process is very important, as it determines what tokens are used for matching. - -The available options are: `word`, `lowercase`, `whitespace`, and `field`. The default (`word`) might be sufficient for prose, but for text where exact matches including case and symbols are important, something like `whitespace` might be more appropriate. - -Available tokenization options: - -import TokenizationDefinition from '/_includes/tokenization_definition.mdx'; - - - -You can set tokenization in the collection configuration. - - - - - - - - - -### Select and boost properties - -If you observe that matches in some properties are having too much of an impact, you can exclude them from the search, and/or boost the importance certain properties. - -For example, matches in the `description` property might be more important than matches in the `notes` property. You can specify this at query time. 
- - - - - - - - - -## Improve hybrid search - -### Alpha - -The alpha parameter determines the balance between the vector and keyword search results. - -If you want to configure your search to be more vector-based, you can increase the alpha value. Conversely, if you want to configure your search to be more keyword-based, you can decrease the alpha value. - - - - - - - - - -### Fusion algorithm - -The fusion algorithm determines how the results from the vector and keyword searches are combined. - -By default, an inverse of the ranks from each results set are summed, in what is called the "ranked fusion" algorithm. However, you can also use the "relative score fusion" algorithm, which sums normalized scores from each results set. - -Generally, we have found that the "relative score fusion" algorithm works better, but you should try both to see which works best for your use case. - - - - - - - - - - diff --git a/docs/academy/py/standalone/which_search/_30_improve_search.mdx b/docs/academy/py/standalone/which_search/_30_improve_search.mdx deleted file mode 100644 index 2447a084e..000000000 --- a/docs/academy/py/standalone/which_search/_30_improve_search.mdx +++ /dev/null @@ -1,13 +0,0 @@ ---- -title: Improving search ---- - -- Evaluating search quality -- Improving search quality - - Vectorizer - - Data structure - - Search type -- Improving search speed - - Inference speed - - Resource demands - - Query bottlenecks diff --git a/docs/academy/py/standalone/which_search/_img/multimodal_example.png b/docs/academy/py/standalone/which_search/_img/multimodal_example.png deleted file mode 100644 index 628d4d5be..000000000 Binary files a/docs/academy/py/standalone/which_search/_img/multimodal_example.png and /dev/null differ diff --git a/docs/academy/py/standalone/which_search/_snippets/05_review.py b/docs/academy/py/standalone/which_search/_snippets/05_review.py deleted file mode 100644 index 4982b0e95..000000000 --- 
a/docs/academy/py/standalone/which_search/_snippets/05_review.py +++ /dev/null @@ -1,105 +0,0 @@ -from uuid import UUID -import os - -weaviate_url = os.getenv("WEAVIATE_URL") -weaviate_key = os.getenv("WEAVIATE_API_KEY") - -# START connectionCode -import weaviate -import weaviate.classes as wvc - -client = weaviate.connect_to_weaviate_cloud( - cluster_url=weaviate_url, - auth_credentials=wvc.init.Auth.api_key(weaviate_key), - headers={ - "X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY") # Replace with your OpenAI API key (for vector and hybrid searches) - } -) -# END connectionCode - -# START nearTextExample -questions = client.collections.use("JeopardyQuestion") -response = questions.query.near_text( - query="space travel", # Your query string - limit=2 -) - -for o in response.objects: - print(o.uuid) - print(o.properties) -# END nearTextExample - -assert len(response.objects) == 2 -assert "question" in response.objects[0].properties.keys() - - -response = questions.query.near_text( - query="space travel", - limit=1, - include_vector=True -) -vector_input = response.objects[0].vector -object_input = response.objects[0].uuid - -# START nearVectorExample -questions = client.collections.use("JeopardyQuestion") -response = questions.query.near_vector( - near_vector=vector_input, # Your vector object - limit=2 -) - -for o in response.objects: - print(o.uuid) - print(o.properties) -# END nearVectorExample - -assert len(response.objects) == 2 -assert "question" in response.objects[0].properties.keys() - - -# START nearObjectExample -questions = client.collections.use("JeopardyQuestion") -response = questions.query.near_object( - near_object=object_input, # Your object UUID - limit=2 -) - -for o in response.objects: - print(o.uuid) - print(o.properties) -# END nearObjectExample - -assert len(response.objects) == 2 -assert "question" in response.objects[0].properties.keys() - - -# START bm25Example -questions = client.collections.use("JeopardyQuestion") -response = 
questions.query.bm25( - query="space travel", # Your query string - limit=2 -) - -for o in response.objects: - print(o.uuid) - print(o.properties) -# END bm25Example - -assert len(response.objects) == 2 -assert "question" in response.objects[0].properties.keys() - - -# START hybridExample -questions = client.collections.use("JeopardyQuestion") -response = questions.query.hybrid( - query="space travel", # Your query string - limit=2 -) - -for o in response.objects: - print(o.uuid) - print(o.properties) -# END hybridExample - -assert len(response.objects) == 2 -assert "question" in response.objects[0].properties.keys() diff --git a/docs/academy/py/standalone/which_search/_snippets/10_strengths.py b/docs/academy/py/standalone/which_search/_snippets/10_strengths.py deleted file mode 100644 index 41f5b71ee..000000000 --- a/docs/academy/py/standalone/which_search/_snippets/10_strengths.py +++ /dev/null @@ -1,160 +0,0 @@ -from uuid import UUID -import os -import weaviate -import weaviate.classes as wvc -import json - -weaviate_url = os.getenv("WEAVIATE_URL") -weaviate_key = os.getenv("WEAVIATE_API_KEY") - -client = weaviate.connect_to_weaviate_cloud( - cluster_url=weaviate_url, - auth_credentials=wvc.init.Auth.api_key(weaviate_key), - headers={ - "X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY") - } -) - - -# START robustnessExampleWords -for query in ["cat", "kitten"]: - questions = client.collections.use("JeopardyQuestion") - response = questions.query.near_text( - query=query, - limit=1, - return_metadata=wvc.query.MetadataQuery(distance=True), - return_properties=["question", "answer"] - ) - - for o in response.objects: - print(f"\n===== Search results for {query} =====") - print(f"Distance: {o.metadata.distance:.3f}") - print(json.dumps(o.properties, indent=2)) -# END robustnessExampleWords - -assert len(response.objects) == 1 -assert "question" in response.objects[0].properties.keys() - - -""" -# START responseRobustnessExampleWords -===== Search results for cat ===== 
-Distance: 0.170 -{ - "answer": "Fat cat", - "question": "A flabby tabby" -} - -===== Search results for kitten ===== -Distance: 0.150 -{ - "answer": "Fat cat", - "question": "A flabby tabby" -} -# END responseRobustnessExampleWords -""" - - - -# START robustnessExampleSpelling -for query in ["cat", "catt", "caat"]: - questions = client.collections.use("JeopardyQuestion") - response = questions.query.near_text( - query=query, - limit=1, - return_metadata=wvc.query.MetadataQuery(distance=True), - return_properties=["question", "answer"] - ) - - for o in response.objects: - print(f"\n===== Search results for {query} =====") - print(f"Distance: {o.metadata.distance:.3f}") - print(json.dumps(o.properties, indent=2)) -# END robustnessExampleSpelling - -assert len(response.objects) == 1 -assert "question" in response.objects[0].properties.keys() - - -""" -# START responseRobustnessExampleSpelling -===== Search results for cat ===== -Distance: 0.170 -{ - "answer": "Fat cat", - "question": "A flabby tabby" -} - -===== Search results for catt ===== -Distance: 0.177 -{ - "answer": "Fat cat", - "question": "A flabby tabby" -} - -===== Search results for caat ===== -Distance: 0.184 -{ - "answer": "Fat cat", - "question": "A flabby tabby" -} -# END responseRobustnessExampleSpelling -""" - - -# START bm25Example -questions = client.collections.use("JeopardyQuestion") -response = questions.query.bm25( - query="imaging", # Your query string - return_properties=["question", "answer"], - limit=2 -) - -for o in response.objects: - print(o.uuid) - print(json.dumps(o.properties, indent=2)) -# END bm25Example - -assert "question" in response.objects[0].properties.keys() - -""" -# START bm25Results -49fe3d7c-61a5-5916-99bb-052d07c7c251 -{ - "answer": "magnetic resonance imaging", - "question": "MRI, which stands for this, cannot be used on patients with pacemakers or artificial metal joints" -} -# END bm25Results -""" - - -# START hybridExample -questions = 
client.collections.use("JeopardyQuestion") -response = questions.query.hybrid( - query="imaging", # Your query string - return_properties=["question", "answer"], - limit=2 -) - -for o in response.objects: - print(o.uuid) - print(json.dumps(o.properties, indent=2)) -# END hybridExample - -assert "question" in response.objects[0].properties.keys() - - -""" -# START hybridResults -49fe3d7c-61a5-5916-99bb-052d07c7c251 -{ - "answer": "magnetic resonance imaging", - "question": "MRI, which stands for this, cannot be used on patients with pacemakers or artificial metal joints" -} -9041bce6-b5d1-5637-bcbe-2ebb8a689fe0 -{ - "answer": "X-rays", - "question": "These electromagnetic rays used to take pictures of your insides were originally known as Roentgen rays" -} -# END hybridResults -""" diff --git a/docs/academy/py/standalone/which_search/_snippets/20_selection.py b/docs/academy/py/standalone/which_search/_snippets/20_selection.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/docs/academy/py/standalone/which_search/_snippets/30_strategies.py b/docs/academy/py/standalone/which_search/_snippets/30_strategies.py deleted file mode 100644 index 00aea2d7c..000000000 --- a/docs/academy/py/standalone/which_search/_snippets/30_strategies.py +++ /dev/null @@ -1,135 +0,0 @@ -from uuid import UUID -import os -import weaviate -import weaviate.classes as wvc -import json - - -client = weaviate.connect_to_local( - headers={ - "X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY") - } -) - - -client.collections.delete("Product") - -# START skipVectorizationExample -products = client.collections.create( - name="Product", - vector_config=wvc.Configure.Vectors.text2vec_openai( - # highlight-start - vectorize_collection_name=True - # highlight-end - ), - properties=[ - wvc.config.Property( - name="name", - data_type=wvc.config.DataType.TEXT, - # highlight-start - vectorize_property_name=True, - # highlight-end - ), - wvc.config.Property( - name="description", - 
data_type=wvc.config.DataType.TEXT, - ), - wvc.config.Property( - name="manufacturing_process", - data_type=wvc.config.DataType.TEXT, - # highlight-start - skip_vectorization=True # Skip unwanted property - # highlight-end - ), - ] -) -# END skipVectorizationExample - -client.collections.delete("Product") - - - -client.collections.delete("SomeCollection") - -# START tokenizationExample -things = client.collections.create( - name="SomeCollection", - properties=[ - wvc.config.Property( - name="name", - data_type=wvc.config.DataType.TEXT, - # highlight-start - tokenization=wvc.config.Tokenization.WORD # Default - # highlight-end - ), - wvc.config.Property( - name="description", - data_type=wvc.config.DataType.TEXT, - tokenization=wvc.config.Tokenization.WHITESPACE # Will keep case & special characters - ), - wvc.config.Property( - name="email", - data_type=wvc.config.DataType.TEXT, - # highlight-start - tokenization=wvc.config.Tokenization.FIELD # Do not tokenize at all - # highlight-end - ), - ] -) -# END tokenizationExample - -client.collections.delete("SomeCollection") - -from weaviate.classes.init import Auth - -weaviate_url = os.getenv("WEAVIATE_URL") -weaviate_key = os.getenv("WEAVIATE_API_KEY") - -client = weaviate.connect_to_weaviate_cloud( - cluster_url=weaviate_url, - auth_credentials=Auth.api_key(weaviate_key), - headers={ - "X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY") - } -) - -# START selectAndBoostExample -questions = client.collections.use("JeopardyQuestion") - -response = questions.query.bm25( - "animal", - limit=5, - query_properties=["question^3", "answer"] # Boost the impact of "question" property by 3 -) - -for o in response.objects: - print(o.properties) -# END selectAndBoostExample - - -# START adjustAlpha -questions = client.collections.use("JeopardyQuestion") - -response = questions.query.hybrid( - "imaging", - alpha=0.1, # Mostly a vector search (Try it with alpha=0.9) - limit=5 -) - -for o in response.objects: - print(o.properties) -# END 
adjustAlpha - - -# START changeFusionType -questions = client.collections.use("JeopardyQuestion") - -response = questions.query.hybrid( - "imaging", - fusion_type=wvc.query.HybridFusion.RELATIVE_SCORE, # Use relative score fusion - limit=5 -) - -for o in response.objects: - print(o.properties) -# END changeFusionType diff --git a/docs/academy/py/standalone/which_search/index.mdx b/docs/academy/py/standalone/which_search/index.mdx deleted file mode 100644 index 73e99beb6..000000000 --- a/docs/academy/py/standalone/which_search/index.mdx +++ /dev/null @@ -1,53 +0,0 @@ ---- -title: Which search is right for me? -description: Compare different Weaviate search methods to choose the best for your project. -sidebar_position: 10 ---- - -## Unit overview - - - - - - - -Weaviate offers three distinct search methods - namely **vector**, **keyword**, and **hybrid** searches. - -Each method has its unique strengths and applicabilities, making the selection critical to the success of your search-related tasks. - -This section compares these search types to equip you with the knowledge to intuit when and why to employ each of these search methodologies. - -We will explore how the choice of search type impacts not only the quality of the search results but also the overall performance of the search operation. - -Then, we will also discuss strategies to improve the quality of search results, as well as the performance of the search operation. - - -### Prerequisites - -- A Python (3) environment with `weaviate-client` installed. -- Familiarity with Weaviate's search capabilities. -- Intermediate coding proficiency (e.g. Python). -- (Recommended) Complete [Queries 1](../../zero_to_mvp/102_queries_1/index.mdx) & [Queries 2](../../zero_to_mvp/104_queries_2/index.mdx). 
- - -## Learning objectives - -import LearningGoalsExp from '/src/components/Academy/learningGoalsExp.mdx'; - - - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_custom_vectors/101_setup_weaviate/10_client.mdx b/docs/academy/py/starter_custom_vectors/101_setup_weaviate/10_client.mdx deleted file mode 100644 index 0099489fa..000000000 --- a/docs/academy/py/starter_custom_vectors/101_setup_weaviate/10_client.mdx +++ /dev/null @@ -1,36 +0,0 @@ ---- -title: Weaviate Python client -description: Client Setup for Custom Vectors in Weaviate ---- - -## Installation - -The latest Weaviate Python client library can be installed using pip. The client library is tested on Python 3.8 and later. Install it using the following command: - -```bash -pip install -U weaviate-client -``` - -The latest major version is `v4` (e.g. `4.x.x`). You can check the version like so: - -```bash -pip show weaviate-client -``` - -## Basic usage - -From Python, you can load the Weaviate client library like so: - -```python -import weaviate -``` - -The client provides sets of helper classes (e.g. under `weaviate.classes`) and functions to make it easier to interact with Weaviate. - -Next, we'll show you how create a Weaviate instance and connect to it. 
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_custom_vectors/101_setup_weaviate/20_create_instance/10_create_wcs.mdx b/docs/academy/py/starter_custom_vectors/101_setup_weaviate/20_create_instance/10_create_wcs.mdx deleted file mode 100644 index afee6d409..000000000 --- a/docs/academy/py/starter_custom_vectors/101_setup_weaviate/20_create_instance/10_create_wcs.mdx +++ /dev/null @@ -1,67 +0,0 @@ ---- -title: "Option 1: A Weaviate Cloud (WCD) instance" -description: Creating Weaviate Cloud Instance for Custom Vectors ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../../_snippets/101_connect.py'; - -Here, you will create a Weaviate Cloud (WCD) instance. WCD is a fully managed Weaviate instance that runs in the cloud. It's a great way to get started with Weaviate, as it requires no installation or maintenance. - -### Log in to the WCD Console - -Go to the [WCD Console](https://console.weaviate.cloud/) and log in with your credentials. If you don't have an account yet, you can sign up by clicking on the Register here link from the login screen. - -### Create a Weaviate instance - -From the console, go to the Dashboard and click on the Create cluster button. From the following screen: - -- Select the "Free sandbox" tab -- Provide a cluster name -- Set "Enable authentication" to "Yes" - -Click on the Create button to create your Weaviate instance. The process will take a few minutes. - -### Retrieve your Weaviate instance details - -Once the instance is created, you will be able see its details by clicking on the Details button. Find the cluster URL and the API key. - -You will need these details to connect to your Weaviate instance. 
- -### Connect to your WCD instance - -To connect to the Weaviate Cloud (WCD) instance, you need to use the cluster URL and the API key. You can find these details in the WCD Console. - -Use the `connect_to_weaviate_cloud` function to connect to your WCD instance. - - - -#### Provide inference API keys - -Some Weaviate modules can use inference APIs for vectorizing data or large language model integration. You can provide the API keys for these services to Weaviate at instantiation. - -This course uses Cohere, so you can provide the Cohere API key to Weaviate through `headers={"X-Cohere-Api-Key": }` as shown below: - - - -:::note What next? -If you have completed this, you can skip the next page [Option 2: A local Weaviate instance](./20_create_docker.mdx) and continue with [Communicate with Weaviate](../30_communicate.mdx). -::: - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_custom_vectors/101_setup_weaviate/20_create_instance/20_create_docker.mdx b/docs/academy/py/starter_custom_vectors/101_setup_weaviate/20_create_instance/20_create_docker.mdx deleted file mode 100644 index 0ee50b229..000000000 --- a/docs/academy/py/starter_custom_vectors/101_setup_weaviate/20_create_instance/20_create_docker.mdx +++ /dev/null @@ -1,90 +0,0 @@ ---- -title: "Option 2: A local Docker instance" -description: Creating Weaviate Docker Instance for Custom Vectors ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../../_snippets/101_connect.py'; - -:::note Have you already created a Weaviate instance? -If you have [created a cloud instance](./10_create_wcs.mdx) of Weaviate, you can skip this page and continue with [Communicate with Weaviate](../30_communicate.mdx). -::: - -Here, you will create a Weaviate instance using Docker. 
- -### Download and run the docker-compose file - -Install Docker on your machine. We recommend following the [official Docker installation guide](https://docs.docker.com/get-docker/). - -Create a new directory and navigate to it in your terminal. Then, create a new file called `docker-compose.yml` and add the following content: - -```yaml ---- -services: - weaviate_anon: - command: - - --host - - 0.0.0.0 - - --port - - '8080' - - --scheme - - http - image: cr.weaviate.io/semitechnologies/weaviate:||site.weaviate_version|| - ports: - - 8080:8080 - - 50051:50051 - restart: on-failure:0 - environment: - QUERY_DEFAULTS_LIMIT: 25 - AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' - PERSISTENCE_DATA_PATH: '/var/lib/weaviate' - ENABLE_API_BASED_MODULES: 'true' - BACKUP_FILESYSTEM_PATH: '/var/lib/weaviate/backups' - CLUSTER_HOSTNAME: 'node1' -... - -``` - -### Create a Weaviate instance - -Run the following command to start Weaviate: - -```bash -docker compose up -``` - -### Retrieve your Weaviate instance details - -Once the instance is created, you can access it at `http://localhost:8080`. - -### Connect to your Weaviate instance - -To connect to the Weaviate instance, use the `connect_to_local` function. - - - -#### Provide inference API keys - -Some Weaviate modules can use inference APIs for vectorizing data or large language model integration. You can provide the API keys for these services to Weaviate at instantiation. 
- -This course uses Cohere, so you can provide the Cohere API key to Weaviate through `headers={"X-Cohere-Api-Key": }` as shown below: - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_custom_vectors/101_setup_weaviate/20_create_instance/index.mdx b/docs/academy/py/starter_custom_vectors/101_setup_weaviate/20_create_instance/index.mdx deleted file mode 100644 index 64e8deb59..000000000 --- a/docs/academy/py/starter_custom_vectors/101_setup_weaviate/20_create_instance/index.mdx +++ /dev/null @@ -1,19 +0,0 @@ ---- -title: Create a Weaviate instance -description: Creating a Custom Vectors Instance in Weaviate ---- - -For this unit, you can choose to create a Weaviate Cloud (WCD) instance or a local Docker instance. - -- [Create a Weaviate Cloud (WCD) instance](./10_create_wcs.mdx) - - If you want a managed service and don't want to worry about installation and maintenance. -- [Create a local Docker instance](./20_create_docker.mdx) - - If you want to run Weaviate on your local machine, or want to have full control over the installation and maintenance. - -Either option is fine for this course. If you're not sure which to choose, we recommend starting with a WCD instance. 
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_custom_vectors/101_setup_weaviate/30_communicate.mdx b/docs/academy/py/starter_custom_vectors/101_setup_weaviate/30_communicate.mdx deleted file mode 100644 index a96584db9..000000000 --- a/docs/academy/py/starter_custom_vectors/101_setup_weaviate/30_communicate.mdx +++ /dev/null @@ -1,65 +0,0 @@ ---- -title: Communicate with Weaviate -description: Communication Setup in Weaviate for Custom Vectors ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/101_connect.py'; - -Here, we'll perform basic operations to communicate with Weaviate using the Python client library. - -### Check Weaviate status - -You can check whether the Weaviate instance is up using the `is_live` function. - - - -### Retrieve server meta information - -You can retrieve meta information about the Weaviate instance using the `meta` function. - - - -This will print the server meta information to the console. The output will look similar to the following: - -
- Example get_meta output - - -
- -### Close the connection - -After you have finished using the Weaviate client, you should close the connection. This frees up resources and ensures that the connection is properly closed. - -We suggest using a `try`-`finally` block as a best practice. For brevity, we will not include the `try`-`finally` blocks in the remaining code snippets. - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_custom_vectors/101_setup_weaviate/index.mdx b/docs/academy/py/starter_custom_vectors/101_setup_weaviate/index.mdx deleted file mode 100644 index 1a3728da2..000000000 --- a/docs/academy/py/starter_custom_vectors/101_setup_weaviate/index.mdx +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: Set up Weaviate -description: Step-by-step guide to setting up Weaviate with custom vectors. ---- - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_custom_vectors/102_object_collections/10_preparation.mdx b/docs/academy/py/starter_custom_vectors/102_object_collections/10_preparation.mdx deleted file mode 100644 index 853153a08..000000000 --- a/docs/academy/py/starter_custom_vectors/102_object_collections/10_preparation.mdx +++ /dev/null @@ -1,36 +0,0 @@ ---- -title: Preparation ---- - -In this section you are going to populate your Weaviate instance with a movie dataset and corresponding vectors which are generated outside of Weaviate. - -### Weaviate instance - -Make sure to have your Weaviate instance set up. You should have [created an instance](../101_setup_weaviate/20_create_instance/index.mdx) and be able to connect to it. - - - -### Source data - -We are going to use a movie dataset sourced from [TMDB](https://www.themoviedb.org/). 
The dataset can be found in this [GitHub repository](https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024.json), and it contains bibliographic information on ~700 movies released between 1990 and 2024. - -
- See sample data - -| | backdrop_path | genre_ids | id | original_language | original_title | overview | popularity | poster_path | release_date | title | video | vote_average | vote_count | -|---:|:---------------------------------|:----------------|-----:|:--------------------|:----------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------:|:---------------------------------|:---------------|:----------------------------|:--------|---------------:|-------------:| -| 0 | /3Nn5BOM1EVw1IYrv6MsbOS6N1Ol.jpg | [14, 18, 10749] | 162 | en | Edward Scissorhands | A small suburban town receives a visit from a castaway unfinished science experiment named Edward. | 45.694 | /1RFIbuW9Z3eN9Oxw2KaQG5DfLmD.jpg | 1990-12-07 | Edward Scissorhands | False | 7.7 | 12305 | -| 1 | /sw7mordbZxgITU877yTpZCud90M.jpg | [18, 80] | 769 | en | GoodFellas | The true story of Henry Hill, a half-Irish, half-Sicilian Brooklyn kid who is adopted by neighbourhood gangsters at an early age and climbs the ranks of a Mafia family under the guidance of Jimmy Conway. | 57.228 | /aKuFiU82s5ISJpGZp7YkIr3kCUd.jpg | 1990-09-12 | GoodFellas | False | 8.5 | 12106 | -| 2 | /6uLhSLXzB1ooJ3522ydrBZ2Hh0W.jpg | [35, 10751] | 771 | en | Home Alone | Eight-year-old Kevin McCallister makes the most of the situation after his family unwittingly leaves him behind when they go on Christmas vacation. But when a pair of bungling burglars set their sights on Kevin's house, the plucky kid stands ready to defend his territory. 
By planting booby traps galore, adorably mischievous Kevin stands his ground as his frantic mother attempts to race home before Christmas Day. | 3.538 | /onTSipZ8R3bliBdKfPtsDuHTdlL.jpg | 1990-11-16 | Home Alone | False | 7.4 | 10599 | -| 3 | /vKp3NvqBkcjHkCHSGi6EbcP7g4J.jpg | [12, 35, 878] | 196 | en | Back to the Future Part III | The final installment of the Back to the Future trilogy finds Marty digging the trusty DeLorean out of a mineshaft and looking for Doc in the Wild West of 1885. But when their time machine breaks down, the travelers are stranded in a land of spurs. More problems arise when Doc falls for pretty schoolteacher Clara Clayton, and Marty tangles with Buford Tannen. | 28.896 | /crzoVQnMzIrRfHtQw0tLBirNfVg.jpg | 1990-05-25 | Back to the Future Part III | False | 7.5 | 9918 | -| 4 | /3tuWpnCTe14zZZPt6sI1W9ByOXx.jpg | [35, 10749] | 114 | en | Pretty Woman | When a millionaire wheeler-dealer enters a business contract with a Hollywood hooker Vivian Ward, he loses his heart in the bargain. | 97.953 | /hVHUfT801LQATGd26VPzhorIYza.jpg | 1990-03-23 | Pretty Woman | False | 7.5 | 7671 | - -
- -Next, you will create a corresponding object collection and import the data. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_custom_vectors/102_object_collections/20_create_collection.mdx b/docs/academy/py/starter_custom_vectors/102_object_collections/20_create_collection.mdx deleted file mode 100644 index 3cbf8bc8d..000000000 --- a/docs/academy/py/starter_custom_vectors/102_object_collections/20_create_collection.mdx +++ /dev/null @@ -1,85 +0,0 @@ ---- -title: Create a collection -description: Creating an Object Collection in Custom Vectors ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/102_collection.py'; - -Weaviate stores data in "collections". A collection is a set of objects that share the same data structure. In our movie database, we might have a collection of movies, a collection of actors, and a collection of reviews. - -Here we will create a collection of movies. - -## Code - -This example creates a collection for the movie data: - - - -Each collection definition must have a name. Then, you can define additional parameters like we've done in this example. - -## Explain the code - -### Properties - -Properties are the object attributes that you want to store in the collection. Each property has a name and a data type. - -In our movie database, we have properties like `title`, `release_date` and `genre_ids`, with data types like `TEXT` (string), `DATE` (date), or `INT` (integer). It's also possible to have arrays of integers, like we have with `genre_ids`. - -#### Auto-schema - -Weaviate can automatically [infer the schema](/weaviate/config-refs/collections.mdx#auto-schema) from the data. However, it's a good practice to define the properties explicitly, for better control and to avoid surprises. 
- -### Vectorizer configuration - -In this code example, we specify the vectorizer as `none`, as we will specify our own vectors. - - - -### Generative configuration - -If you wish to use your collection with a generative model (e.g. a large language model), you must specify the generative module. - -In this code example, we specify the `cohere` module (`generative-cohere` is the full name) with default options. - - - -import MutableGenerativeConfig from '/_includes/mutable-generative-config.md'; - - - -### Python classes - -The code example makes use of classes such as `Property`, `DataType` and `Configure`. They are defined in the `weaviate.classes.config` submodule and are used to define the collection. - -For convenience, we import the submodule as `wc` and use classes from it. - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_custom_vectors/102_object_collections/25_generate_vectors.mdx b/docs/academy/py/starter_custom_vectors/102_object_collections/25_generate_vectors.mdx deleted file mode 100644 index 10fb112ef..000000000 --- a/docs/academy/py/starter_custom_vectors/102_object_collections/25_generate_vectors.mdx +++ /dev/null @@ -1,70 +0,0 @@ ---- -title: (Bonus) Generate vectors ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/102_collection.py'; - -Since we are using custom vectors, we need to generate them ourselves. - -This step is optional, as the next section shows you how to download and use the pre-generated vectors. But if you are interested in how to generate vectors, read on. - -### Code - -This example creates embeddings for the movie dataset: - - - -This will generate a vector for each movie in the dataset, which we can use when adding the movies to Weaviate. 
- -## Explain the code - -### Model - -We use the `embed-multilingual-v3.0` Cohere model to generate the vector embeddings. You could also use the `transformers` library, if you would like to perform the generation locally. - -### Source text - -We combine the movie title and overview to create a source string for the model. This is the text that the model will "translate" into a vector. - - - -### Get embeddings in batches - -We use a buffer to store the concatenated strings, and then get the embeddings in batches. This is a good practice to limit the number of requests to the model, and to avoid timeouts. - - - -### Export the embeddings - -The embeddings are then saved to a file so that we can use when adding the movies to Weaviate. - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_custom_vectors/102_object_collections/30_import_data.mdx b/docs/academy/py/starter_custom_vectors/102_object_collections/30_import_data.mdx deleted file mode 100644 index 6112ace47..000000000 --- a/docs/academy/py/starter_custom_vectors/102_object_collections/30_import_data.mdx +++ /dev/null @@ -1,94 +0,0 @@ ---- -title: Import data -description: Importing Data to Custom Vectors Object Collection ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/102_collection.py'; - -## Code - -This example imports the movie data into our collection. - - - -The code: -- Loads the source data & gets the collection -- Enters a context manager with a batcher (`batch`) object -- Loops through the data and adds objects and corresponding vectors to the batcher -- Prints out any import errors - -## Explain the code - -### Preparation - -We use the requests library to load the data from the source, in this case a JSON file. 
The data is then converted to a Pandas DataFrame for easier manipulation. - -Then, we create a collection object (with `client.collections.get`) so we can interact with the collection. - -### Batch context manager - -The `batch` object is a context manager that allows you to add objects to the batcher. This is useful when you have a large amount of data to import, as it abstracts away the complexity of managing the batch size and when to send the batch. - - - -This example uses the `.fixed_size()` method to create a batcher which sets the number of objects per batch. There are also other batcher types, like `.rate_limit()` for specifying the number of objects per minute and `.dynamic()` to create a dynamic batcher, which automatically determines and updates the batch size during the import process. - -### Add data to the batcher - -#### Convert data types - -The data is converted from a string to the correct data types for Weaviate. For example, the `release_date` is converted to a datetime object, and the `genre_ids` are converted to a list of integers. - - - -#### Add objects to the batcher - -Then we loop through the data and add each object to the batcher. The `batch.add_object` method is used to add the object to the batcher, and the batcher will send the batch according to the specified batcher type. - -Note here that we provide the vector data as well. - - - -### Error handling - -Because a batch includes multiple objects, it's possible that some objects will fail to import. The batcher saves these errors. - -You can print out the errors to see what went wrong, and then decide how to handle them, such as by raising an exception. In this example, we simply print out the errors. - - - -Note that the list of errors is cleared when a new context manager is entered, so you must handle the errors before initializing a new batcher. 
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_custom_vectors/102_object_collections/index.mdx b/docs/academy/py/starter_custom_vectors/102_object_collections/index.mdx deleted file mode 100644 index b3c5e4965..000000000 --- a/docs/academy/py/starter_custom_vectors/102_object_collections/index.mdx +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: Populate the database -description: Manage custom object collections seamlessly in Weaviate. ---- - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_custom_vectors/103_object_searches/10_vector.mdx b/docs/academy/py/starter_custom_vectors/103_object_searches/10_vector.mdx deleted file mode 100644 index d97ecf4cd..000000000 --- a/docs/academy/py/starter_custom_vectors/103_object_searches/10_vector.mdx +++ /dev/null @@ -1,76 +0,0 @@ ---- -title: Vector search ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/103_10_vector.py'; - -In this scenario, we've added data objects and our own vectors. Accordingly, any similarity searches will also require a vector input. This can be done with a `near vector` query. - -### Code - -This example finds entries in "Movie" based on their similarity to the input vector and prints out the title and release year of the top 5 matches. - - - -## Explain the code - -The results are based on similarity of the vector embeddings between the query and the database object text. In this case, the embeddings are input manually in the query. - -The `limit` parameter here sets the maximum number of results to return. 
- -The `return_metadata` parameter takes an instance of the `MetadataQuery` class to set metadata to return in the search results. The current query returns the vector distance to the query. - -
- Example results - -```text -In Time 2011 -Distance to query: 0.179 - -Gattaca 1997 -Distance to query: 0.180 - -I, Robot 2004 -Distance to query: 0.182 - -Mad Max: Fury Road 2015 -Distance to query: 0.190 - -The Maze Runner 2014 -Distance to query: 0.193 -``` - -
- -### Response object - -The returned object is an instance of a custom class. Its `objects` attribute is a list of search results, each object being an instance of another custom class. - -Each returned object will: -- Include all properties and its UUID by default except those with blob data types. -- Not include any other information (e.g. references, metadata, vectors.) by default. - -### Where did the query vector come from? - -The query vector in this example is obtained similarly to how it was in the [data ingestion](../102_object_collections/25_generate_vectors.mdx). The only difference is that the vector is not stored in the database, but is used directly in the query. - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_custom_vectors/103_object_searches/20_keyword_hybrid.mdx b/docs/academy/py/starter_custom_vectors/103_object_searches/20_keyword_hybrid.mdx deleted file mode 100644 index 1cc6b05ed..000000000 --- a/docs/academy/py/starter_custom_vectors/103_object_searches/20_keyword_hybrid.mdx +++ /dev/null @@ -1,107 +0,0 @@ ---- -title: Keyword & Hybrid search -description: Hybrid Keyword Searches in Object Collections ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/103_20_searches.py'; - -You can also perform keyword (BM25) searches to find items based on their keyword similarity, or hybrid searches that combine BM25 and semantic/vector searches. - -## Keyword search - -### Code - -This example finds entries in "Movie" with the highest keyword search scores for the term "history", and prints out the title and release year of the top 5 matches. - - - -### Explain the code - -The results are based on a keyword search score using what's called the [BM25f](https://en.wikipedia.org/wiki/Okapi_BM25) algorithm. 
- -The `limit` parameter here sets the maximum number of results to return. - -The `return_metadata` parameter takes an instance of the `MetadataQuery` class to set metadata to return in the search results. The current query returns the `score`, which is the BM25 score of the result. - -
- Example results - -```text -American History X 1998 -BM25 score: 2.707 - -A Beautiful Mind 2001 -BM25 score: 1.896 - -Legends of the Fall 1994 -BM25 score: 1.663 - -Hacksaw Ridge 2016 -BM25 score: 1.554 - -Night at the Museum 2006 -BM25 score: 1.529 -``` - -
- - -## Hybrid search - -### Code - -This example finds entries in "Movie" with the highest hybrid search scores for the term "history", and prints out the title and release year of the top 5 matches. - - - -### Explain the code - -The results are based on a hybrid search score. A hybrid search blends results of BM25 and semantic/vector searches. - -As we are using custom vectors, we provide the vector manually to the hybrid query using the `vector` parameter. - -The `limit` parameter here sets the maximum number of results to return. - -The `return_metadata` parameter takes an instance of the `MetadataQuery` class to set metadata to return in the search results. The current query returns the `score`, which is the hybrid score of the result. - -
- Example results - -```text -Night at the Museum 2006 -Hybrid score: 0.016 - -The Butterfly Effect 2004 -Hybrid score: 0.014 - -Legends of the Fall 1994 -Hybrid score: 0.014 - -Hidden Figures 2016 -Hybrid score: 0.012 - -A Beautiful Mind 2001 -Hybrid score: 0.012 -``` - -
- - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_custom_vectors/103_object_searches/30_filters.mdx b/docs/academy/py/starter_custom_vectors/103_object_searches/30_filters.mdx deleted file mode 100644 index 2a0b69067..000000000 --- a/docs/academy/py/starter_custom_vectors/103_object_searches/30_filters.mdx +++ /dev/null @@ -1,55 +0,0 @@ ---- -title: Filters -description: Filters for Object Searches ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/103_20_searches.py'; - -Filters can be used to precisely refine search results. You can filter by properties as well as metadata, and you can combine multiple filters with `and` or `or` conditions to further narrow down the results. - -### Code - -This example finds entries in "Movie" based on their similarity to the query vector, only from those released after 2010. It prints out the title and release year of the top 5 matches. - - - -## Explain the code - -This query is identical to [that shown earlier](./10_vector.mdx) for vector search, but with the addition of a filter. The `filters` parameter here takes an instance of the `Filter` class to set the filter conditions. The current query filters the results to only include those with a release year after 2010. - -
- Example results - -```text -Oppenheimer 2023 -Distance to query: 0.754 - -Everything Everywhere All at Once 2022 -Distance to query: 0.778 - -Meg 2: The Trench 2023 -Distance to query: 0.779 - -Eternals 2021 -Distance to query: 0.787 - -John Wick: Chapter 4 2023 -Distance to query: 0.790 -``` - -
- - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_custom_vectors/103_object_searches/index.mdx b/docs/academy/py/starter_custom_vectors/103_object_searches/index.mdx deleted file mode 100644 index f7b7e2945..000000000 --- a/docs/academy/py/starter_custom_vectors/103_object_searches/index.mdx +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: Perform searches -description: Optimize object-based searches with custom vectors in Weaviate. ---- - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_custom_vectors/104_object_rag/10_setup.mdx b/docs/academy/py/starter_custom_vectors/104_object_rag/10_setup.mdx deleted file mode 100644 index d4c7c22a3..000000000 --- a/docs/academy/py/starter_custom_vectors/104_object_rag/10_setup.mdx +++ /dev/null @@ -1,40 +0,0 @@ ---- -title: "RAG: Overview" -description: Setting up Object RAG ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/102_collection.py'; - -### Motivation - -Retrieval augmented generation (RAG) is a way to combine the best of both worlds: the retrieval capabilities of semantic search and the generation capabilities of AI models such as large language models. This allows you to retrieve objects from a Weaviate instance and then generate outputs based on the retrieved objects. - -### Setup - -When we created a collection, we specified the `generative_module` parameter as shown here. - - - -This selects a generative module that will be used to generate outputs based on the retrieved objects. In this case, we're using the `cohere` module, and the `Command-R` family of large language models. 
- -As we did before with the vectorizer module, you will require an API key from the provider of the generative module. In this case, you will need an API key from Cohere. - -### RAG queries - -RAG queries are also called 'generative' queries in Weaviate. You can access these functions through the `generate` submodule of the collection object. - -Each generative query works in addition to the regular search query, and will perform a RAG query on each retrieved object. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_custom_vectors/104_object_rag/20_single_prompt.mdx b/docs/academy/py/starter_custom_vectors/104_object_rag/20_single_prompt.mdx deleted file mode 100644 index 769d8b475..000000000 --- a/docs/academy/py/starter_custom_vectors/104_object_rag/20_single_prompt.mdx +++ /dev/null @@ -1,57 +0,0 @@ ---- -title: "'Single prompt' generation" -description: Single Prompt in Object RAG ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/104_rag.py'; - -A 'single prompt' generation will perform RAG queries on each retrieved object. This is useful when you want to transform each object separately, with the same prompt. - -### Code - -This example finds entries in "Movie" based on their similarity to the input vector. Then, instructs the large language model to translate the title of each movie into French. - -Each of the results are then printed out to the console. - - - -## Explain the code - -You must pass on one or more properties to the `single_prompt` parameter through braces, as we've done here with `"... {title} ..."`. This will instruct Weaviate to pass on the `title` property from each retrieved object to the large language model. - -
- Example results - -```text -In Time -À temps -Gattaca -Gattaca -I, Robot -Je, Robot -Mad Max: Fury Road -Mad Max: Fury Road -The Maze Runner -Le Labyrinthe -``` - -
- -### Response object - -Each response object is similar to that from a regular search query, with an additional `generated` attribute. This attribute will contain the generated output for each object. - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_custom_vectors/104_object_rag/30_grouped_task.mdx b/docs/academy/py/starter_custom_vectors/104_object_rag/30_grouped_task.mdx deleted file mode 100644 index 7a7a04d0c..000000000 --- a/docs/academy/py/starter_custom_vectors/104_object_rag/30_grouped_task.mdx +++ /dev/null @@ -1,60 +0,0 @@ ---- -title: "'Grouped task' generation" -description: Grouped Task Setup for RAG ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/104_rag.py'; - -A 'grouped task' generation will perform RAG queries on the set of retrieved objects. This is useful when you want to transform the set of objects as a whole, with one prompt. - -### Code - -This example finds entries in "Movie" based on their similarity to the input vector. Then, instructs the large language model to find commonalities between them. - -The generated text, and each of the results are then printed out to the console. - - - -## Explain the code - -For `grouped_task` queries, you simply pass on the prompt to the `grouped_task` parameter. This will instruct Weaviate to pass on the: -- text properties from all retrieved objects, and -- the prompt - -to the large language model. - -
- Example results - -```text -In Time -Gattaca -I, Robot -Mad Max: Fury Road -The Maze Runner -These movies all take place in a futuristic or dystopian society where the characters must navigate complex systems and face challenges related to technology, society, and survival. They all explore themes of control, power, and the consequences of scientific advancements on humanity. -``` - -
- -### Optional parameters - -You can also pass on a list of properties to be used, as the `grouped_properties` parameter. This can be useful to reduce the amount of data passed on to the large language model and omit irrelevant properties. - -### Response object - -A RAG query with the `grouped_task` parameter will return a response with an additional `generated` attribute. This attribute will contain the generated output for the set of objects. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_custom_vectors/104_object_rag/index.mdx b/docs/academy/py/starter_custom_vectors/104_object_rag/index.mdx deleted file mode 100644 index 4ecedcf1d..000000000 --- a/docs/academy/py/starter_custom_vectors/104_object_rag/index.mdx +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: LLMs and Weaviate (RAG) -description: Learn to handle custom vectors in RAG workflows within Weaviate. ---- - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_custom_vectors/900_next_steps.mdx b/docs/academy/py/starter_custom_vectors/900_next_steps.mdx deleted file mode 100644 index aa424d650..000000000 --- a/docs/academy/py/starter_custom_vectors/900_next_steps.mdx +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: Next steps ---- - -import IntroNextSteps from '../_snippets/intro_next_steps.mdx'; - - diff --git a/docs/academy/py/starter_custom_vectors/_snippets/101_connect.py b/docs/academy/py/starter_custom_vectors/_snippets/101_connect.py deleted file mode 100644 index 4db54d000..000000000 --- a/docs/academy/py/starter_custom_vectors/_snippets/101_connect.py +++ /dev/null @@ -1,159 +0,0 @@ -# WCDInstantiation -import weaviate -from weaviate.classes.init import Auth -import os - -client = weaviate.connect_to_weaviate_cloud( - 
cluster_url=os.getenv("WEAVIATE_URL"), # Replace with your WCD URL - auth_credentials=Auth.api_key( - os.getenv("WEAVIATE_API_KEY") - ), # Replace with your WCD key -) -# END WCDInstantiation - -client.close() - -# WCDAPIKeyInstantiation -import weaviate -from weaviate.classes.init import Auth -import os - -headers = { - "X-Cohere-Api-Key": os.getenv("COHERE_APIKEY") -} # Replace with your Cohere API key - -client = weaviate.connect_to_weaviate_cloud( - cluster_url=os.getenv("WEAVIATE_URL"), # Replace with your WCD URL - auth_credentials=Auth.api_key( - os.getenv("WEAVIATE_API_KEY") - ), # Replace with your WCD key - headers=headers, -) -# END WCDAPIKeyInstantiation - -client.close() - -# DockerInstantiation -import weaviate - -client = weaviate.connect_to_local() -# END DockerInstantiation - -# DockerAPIKeyInstantiation -import weaviate -import os - -headers = { - "X-Cohere-Api-Key": os.getenv("COHERE_APIKEY") -} # Replace with your Cohere API key - -client = weaviate.connect_to_local(headers=headers) -# END DockerAPIKeyInstantiation - - -# PollLiveness -assert client.is_live() # This will raise an exception if the client is not live -# END PollLiveness - - -# GetMeta -import json - -metainfo = client.get_meta() -print(json.dumps(metainfo, indent=2)) # Print the meta information in a readable format -# END GetMeta - - -""" -# OutputGetMeta -{ - "hostname": "http://[::]:8080", - "modules": { - "backup-gcs": { - "bucketName": "weaviate-wcs-prod-cust-europe-west2-workloads-backups", - "rootName": "8616b69e-f8d2-4547-ad92-70b9557591c0" - }, - "generative-aws": { - "documentationHref": "https://docs.aws.amazon.com/bedrock/latest/APIReference/welcome.html", - "name": "Generative Search - AWS" - }, - "generative-cohere": { - "documentationHref": "https://docs.cohere.com/reference/generate", - "name": "Generative Search - Cohere" - }, - "generative-openai": { - "documentationHref": "https://platform.openai.com/docs/api-reference/completions", - "name": "Generative Search 
- OpenAI" - }, - "generative-palm": { - "documentationHref": "https://cloud.google.com/vertex-ai/docs/generative-ai/chat/test-chat-prompts", - "name": "Generative Search - Google PaLM" - }, - "qna-openai": { - "documentationHref": "https://platform.openai.com/docs/api-reference/completions", - "name": "OpenAI Question & Answering Module" - }, - "ref2vec-centroid": {}, - "reranker-cohere": { - "documentationHref": "https://txt.cohere.com/rerank/", - "name": "Reranker - Cohere" - }, - "text2vec-aws": { - "documentationHref": "https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings", - "name": "AWS Module" - }, - "text2vec-cohere": { - "documentationHref": "https://docs.cohere.ai/embedding-wiki/", - "name": "Cohere Module" - }, - "text2vec-huggingface": { - "documentationHref": "https://huggingface.co/docs/api-inference/detailed_parameters#feature-extraction-task", - "name": "Hugging Face Module" - }, - "text2vec-jinaai": { - "documentationHref": "https://jina.ai/embeddings/", - "name": "JinaAI Module" - }, - "text2vec-openai": { - "documentationHref": "https://platform.openai.com/docs/guides/embeddings/what-are-embeddings", - "name": "OpenAI Module" - }, - "text2vec-palm": { - "documentationHref": "https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings", - "name": "Google PaLM Module" - } - }, - "version": "1.23.8" -} -# END OutputGetMeta -""" - - -client.close() - - -# TryFinallyCloseDemo -import weaviate -from weaviate.classes.init import Auth -import os - -# END TryFinallyCloseDemo -client = weaviate.connect_to_weaviate_cloud( - cluster_url=os.getenv("WEAVIATE_URL"), # Replace with your WCD URL - auth_credentials=Auth.api_key( - os.getenv("WEAVIATE_API_KEY") - ), # Replace with your WCD key -) -# TryFinallyCloseDemo -# Instantiate your client (not shown). e.g.: -# client = weaviate.connect_to_weaviate_cloud(...) or -# client = weaviate.connect_to_local(...) 
- -try: - # Work with the client here - e.g.: - assert client.is_live() - pass - -finally: # This will always be executed, even if an exception is raised - client.close() # Close the connection & release resources -# END TryFinallyCloseDemo diff --git a/docs/academy/py/starter_custom_vectors/_snippets/102_collection.py b/docs/academy/py/starter_custom_vectors/_snippets/102_collection.py deleted file mode 100644 index 76b4f9b4e..000000000 --- a/docs/academy/py/starter_custom_vectors/_snippets/102_collection.py +++ /dev/null @@ -1,197 +0,0 @@ -# CreateMovieCollection -import weaviate -import os - -# CreateMovieCollection # SubmoduleImport -import weaviate.classes.config as wc - -# CreateMovieCollection # END SubmoduleImport - -# END CreateMovieCollection -from weaviate.classes.init import Auth - -client = weaviate.connect_to_weaviate_cloud( - cluster_url=os.getenv("WEAVIATE_URL"), # Replace with your WCD URL - auth_credentials=Auth.api_key( - os.getenv("WEAVIATE_API_KEY") - ), # Replace with your WCD key -) - -# CreateMovieCollection -# Instantiate your client (not shown). 
e.g.: -# client = weaviate.connect_to_weaviate_cloud(..., headers=headers) or -# client = weaviate.connect_to_local(..., headers=headers) - -# END CreateMovieCollection - -# Actual instantiation - -client.collections.delete("MovieCustomVector") - -# CreateMovieCollection -client.collections.create( - name="MovieCustomVector", - properties=[ - wc.Property(name="title", data_type=wc.DataType.TEXT), - wc.Property(name="overview", data_type=wc.DataType.TEXT), - wc.Property(name="vote_average", data_type=wc.DataType.NUMBER), - wc.Property(name="genre_ids", data_type=wc.DataType.INT_ARRAY), - wc.Property(name="release_date", data_type=wc.DataType.DATE), - wc.Property(name="tmdb_id", data_type=wc.DataType.INT), - ], - # Define the vectorizer module (none, as we will add our own vectors) - vector_config=wc.Configure.Vectors.self_provided(), - # Define the generative module - generative_config=wc.Configure.Generative.cohere() - # END generativeDefinition # CreateMovieCollection -) - -client.close() -# END CreateMovieCollection - - -# See https://docs.cohere.com/reference/embed for further explanations -# ManuallyGenerateEmbeddings -import requests -import pandas as pd -import os -from typing import List -import cohere -from cohere import Client as CohereClient - -co_token = os.getenv("COHERE_APIKEY") -co = cohere.Client(co_token) - - -# Define a function to call the endpoint and obtain embeddings -def vectorize(cohere_client: CohereClient, texts: List[str]) -> List[List[float]]: - - response = cohere_client.embed( - texts=texts, model="embed-multilingual-v3.0", input_type="search_document" - ) - - return response.embeddings - - -# Get the source data -data_url = "https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024.json" -resp = requests.get(data_url) -df = pd.DataFrame(resp.json()) - -# Loop through the dataset to generate vectors in batches -emb_dfs = list() -src_texts = list() -for i, row in enumerate(df.itertuples(index=False)): - 
# Concatenate text to create a source string - src_text = "Title" + row.title + "; Overview: " + row.overview - # Add to the buffer - src_texts.append(src_text) - if (len(src_texts) == 50) or (i + 1 == len(df)): # Get embeddings in batches of 50 - # Get a batch of embeddings - output = vectorize(co, src_texts) - index = list(range(i - len(src_texts) + 1, i + 1)) - emb_df = pd.DataFrame(output, index=index) - # Add the batch of embeddings to a list - emb_dfs.append(emb_df) - # Reset the buffer - src_texts = list() - - -emb_df = pd.concat(emb_dfs) # Create a combined dataset - -# Save the data as a CSV -os.makedirs("scratch", exist_ok=True) # Create a folder if it doesn't exist -emb_df.to_csv( - f"scratch/movies_data_1990_2024_embeddings.csv", - index=False, -) -# END ManuallyGenerateEmbeddings - -assert len(emb_df) == len(df) -assert type(output[0]) == list - - -# BatchImportData -import weaviate -import pandas as pd -import requests -from datetime import datetime, timezone -import json -from weaviate.util import generate_uuid5 -from tqdm import tqdm -import os - -# END BatchImportData -headers = {"X-Cohere-Api-Key": os.getenv("COHERE_APIKEY")} - -from weaviate.classes.init import Auth - -client = weaviate.connect_to_weaviate_cloud( - cluster_url=os.getenv("WEAVIATE_URL"), # Replace with your WCD URL - auth_credentials=Auth.api_key( - os.getenv("WEAVIATE_API_KEY") - ), # Replace with your WCD key - headers=headers, -) - -# BatchImportData -# Instantiate your client (not shown). e.g.: -# client = weaviate.connect_to_weaviate_cloud(...) or -# client = weaviate.connect_to_local(...) 
- -# END BatchImportData - -# BatchImportData -data_url = "https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024.json" -data_resp = requests.get(data_url) -df = pd.DataFrame(data_resp.json()) - -# Load the embeddings (embeddings from the previous step) -embs_path = "https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024_embeddings.csv" -# Or load embeddings from a local file (if you generated them earlier) -# embs_path = "scratch/movies_data_1990_2024_embeddings.csv" - -emb_df = pd.read_csv(embs_path) - -# Get the collection -movies = client.collections.use("MovieCustomVector") - -# Enter context manager -with movies.batch.fixed_size(batch_size=200) as batch: - # Loop through the data - for i, movie in enumerate(df.itertuples(index=False)): - # Convert data types - # Convert a JSON date to `datetime` and add time zone information - release_date = datetime.strptime(movie.release_date, "%Y-%m-%d").replace( - tzinfo=timezone.utc - ) - # Convert a JSON array to a list of integers - genre_ids = json.loads(movie.genre_ids) - - # Build the object payload - movie_obj = { - "title": movie.title, - "overview": movie.overview, - "vote_average": movie.vote_average, - "genre_ids": genre_ids, - "release_date": release_date, - "tmdb_id": movie.id, - } - - # Get the vector - vector = emb_df.iloc[i].to_list() - - # Add object (including vector) to batch queue - batch.add_object( - properties=movie_obj, - uuid=generate_uuid5(movie.id), - vector=vector # Add the custom vector - # references=reference_obj # You can add references here - ) - # Batcher automatically sends batches - -# Check for failed objects -if len(movies.batch.failed_objects) > 0: - print(f"Failed to import {len(movies.batch.failed_objects)} objects") - -client.close() diff --git a/docs/academy/py/starter_custom_vectors/_snippets/103_10_vector.py b/docs/academy/py/starter_custom_vectors/_snippets/103_10_vector.py deleted file mode 100644 
index 01c0626a6..000000000 --- a/docs/academy/py/starter_custom_vectors/_snippets/103_10_vector.py +++ /dev/null @@ -1,76 +0,0 @@ -# MetadataSemanticSearch -import weaviate -import weaviate.classes.query as wq -import os - -# END MetadataSemanticSearch - -# MetadataSemanticSearch - -# END MetadataSemanticSearch - -from weaviate.classes.init import Auth - -client = weaviate.connect_to_weaviate_cloud( - cluster_url=os.getenv("WEAVIATE_URL"), # Replace with your WCD URL - auth_credentials=Auth.api_key( - os.getenv("WEAVIATE_API_KEY") - ), # Replace with your WCD key -) - -# MetadataSemanticSearch -# Instantiate your client (not shown). e.g.: -# client = weaviate.connect_to_weaviate_cloud(...) or -# client = weaviate.connect_to_local(...) - -# END MetadataSemanticSearch - - -# GetQueryVector # MetadataSemanticSearch -# Define a function to call the endpoint and obtain embeddings -from typing import List -import os -import cohere -from cohere import Client as CohereClient - -co_token = os.getenv("COHERE_APIKEY") -co = cohere.Client(co_token) - - -# Define a function to call the endpoint and obtain embeddings -def vectorize(cohere_client: CohereClient, texts: List[str]) -> List[List[float]]: - - response = cohere_client.embed( - texts=texts, model="embed-multilingual-v3.0", input_type="search_document" - ) - - return response.embeddings - - -query_text = "dystopian future" -query_vector = vectorize(co, [query_text])[0] -# END GetQueryVector # END MetadataSemanticSearch - - -# MetadataSemanticSearch -# Get the collection -movies = client.collections.use("MovieCustomVector") - -# Perform query -response = movies.query.near_vector( - near_vector=query_vector, # A list of floating point numbers - limit=5, - return_metadata=wq.MetadataQuery(distance=True), -) - -# Inspect the response -for o in response.objects: - print( - o.properties["title"], o.properties["release_date"].year - ) # Print the title and release year (note the release date is a datetime object) - print( - 
f"Distance to query: {o.metadata.distance:.3f}\n" - ) # Print the distance of the object from the query - -client.close() -# END MetadataSemanticSearch diff --git a/docs/academy/py/starter_custom_vectors/_snippets/103_20_searches.py b/docs/academy/py/starter_custom_vectors/_snippets/103_20_searches.py deleted file mode 100644 index a1e18ab7f..000000000 --- a/docs/academy/py/starter_custom_vectors/_snippets/103_20_searches.py +++ /dev/null @@ -1,147 +0,0 @@ -# START-ANY -import weaviate -import weaviate.classes.query as wq -import os - -# END-ANY - -# FilteredSemanticSearch -from datetime import datetime - -# END FilteredSemanticSearch - -# START-ANY - -# END-ANY - -from weaviate.classes.init import Auth - -headers = {"X-Cohere-Api-Key": os.getenv("COHERE_APIKEY")} - -client = weaviate.connect_to_weaviate_cloud( - cluster_url=os.getenv("WEAVIATE_URL"), # Replace with your WCD URL - auth_credentials=Auth.api_key( - os.getenv("WEAVIATE_API_KEY") - ), # Replace with your WCD key - headers=headers, -) - -# START-ANY -# Instantiate your client (not shown). 
e.g.: -# headers = {"X-Cohere-Api-Key": os.getenv("COHERE_APIKEY")} # Replace with your Cohere API key -# client = weaviate.connect_to_weaviate_cloud(..., headers=headers) or -# client = weaviate.connect_to_local(..., headers=headers) - -# END-ANY - - -# START-ANY -# Define a function to call the endpoint and obtain embeddings -from typing import List -import os -import cohere -from cohere import Client as CohereClient - -co_token = os.getenv("COHERE_APIKEY") -co = cohere.Client(co_token) - - -# Define a function to call the endpoint and obtain embeddings -def vectorize(cohere_client: CohereClient, texts: List[str]) -> List[List[float]]: - - response = cohere_client.embed( - texts=texts, model="embed-multilingual-v3.0", input_type="search_document" - ) - - return response.embeddings - - -# END-ANY - - -query_text = "history" -query_vector = vectorize(co, [query_text])[0] - -# MetadataBM25Search -# Get the collection -movies = client.collections.use("MovieCustomVector") - -# Perform query -response = movies.query.bm25( - query="history", limit=5, return_metadata=wq.MetadataQuery(score=True) -) - -# Inspect the response -for o in response.objects: - print( - o.properties["title"], o.properties["release_date"].year - ) # Print the title and release year (note the release date is a datetime object) - print( - f"BM25 score: {o.metadata.score:.3f}\n" - ) # Print the BM25 score of the object from the query - -client.close() -# END MetadataBM25Search - - -print("\n\n") - -client.connect() - -# MetadataHybridSearch -# Get the collection -movies = client.collections.use("MovieCustomVector") - -# Perform query -response = movies.query.hybrid( - query="history", # For BM25 part of the hybrid search - vector=query_vector, # For vector part of the hybrid search - limit=5, - return_metadata=wq.MetadataQuery(score=True), -) - -# Inspect the response -for o in response.objects: - print( - o.properties["title"], o.properties["release_date"].year - ) # Print the title and release 
year (note the release date is a datetime object) - print( - f"Hybrid score: {o.metadata.score:.3f}\n" - ) # Print the hybrid search score of the object from the query - -client.close() -# END MetadataHybridSearch - - -print("\n\n") - -client.connect() - -query_text = "history" -query_vector = vectorize(co, [query_text])[0] - -# FilteredSemanticSearch -# Get the collection -movies = client.collections.use("MovieCustomVector") - -# Perform query -response = movies.query.near_vector( - near_vector=query_vector, - limit=5, - return_metadata=wq.MetadataQuery(distance=True), - # highlight-start - filters=wq.Filter.by_property("release_date").greater_than(datetime(2020, 1, 1)) - # highlight-end -) - -# Inspect the response -for o in response.objects: - print( - o.properties["title"], o.properties["release_date"].year - ) # Print the title and release year (note the release date is a datetime object) - print( - f"Distance to query: {o.metadata.distance:.3f}\n" - ) # Print the distance of the object from the query - -client.close() -# END FilteredSemanticSearch diff --git a/docs/academy/py/starter_custom_vectors/_snippets/104_rag.py b/docs/academy/py/starter_custom_vectors/_snippets/104_rag.py deleted file mode 100644 index 74cd1e824..000000000 --- a/docs/academy/py/starter_custom_vectors/_snippets/104_rag.py +++ /dev/null @@ -1,106 +0,0 @@ -# GetQueryVector -# Define a function to call the endpoint and obtain embeddings -from typing import List -import os -import cohere -from cohere import Client as CohereClient - -co_token = os.getenv("COHERE_APIKEY") -co = cohere.Client(co_token) - - -# Define a function to call the endpoint and obtain embeddings -def vectorize(cohere_client: CohereClient, texts: List[str]) -> List[List[float]]: - - response = cohere_client.embed( - texts=texts, model="embed-multilingual-v3.0", input_type="search_document" - ) - - return response.embeddings - - -# END GetQueryVector - - -# START-ANY -import os -import weaviate -import os - -# END-ANY - 
-from weaviate.classes.init import Auth - -headers = {"X-Cohere-Api-Key": os.getenv("COHERE_APIKEY")} - -client = weaviate.connect_to_weaviate_cloud( - cluster_url=os.getenv("WEAVIATE_URL"), # Replace with your WCD URL - auth_credentials=Auth.api_key( - os.getenv("WEAVIATE_API_KEY") - ), # Replace with your WCD key - headers=headers, -) - -# START-ANY -# Instantiate your client (not shown). e.g.: -# headers = {"X-Cohere-Api-Key": os.getenv("COHERE_APIKEY")} # Replace with your Cohere API key -# client = weaviate.connect_to_weaviate_cloud(..., headers=headers) or -# client = weaviate.connect_to_local(..., headers=headers) - -# END-ANY - -query_text = "dystopian future" -query_vector = vectorize(co, [query_text])[0] - -# SinglePromptGeneration -# Get the collection -movies = client.collections.use("MovieCustomVector") - -# Perform query -response = movies.generate.near_vector( - near_vector=query_vector, - limit=5, - # highlight-start - single_prompt="Translate this into French: {title}" - # highlight-end -) - -# Inspect the response -for o in response.objects: - # highlight-start - print(o.properties["title"]) # Print the title - # highlight-end - print(o.generated) # Print the generated text (the title, in French) - -client.close() -# END SinglePromptGeneration - - -print("\n\n") - -client.connect() - - -# GroupedTaskGeneration -# Get the collection -movies = client.collections.use("MovieCustomVector") - -# Perform query -response = movies.generate.near_vector( - near_vector=query_vector, - limit=5, - # highlight-start - grouped_task="What do these movies have in common?", - # grouped_properties=["title", "overview"] # Optional parameter; for reducing prompt length - # highlight-end -) - -# Inspect the response -for o in response.objects: - print(o.properties["title"]) # Print the title -# highlight-start -print(response.generative.text) # Print the generated text (the commonalities between them) -# highlight-end - -client.close() -# END GroupedTaskGeneration diff 
--git a/docs/academy/py/starter_custom_vectors/index.md b/docs/academy/py/starter_custom_vectors/index.md deleted file mode 100644 index 7a0aea1b9..000000000 --- a/docs/academy/py/starter_custom_vectors/index.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -title: "101V Work with: Your own vectors" -description: Start using custom vectors in Weaviate for tailored data search in Python. -sidebar_position: 101 ---- - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; -import CourseUnits from '/src/components/Academy/courseUnits.jsx'; -import { courseData } from '/src/components/Academy/courseData.js' - -## Course overview - -In this project-based course, you will learn how to work with any data and your own vectors, using Weaviate and a pre-vectorized movie dataset. - -You will get hands-on experience on how to store and index objects and corresponding vectors. You will learn how to search through that data using vector, keyword and hybrid searches, as well as filters. You will also learn how to use Weaviate's retrieval augmented generation (RAG) capabilities to generate outputs based on the retrieved objects. - -## Learning objectives - - - -## Units - - diff --git a/docs/academy/py/starter_multimodal_data/101_setup_weaviate/10_client.mdx b/docs/academy/py/starter_multimodal_data/101_setup_weaviate/10_client.mdx deleted file mode 100644 index a8c8f264a..000000000 --- a/docs/academy/py/starter_multimodal_data/101_setup_weaviate/10_client.mdx +++ /dev/null @@ -1,36 +0,0 @@ ---- -title: Weaviate Python client -description: Client Setup for Multimodal Data in Weaviate ---- - -## Installation - -The latest Weaviate Python client library can be installed using pip. The client library is tested on Python 3.8 and later. Install it using the following command: - -```bash -pip install -U weaviate-client -``` - -The latest major version is `v4` (e.g. `4.x.x`). 
You can check the version like so: - -```bash -pip show weaviate-client -``` - -## Basic usage - -From Python, you can load the Weaviate client library like so: - -```python -import weaviate -``` - -The client provides sets of helper classes (e.g. under `weaviate.classes`) and functions to make it easier to interact with Weaviate. - -Next, we'll show you how create a Weaviate instance and connect to it. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_multimodal_data/101_setup_weaviate/20_create_docker.mdx b/docs/academy/py/starter_multimodal_data/101_setup_weaviate/20_create_docker.mdx deleted file mode 100644 index 966081a27..000000000 --- a/docs/academy/py/starter_multimodal_data/101_setup_weaviate/20_create_docker.mdx +++ /dev/null @@ -1,98 +0,0 @@ ---- -title: Create a local Docker instance ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/101_connect.py'; - -:::note Can I use a cloud instance? -Generating multimodal vectors is currently only possible with local models, and as a result this course uses a local, Docker instance of Weaviate. If you are generating vectors outside of Weaviate, you can use a cloud instance. See the [Work with: your own vectors](../../starter_custom_vectors/index.md) course for more information. -::: - -Here, you will create a Weaviate instance and a multi-modal vectorizer container using Docker. - -### Download and run the docker-compose file - -Install Docker on your machine. We recommend following the [official Docker installation guide](https://docs.docker.com/get-docker/). - -Create a new directory and navigate to it in your terminal. 
Then, create a new file called `docker-compose.yml` and add the following content: - -```yaml ---- -services: - weaviate: - command: - - --host - - 0.0.0.0 - - --port - - '8080' - - --scheme - - http - image: cr.weaviate.io/semitechnologies/weaviate:||site.weaviate_version|| - ports: - - 8080:8080 - - 50051:50051 - volumes: - - weaviate_data:/var/lib/weaviate - restart: on-failure:0 - environment: - CLIP_INFERENCE_API: 'http://multi2vec-clip:8080' - QUERY_DEFAULTS_LIMIT: 25 - AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' - PERSISTENCE_DATA_PATH: '/var/lib/weaviate' - ENABLE_MODULES: 'multi2vec-clip' - ENABLE_API_BASED_MODULES: 'true' - CLUSTER_HOSTNAME: 'node1' - multi2vec-clip: - image: cr.weaviate.io/semitechnologies/multi2vec-clip:sentence-transformers-clip-ViT-B-32-multilingual-v1 - environment: - ENABLE_CUDA: '0' -volumes: - weaviate_data: -... - -``` - -### Create a Weaviate instance - -Run the following command to start Weaviate: - -```bash -docker compose up -``` - -### Your Weaviate instance details - -Once the instance is created, you can access it at `http://localhost:8080`. - -### Connect to your Weaviate instance - -To connect to the Weaviate instance, use the `connect_to_local` function. - - - -#### Provide inference API keys - -Some Weaviate modules can use inference APIs for vectorizing data or large language model integration. You can provide the API keys for these services to Weaviate at instantiation. 
- -This course uses OpenAI (for retrieval augmented generation), so you can provide the OpenAI API key to Weaviate through `headers={"X-OpenAI-Api-Key": }` as shown below: - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_multimodal_data/101_setup_weaviate/30_communicate.mdx b/docs/academy/py/starter_multimodal_data/101_setup_weaviate/30_communicate.mdx deleted file mode 100644 index 0d2afa4bb..000000000 --- a/docs/academy/py/starter_multimodal_data/101_setup_weaviate/30_communicate.mdx +++ /dev/null @@ -1,67 +0,0 @@ ---- -title: Communicate with Weaviate -description: Communication Setup for Multimodal Data ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/101_connect.py'; - -Here, we'll perform basic operations to communicate with Weaviate using the Python client library. - -### Check Weaviate status - -You can check whether the Weaviate instance is up using the `is_live` function. - - - -### Retrieve server meta information - -You can retrieve meta information about the Weaviate instance using the `meta` function. - - - -This will print the server meta information to the console. The output will look similar to the following: - -
- Example get_meta output - -Note that this output is a little longer due to the additional details from the CLIP models. - - -
- -### Close the connection - -After you have finished using the Weaviate client, you should close the connection. This frees up resources and ensures that the connection is properly closed. - -We suggest using a `try`-`finally` block as a best practice. For brevity, we will not include the `try`-`finally` blocks in the remaining code snippets. - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_multimodal_data/101_setup_weaviate/index.mdx b/docs/academy/py/starter_multimodal_data/101_setup_weaviate/index.mdx deleted file mode 100644 index e578418c5..000000000 --- a/docs/academy/py/starter_multimodal_data/101_setup_weaviate/index.mdx +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: Set up Weaviate -description: Setup guide for Weaviate with multimodal data handling capabilities. ---- - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_multimodal_data/102_mm_collections/10_preparation.mdx b/docs/academy/py/starter_multimodal_data/102_mm_collections/10_preparation.mdx deleted file mode 100644 index 18cdd26c3..000000000 --- a/docs/academy/py/starter_multimodal_data/102_mm_collections/10_preparation.mdx +++ /dev/null @@ -1,36 +0,0 @@ ---- -title: Preparation ---- - -In this section you are going to populate your Weaviate instance with a movie dataset, using the multi-modal, CLIP models to embed the text and image data. - -### Weaviate instance - -Make sure to have your Weaviate instance set up. You should have [created an instance](../101_setup_weaviate/20_create_docker.mdx) and be able to connect to it. - -### Source data - -We are going to use a movie dataset sourced from [TMDB](https://www.themoviedb.org/). 
The dataset can be found in this [GitHub repository](https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024.json), and it contains bibliographic information on ~700 movies released between 1990 and 2024. - -As a multimodal project, we'll also use [corresponding posters for each movie](https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024_posters.zip), which are available in the same repository. - -
- See sample text data - -| | backdrop_path | genre_ids | id | original_language | original_title | overview | popularity | poster_path | release_date | title | video | vote_average | vote_count | -|---:|:---------------------------------|:----------------|-----:|:--------------------|:----------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------:|:---------------------------------|:---------------|:----------------------------|:--------|---------------:|-------------:| -| 0 | /3Nn5BOM1EVw1IYrv6MsbOS6N1Ol.jpg | [14, 18, 10749] | 162 | en | Edward Scissorhands | A small suburban town receives a visit from a castaway unfinished science experiment named Edward. | 45.694 | /1RFIbuW9Z3eN9Oxw2KaQG5DfLmD.jpg | 1990-12-07 | Edward Scissorhands | False | 7.7 | 12305 | -| 1 | /sw7mordbZxgITU877yTpZCud90M.jpg | [18, 80] | 769 | en | GoodFellas | The true story of Henry Hill, a half-Irish, half-Sicilian Brooklyn kid who is adopted by neighbourhood gangsters at an early age and climbs the ranks of a Mafia family under the guidance of Jimmy Conway. | 57.228 | /aKuFiU82s5ISJpGZp7YkIr3kCUd.jpg | 1990-09-12 | GoodFellas | False | 8.5 | 12106 | -| 2 | /6uLhSLXzB1ooJ3522ydrBZ2Hh0W.jpg | [35, 10751] | 771 | en | Home Alone | Eight-year-old Kevin McCallister makes the most of the situation after his family unwittingly leaves him behind when they go on Christmas vacation. But when a pair of bungling burglars set their sights on Kevin's house, the plucky kid stands ready to defend his territory. 
By planting booby traps galore, adorably mischievous Kevin stands his ground as his frantic mother attempts to race home before Christmas Day. | 3.538 | /onTSipZ8R3bliBdKfPtsDuHTdlL.jpg | 1990-11-16 | Home Alone | False | 7.4 | 10599 | -| 3 | /vKp3NvqBkcjHkCHSGi6EbcP7g4J.jpg | [12, 35, 878] | 196 | en | Back to the Future Part III | The final installment of the Back to the Future trilogy finds Marty digging the trusty DeLorean out of a mineshaft and looking for Doc in the Wild West of 1885. But when their time machine breaks down, the travelers are stranded in a land of spurs. More problems arise when Doc falls for pretty schoolteacher Clara Clayton, and Marty tangles with Buford Tannen. | 28.896 | /crzoVQnMzIrRfHtQw0tLBirNfVg.jpg | 1990-05-25 | Back to the Future Part III | False | 7.5 | 9918 | -| 4 | /3tuWpnCTe14zZZPt6sI1W9ByOXx.jpg | [35, 10749] | 114 | en | Pretty Woman | When a millionaire wheeler-dealer enters a business contract with a Hollywood hooker Vivian Ward, he loses his heart in the bargain. | 97.953 | /hVHUfT801LQATGd26VPzhorIYza.jpg | 1990-03-23 | Pretty Woman | False | 7.5 | 7671 | - -
- -Next, you will create a corresponding object collection and import the data. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_multimodal_data/102_mm_collections/20_create_collection.mdx b/docs/academy/py/starter_multimodal_data/102_mm_collections/20_create_collection.mdx deleted file mode 100644 index c83f65540..000000000 --- a/docs/academy/py/starter_multimodal_data/102_mm_collections/20_create_collection.mdx +++ /dev/null @@ -1,91 +0,0 @@ ---- -title: Create a collection -description: Creating Multimodal Data Collections ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/102_collection.py'; - -Weaviate stores data in "collections". A collection is a set of objects that share the same data structure. In our movie database, we might have a collection of movies, a collection of actors, and a collection of reviews. - -Here we will create a collection of movies. - -## Code - -This example creates a collection for the movie data: - - - -Each collection definition must have a name. Then, you can define additional parameters like we've done in this example. - -## Explain the code - -### Properties - -Properties are the object attributes that you want to store in the collection. Each property has a name and a data type. - -In our movie database, we have properties like `title`, `release_date` and `genre_ids`, with data types like `TEXT` (string), `DATE` (date), or `INT` (integer). It's also possible to have arrays of integers, like we have with `genre_ids`. - -As a multimodal object, we also have the `poster` property which is the image data, which is saved as a `BLOB` (binary large object) data type. 
- -#### Auto-schema - -Weaviate can automatically [infer the schema](/weaviate/config-refs/collections.mdx#auto-schema) from the data. However, it's a good practice to define the properties explicitly, for better control and to avoid surprises. - -### Vectorizer configuration - -If you do not specify the vector yourself, Weaviate will use a specified vectorizer to generate vector embeddings from your data. - -In this code example, we specify the `multi2vec-clip` module. This module uses the CLIP model to generate vector embeddings from the text and image data. - -You can specify any number of text and image properties to be used for vectorization, and weight them differently. The weights are used to determine the relative importance of each property in the vector embedding generation process. In this example, we vectorize the `poster` property (an image) with a 90% weight and the `title` property (a string) with a 10% weight. - - - -### Generative configuration - -If you wish to use your collection with a generative model (e.g. a large language model), you must specify the generative module. - -In this code example, we specify the `openai` module (`generative-openai` is the full name) with default options. - - - -import MutableGenerativeConfig from '/_includes/mutable-generative-config.md'; - - - -### Python classes - -The code example makes use of classes such as `Property`, `DataType` and `Configure`. They are defined in the `weaviate.classes.config` submodule and are used to define the collection. - -For convenience, we import the submodule as `wc` and use classes from it. 
- - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_multimodal_data/102_mm_collections/30_import_data.mdx b/docs/academy/py/starter_multimodal_data/102_mm_collections/30_import_data.mdx deleted file mode 100644 index 45f59ba63..000000000 --- a/docs/academy/py/starter_multimodal_data/102_mm_collections/30_import_data.mdx +++ /dev/null @@ -1,111 +0,0 @@ ---- -title: Import data -description: Importing Data into Multimodal Data Collections ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/102_collection.py'; - -## Code - -This example imports the movie data into our collection. - - - -The code: -- Loads the source text and image data -- Gets the collection -- Enters a context manager with a batcher (`batch`) object -- Loops through the data and: - - Finds corresponding image to the text - - Converts the image to base64 - - Adds objects to the batcher -- Prints out any import errors - -## Explain the code - -### Preparation - -We use the requests library to load the data from the source, in this case a JSON file containing text data and a Zip file containing posters. The text data is then converted to a Pandas DataFrame for easier manipulation and the images are extracted from the Zip file. - -Then, we create a collection object (with `client.collections.get`) so we can interact with the collection. - -### Batch context manager - -The `batch` object is a context manager that allows you to add objects to the batcher. This is useful when you have a large amount of data to import, as it abstracts away the complexity of managing the batch size and when to send the batch. - - - -This example uses the `.fixed_size()` method to create a batcher which sets the number of objects per batch. 
There are also other batcher types, like `.rate_limit()` for specifying the number of objects per minute and `.dynamic()` to create a dynamic batcher, which automatically determines and updates the batch size during the import process. - -### Add data to the batcher - -#### Convert data types - -The data is converted from a string to the correct data types for Weaviate. For example, the `release_date` is converted to a datetime object, and the `genre_ids` are converted to a list of integers. - - - -To save the image data as a `BLOB` (binary large object) data type, we convert the image to base64. - - - -#### Add objects to the batcher - -Then we loop through the data and add each object to the batcher. The `batch.add_object` method is used to add the object to the batcher, and the batcher will send the batch according to the specified batcher type. - - - -### Error handling - -Because a batch includes multiple objects, it's possible that some objects will fail to import. The batcher saves these errors. - -You can print out the errors to see what went wrong, and then decide how to handle them, such as by raising an exception. In this example, we simply print out the errors. - - - -Note that the list of errors is cleared when a new context manager is entered, so you must handle the errors before initializing a new batcher. - -## Where do the vectors come from? - -When the batcher sends the queue to Weaviate, the objects are added to the collection. In our case, the movie collection. - -Recall that the collection has a vectorizer module, and we do not specify vectors here. So Weaviate uses the specified vectorizer to generate vector embeddings from the data. 
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_multimodal_data/102_mm_collections/index.mdx b/docs/academy/py/starter_multimodal_data/102_mm_collections/index.mdx deleted file mode 100644 index 0bf6d4919..000000000 --- a/docs/academy/py/starter_multimodal_data/102_mm_collections/index.mdx +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: Populate the database -description: Handle multimodal collections efficiently with Weaviate. ---- - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_multimodal_data/103_mm_searches/10_multimodal.mdx b/docs/academy/py/starter_multimodal_data/103_mm_searches/10_multimodal.mdx deleted file mode 100644 index 09956df43..000000000 --- a/docs/academy/py/starter_multimodal_data/103_mm_searches/10_multimodal.mdx +++ /dev/null @@ -1,148 +0,0 @@ ---- -title: Multimodal search -description: Multimodal Search Methodology ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/103_searches.py'; - -With Weaviate, you can perform semantic searches to find similar items based on their meaning. This is done by comparing the vector embeddings of the items in the database. - -As we are using a multimodal model, we can search for objects based on their similarity to any of the supported modalities. Meaning that we can search for movies based on their similarity to a text or an image. 
- -## Image query - -### Code - -This example finds entries in "MovieMM" based on their similarity to [this image of the International Space Station](https://upload.wikimedia.org/wikipedia/commons/thumb/0/04/International_Space_Station_after_undocking_of_STS-132.jpg/440px-International_Space_Station_after_undocking_of_STS-132.jpg), and prints out the title and release year of the top 5 matches. - -
- Query image - -![International Space Station](https://upload.wikimedia.org/wikipedia/commons/thumb/0/04/International_Space_Station_after_undocking_of_STS-132.jpg/440px-International_Space_Station_after_undocking_of_STS-132.jpg) - -
- - - -### Explain the code - -The results are based on similarity of the vector embeddings between the query and the database object. In this case, the vectorizer module generates an embedding of the input image. - -The `limit` parameter here sets the maximum number of results to return. - -The `return_metadata` parameter takes an instance of the `MetadataQuery` class to set metadata to return in the search results. The current query returns the vector distance to the query. - -Note that the results are very similar to the tone of the query image, as the top results are all space-themed movies. - -
- Example results - -Posters for the top 5 matches: -Interstellar -Gravity -Arrival -Armageddon -Godzilla - -Weaviate output: - -```text -Interstellar 2014 157336 -Distance to query: 0.354 - -Gravity 2013 49047 -Distance to query: 0.384 - -Arrival 2016 329865 -Distance to query: 0.386 - -Armageddon 1998 95 -Distance to query: 0.400 - -Godzilla 1998 929 -Distance to query: 0.441 -``` - -
- -### Response object - -The returned object is an instance of a custom class. Its `objects` attribute is a list of search results, each object being an instance of another custom class. - -Each returned object will: -- Include all properties and its UUID by default except those with blob data types. - - Since the `poster` property is a blob, it is not included by default. - - To include the `poster` property, you must specify it and the other properties to fetch in the `return_properties` parameter. -- Not include any other information (e.g. references, metadata, vectors.) by default. - - -## Text search - -### Code - -This example finds entries in "MovieMM" based on their similarity to the query "red", and prints out the title and release year of the top 5 matches. - - - -### Explain the code - -The results are based on similarity of the vector embeddings between the query and the database object. In this case, the vectorizer module generates an embedding of the input text. - -The remaining parameters are the same as in the previous example. - -Note that the results actually include movies with red color themes in its poster. This is because the CLIP vectorizer encodes the color information of the image in the vectors. - -
- Example results - -Posters for the top 5 matches: -Deadpool 2 -Bloodshot -Deadpool -300 -The Hunt for Red October - -Weaviate output: - -```text -Deadpool 2 2018 383498 -Distance to query: 0.670 - -Bloodshot 2020 338762 -Distance to query: 0.677 - -Deadpool 2016 293660 -Distance to query: 0.678 - -300 2007 1271 -Distance to query: 0.682 - -The Hunt for Red October 1990 1669 -Distance to query: 0.683 -``` - -
- -### Response object - -The returned object is in the same format as in the previous example. - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_multimodal_data/103_mm_searches/20_keyword_hybrid.mdx b/docs/academy/py/starter_multimodal_data/103_mm_searches/20_keyword_hybrid.mdx deleted file mode 100644 index 9ce7085d6..000000000 --- a/docs/academy/py/starter_multimodal_data/103_mm_searches/20_keyword_hybrid.mdx +++ /dev/null @@ -1,105 +0,0 @@ ---- -title: Keyword & Hybrid search -description: Hybrid Keyword Searches in Multimodal Collections ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/103_searches.py'; - -You can also perform keyword (BM25) searches to find items based on their keyword similarity, or hybrid searches that combine BM25 and semantic/vector searches. - -## Keyword search - -### Code - -This example finds entries in "MovieMM" with the highest keyword search scores for the term "history", and prints out the title and release year of the top 5 matches. - - - -### Explain the code - -The results are based on a keyword search score using what's called the [BM25f](https://en.wikipedia.org/wiki/Okapi_BM25) algorithm. - -The `limit` parameter here sets the maximum number of results to return. - -The `return_metadata` parameter takes an instance of the `MetadataQuery` class to set metadata to return in the search results. The current query returns the `score`, which is the BM25 score of the result. - -
- Example results - -```text -American History X 1998 -BM25 score: 2.707 - -A Beautiful Mind 2001 -BM25 score: 1.896 - -Legends of the Fall 1994 -BM25 score: 1.663 - -Hacksaw Ridge 2016 -BM25 score: 1.554 - -Night at the Museum 2006 -BM25 score: 1.529 -``` - -
- - -## Hybrid search - -### Code - -This example finds entries in "MovieMM" with the highest hybrid search scores for the term "history", and prints out the title and release year of the top 5 matches. - - - -### Explain the code - -The results are based on a hybrid search score. A hybrid search blends results of BM25 and semantic/vector searches. - -The `limit` parameter here sets the maximum number of results to return. - -The `return_metadata` parameter takes an instance of the `MetadataQuery` class to set metadata to return in the search results. The current query returns the `score`, which is the hybrid score of the result. - -
- Example results - -```text -Legends of the Fall 1994 -Hybrid score: 0.016 - -Hacksaw Ridge 2016 -Hybrid score: 0.016 - -A Beautiful Mind 2001 -Hybrid score: 0.015 - -The Butterfly Effect 2004 -Hybrid score: 0.015 - -Night at the Museum 2006 -Hybrid score: 0.012 -``` - -
- - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_multimodal_data/103_mm_searches/30_filters.mdx b/docs/academy/py/starter_multimodal_data/103_mm_searches/30_filters.mdx deleted file mode 100644 index 4ebbf6246..000000000 --- a/docs/academy/py/starter_multimodal_data/103_mm_searches/30_filters.mdx +++ /dev/null @@ -1,55 +0,0 @@ ---- -title: Filters -description: Filters for Multimodal Searches ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/103_searches.py'; - -Filters can be used to precisely refine search results. You can filter by properties as well as metadata, and you can combine multiple filters with `and` or `or` conditions to further narrow down the results. - -### Code - -This example finds entries in "MovieMM" based on their similarity to the query "dystopian future", only from those released after 2010. It prints out the title and release year of the top 5 matches. - - - -## Explain the code - -This query is identical to [that shown earlier](./10_multimodal.mdx) for search, but with the addition of a filter. The `filters` parameter here takes an instance of the `Filter` class to set the filter conditions. The current query filters the results to only include those with a release year after 2010. - -
- Example results - -```text -Dune 2021 -Distance to query: 0.199 - -Tenet 2020 -Distance to query: 0.200 - -Mission: Impossible - Dead Reckoning Part One 2023 -Distance to query: 0.207 - -Onward 2020 -Distance to query: 0.214 - -Jurassic World Dominion 2022 -Distance to query: 0.216 -``` - -
- - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_multimodal_data/103_mm_searches/index.mdx b/docs/academy/py/starter_multimodal_data/103_mm_searches/index.mdx deleted file mode 100644 index 649333b2d..000000000 --- a/docs/academy/py/starter_multimodal_data/103_mm_searches/index.mdx +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: Perform searches -description: Learn multimodal search techniques with Weaviate's Python client. ---- - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_multimodal_data/104_mm_rag/10_setup.mdx b/docs/academy/py/starter_multimodal_data/104_mm_rag/10_setup.mdx deleted file mode 100644 index 3d57e77aa..000000000 --- a/docs/academy/py/starter_multimodal_data/104_mm_rag/10_setup.mdx +++ /dev/null @@ -1,40 +0,0 @@ ---- -title: "RAG: Overview" -description: Setting up Multimodal RAG ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/102_collection.py'; - -### Motivation - -Retrieval augmented generation (RAG) is a way to combine the best of both worlds: the retrieval capabilities of semantic search and the generation capabilities of AI models such as large language models. This allows you to retrieve objects from a Weaviate instance and then generate outputs based on the retrieved objects. - -### Setup - -When we created a collection, we specified the `generative_module` parameter as shown here. - - - -This selects a generative module that will be used to generate outputs based on the retrieved objects. In this case, we're using the `openai` module, and the `GPT` family of large language models. 
- -As we did before with the vectorizer module, you will require an API key from the provider of the generative module. In this case, you will need an API key from OpenAI. - -### RAG queries - -RAG queries are also called 'generative' queries in Weaviate. You can access these functions through the `generate` submodule of the collection object. - -Each generative query works in addition to the regular search query, and will perform a RAG query on each retrieved object. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_multimodal_data/104_mm_rag/20_single_prompt.mdx b/docs/academy/py/starter_multimodal_data/104_mm_rag/20_single_prompt.mdx deleted file mode 100644 index 613bdc584..000000000 --- a/docs/academy/py/starter_multimodal_data/104_mm_rag/20_single_prompt.mdx +++ /dev/null @@ -1,57 +0,0 @@ ---- -title: "'Single prompt' generation" -description: Single Prompt for Multimodal RAG ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/104_rag.py'; - -A 'single prompt' generation will perform RAG queries on each retrieved object. This is useful when you want to transform each object separately, with the same prompt. - -### Code - -This example finds entries in "MovieMM" based on their similarity to [this image of the International Space Station](https://upload.wikimedia.org/wikipedia/commons/thumb/0/04/International_Space_Station_after_undocking_of_STS-132.jpg/440px-International_Space_Station_after_undocking_of_STS-132.jpg). Then, instructs the large language model to translate the title of each movie into French. - -Each of the results are then printed out to the console. - - - -## Explain the code - -You must pass on one or more properties to the `single_prompt` parameter through braces, as we've done here with `"... 
{title} ..."`. This will instruct Weaviate to pass on the `title` property from each retrieved object to the large language model. - -
- Example results - -```text -Interstellar -Interstellaire -Gravity -Gravité -Arrival -Arrivée -Armageddon -Armageddon -Godzilla -Godzilla -``` - -
- -### Response object - -Each response object is similar to that from a regular search query, with an additional `generated` attribute. This attribute will contain the generated output for each object. - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_multimodal_data/104_mm_rag/30_grouped_task.mdx b/docs/academy/py/starter_multimodal_data/104_mm_rag/30_grouped_task.mdx deleted file mode 100644 index eaefdf259..000000000 --- a/docs/academy/py/starter_multimodal_data/104_mm_rag/30_grouped_task.mdx +++ /dev/null @@ -1,60 +0,0 @@ ---- -title: "'Grouped task' generation" -description: Grouped Task for Multimodal RAG ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/104_rag.py'; - -A 'grouped task' generation will perform RAG queries on the set of retrieved objects. This is useful when you want to transform the set of objects as a whole, with one prompt. - -### Code - -This example finds entries in "MovieMM" based on their similarity to [this image of the International Space Station](https://upload.wikimedia.org/wikipedia/commons/thumb/0/04/International_Space_Station_after_undocking_of_STS-132.jpg/440px-International_Space_Station_after_undocking_of_STS-132.jpg). Then, instructs the large language model to find commonalities between them. - -Each of the results are then printed out to the console. - - - -## Explain the code - -For `grouped_task` queries, you simply pass on the prompt to the `grouped_task` parameter. This will instruct Weaviate to pass on the: -- text properties from all retrieved objects, and -- the prompt - -to the large language model. - -
- Example results - -```text -Interstellar -Gravity -Arrival -Armageddon -Godzilla -These movies all involve space exploration, extraterrestrial beings, or catastrophic events threatening Earth. They all deal with themes of survival, human ingenuity, and the unknown mysteries of the universe. -``` - -
- -### Optional parameters - -You can also pass on a list of properties to be used, as the `grouped_properties` parameter. This can be useful to reduce the amount of data passed on to the large language model and omit irrelevant properties. - -### Response object - -A RAG query with the `grouped_task` parameter will return a response with an additional `generated` attribute. This attribute will contain the generated output for the set of objects. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_multimodal_data/104_mm_rag/index.mdx b/docs/academy/py/starter_multimodal_data/104_mm_rag/index.mdx deleted file mode 100644 index 4ebbddf9e..000000000 --- a/docs/academy/py/starter_multimodal_data/104_mm_rag/index.mdx +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: LLMs and Weaviate (RAG) -description: Multimodal Retrieval-Augmented Generation (RAG) Overview ---- - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_multimodal_data/900_next_steps.mdx b/docs/academy/py/starter_multimodal_data/900_next_steps.mdx deleted file mode 100644 index aa424d650..000000000 --- a/docs/academy/py/starter_multimodal_data/900_next_steps.mdx +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: Next steps ---- - -import IntroNextSteps from '../_snippets/intro_next_steps.mdx'; - - diff --git a/docs/academy/py/starter_multimodal_data/_snippets/101_connect.py b/docs/academy/py/starter_multimodal_data/_snippets/101_connect.py deleted file mode 100644 index 273cbc573..000000000 --- a/docs/academy/py/starter_multimodal_data/_snippets/101_connect.py +++ /dev/null @@ -1,523 +0,0 @@ -# DockerInstantiation -import weaviate - -client = weaviate.connect_to_local() -# END DockerInstantiation - -client.close() - -# DockerAPIKeyInstantiation -import 
weaviate -import os - -headers = { - "X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY") -} # Replace with your OpenAI API key - -client = weaviate.connect_to_local(headers=headers) -# END DockerAPIKeyInstantiation - -# PollLiveness -assert client.is_live() # This will raise an exception if the client is not live -# END PollLiveness - - -# GetMeta -import json - -metainfo = client.get_meta() -print(json.dumps(metainfo, indent=2)) # Print the meta information in a readable format -# END GetMeta - - -""" -# OutputGetMeta -{ - "hostname": "http://[::]:8080", - "modules": { - "multi2vec-clip": { - "clip_model": { - "_commit_hash": null, - "_name_or_path": "/root/.cache/torch/sentence_transformers/sentence-transformers_clip-ViT-B-32/0_CLIPModel", - "add_cross_attention": false, - "architectures": [ - "CLIPModel" - ], - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0, - "do_sample": false, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "initializer_factor": 1, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "length_penalty": 1, - "logit_scale_init_value": 2.6592, - "max_length": 20, - "min_length": 0, - "model_type": "clip", - "no_repeat_ngram_size": 0, - "num_beam_groups": 1, - "num_beams": 1, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "prefix": null, - "problem_type": null, - "projection_dim": 512, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1, - "return_dict": true, - 
"return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1, - "text_config": { - "_name_or_path": "", - "add_cross_attention": false, - "architectures": null, - "attention_dropout": 0, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": 0, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0, - "do_sample": false, - "dropout": 0, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": 2, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "gradient_checkpointing": false, - "hidden_act": "quick_gelu", - "hidden_size": 512, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "initializer_factor": 1, - "initializer_range": 0.02, - "intermediate_size": 2048, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "layer_norm_eps": 1e-05, - "length_penalty": 1, - "max_length": 20, - "max_position_embeddings": 77, - "min_length": 0, - "model_type": "clip_text_model", - "no_repeat_ngram_size": 0, - "num_attention_heads": 8, - "num_beam_groups": 1, - "num_beams": 1, - "num_hidden_layers": 12, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": 1, - "prefix": null, - "problem_type": null, - "projection_dim": 512, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1, - "torch_dtype": null, - "torchscript": false, - 
"transformers_version": "4.30.2", - "typical_p": 1, - "use_bfloat16": false, - "vocab_size": 49408 - }, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1, - "torch_dtype": "torch.float32", - "torchscript": false, - "transformers_version": null, - "typical_p": 1, - "use_bfloat16": false, - "vision_config": { - "_name_or_path": "", - "add_cross_attention": false, - "architectures": null, - "attention_dropout": 0, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0, - "do_sample": false, - "dropout": 0, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "gradient_checkpointing": false, - "hidden_act": "quick_gelu", - "hidden_size": 768, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "image_size": 224, - "initializer_factor": 1, - "initializer_range": 0.02, - "intermediate_size": 3072, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "layer_norm_eps": 1e-05, - "length_penalty": 1, - "max_length": 20, - "min_length": 0, - "model_type": "clip_vision_model", - "no_repeat_ngram_size": 0, - "num_attention_heads": 12, - "num_beam_groups": 1, - "num_beams": 1, - "num_channels": 3, - "num_hidden_layers": 12, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "patch_size": 32, - "prefix": null, - "problem_type": null, - "projection_dim": 512, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": 
null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1, - "torch_dtype": null, - "torchscript": false, - "transformers_version": "4.30.2", - "typical_p": 1, - "use_bfloat16": false - } - }, - "text_model": { - "_commit_hash": null, - "_name_or_path": "./models/text/0_CLIPModel", - "add_cross_attention": false, - "architectures": [ - "CLIPModel" - ], - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0, - "do_sample": false, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "initializer_factor": 1, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "length_penalty": 1, - "logit_scale_init_value": 2.6592, - "max_length": 20, - "min_length": 0, - "model_type": "clip", - "no_repeat_ngram_size": 0, - "num_beam_groups": 1, - "num_beams": 1, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "prefix": null, - "problem_type": null, - "projection_dim": 512, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1, - "text_config": { - "_name_or_path": "", - "add_cross_attention": false, - "architectures": null, - "attention_dropout": 0, - "bad_words_ids": null, - "begin_suppress_tokens": 
null, - "bos_token_id": 0, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0, - "do_sample": false, - "dropout": 0, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": 2, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "gradient_checkpointing": false, - "hidden_act": "quick_gelu", - "hidden_size": 512, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "initializer_factor": 1, - "initializer_range": 0.02, - "intermediate_size": 2048, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "layer_norm_eps": 1e-05, - "length_penalty": 1, - "max_length": 20, - "max_position_embeddings": 77, - "min_length": 0, - "model_type": "clip_text_model", - "no_repeat_ngram_size": 0, - "num_attention_heads": 8, - "num_beam_groups": 1, - "num_beams": 1, - "num_hidden_layers": 12, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": 1, - "prefix": null, - "problem_type": null, - "projection_dim": 512, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1, - "torch_dtype": null, - "torchscript": false, - "transformers_version": "4.30.2", - "typical_p": 1, - "use_bfloat16": false, - "vocab_size": 49408 - }, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1, - "torch_dtype": "torch.float32", - "torchscript": false, - 
"transformers_version": null, - "typical_p": 1, - "use_bfloat16": false, - "vision_config": { - "_name_or_path": "", - "add_cross_attention": false, - "architectures": null, - "attention_dropout": 0, - "bad_words_ids": null, - "begin_suppress_tokens": null, - "bos_token_id": null, - "chunk_size_feed_forward": 0, - "cross_attention_hidden_size": null, - "decoder_start_token_id": null, - "diversity_penalty": 0, - "do_sample": false, - "dropout": 0, - "early_stopping": false, - "encoder_no_repeat_ngram_size": 0, - "eos_token_id": null, - "exponential_decay_length_penalty": null, - "finetuning_task": null, - "forced_bos_token_id": null, - "forced_eos_token_id": null, - "gradient_checkpointing": false, - "hidden_act": "quick_gelu", - "hidden_size": 768, - "id2label": { - "0": "LABEL_0", - "1": "LABEL_1" - }, - "image_size": 224, - "initializer_factor": 1, - "initializer_range": 0.02, - "intermediate_size": 3072, - "is_decoder": false, - "is_encoder_decoder": false, - "label2id": { - "LABEL_0": 0, - "LABEL_1": 1 - }, - "layer_norm_eps": 1e-05, - "length_penalty": 1, - "max_length": 20, - "min_length": 0, - "model_type": "clip_vision_model", - "no_repeat_ngram_size": 0, - "num_attention_heads": 12, - "num_beam_groups": 1, - "num_beams": 1, - "num_channels": 3, - "num_hidden_layers": 12, - "num_return_sequences": 1, - "output_attentions": false, - "output_hidden_states": false, - "output_scores": false, - "pad_token_id": null, - "patch_size": 32, - "prefix": null, - "problem_type": null, - "projection_dim": 512, - "pruned_heads": {}, - "remove_invalid_values": false, - "repetition_penalty": 1, - "return_dict": true, - "return_dict_in_generate": false, - "sep_token_id": null, - "suppress_tokens": null, - "task_specific_params": null, - "temperature": 1, - "tf_legacy_loss": false, - "tie_encoder_decoder": false, - "tie_word_embeddings": true, - "tokenizer_class": null, - "top_k": 50, - "top_p": 1, - "torch_dtype": null, - "torchscript": false, - "transformers_version": 
"4.30.2", - "typical_p": 1, - "use_bfloat16": false - } - } - } - }, - "version": "1.23.9" -} -# END OutputGetMeta -""" - - -client.close() - - -# TryFinallyCloseDemo -import weaviate -import os - -# END TryFinallyCloseDemo - -client = weaviate.connect_to_local() - -# TryFinallyCloseDemo -# Instantiate your client (not shown). e.g.: -# client = weaviate.connect_to_local() - -try: - # Work with the client here - e.g.: - assert client.is_live() - pass - -finally: # This will always be executed, even if an exception is raised - client.close() # Close the connection & release resources -# END TryFinallyCloseDemo diff --git a/docs/academy/py/starter_multimodal_data/_snippets/102_collection.py b/docs/academy/py/starter_multimodal_data/_snippets/102_collection.py deleted file mode 100644 index 290fc8780..000000000 --- a/docs/academy/py/starter_multimodal_data/_snippets/102_collection.py +++ /dev/null @@ -1,146 +0,0 @@ -# CreateMovieCollection -import weaviate - -# CreateMovieCollection # SubmoduleImport -import weaviate.classes.config as wc - -# CreateMovieCollection # END SubmoduleImport - -# END CreateMovieCollection -client = weaviate.connect_to_local( - port=8280, - grpc_port=50251 -) - -# CreateMovieCollection -# Instantiate your client (not shown). 
e.g.: -# headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")} # Replace with your OpenAI API key -# client = weaviate.connect_to_local(headers=headers) - -# END CreateMovieCollection - -# Actual instantiation - -client.collections.delete("MovieMM") - -# CreateMovieCollection -client.collections.create( - name="MovieMM", # The name of the collection ('MM' for multimodal) - properties=[ - wc.Property(name="title", data_type=wc.DataType.TEXT), - wc.Property(name="overview", data_type=wc.DataType.TEXT), - wc.Property(name="vote_average", data_type=wc.DataType.NUMBER), - wc.Property(name="genre_ids", data_type=wc.DataType.INT_ARRAY), - wc.Property(name="release_date", data_type=wc.DataType.DATE), - wc.Property(name="tmdb_id", data_type=wc.DataType.INT), - wc.Property(name="poster", data_type=wc.DataType.BLOB), - ], - # Define & configure the vectorizer module - vector_config=wc.Configure.Vectors.multi2vec_clip( - image_fields=[wc.Multi2VecField(name="poster", weight=0.9)], # 90% of the vector is from the poster - text_fields=[wc.Multi2VecField(name="title", weight=0.1)], # 10% of the vector is from the title - ), - # Define the generative module - generative_config=wc.Configure.Generative.openai() - # END generativeDefinition # CreateMovieCollection -) - -client.close() -# END CreateMovieCollection - - -# BatchImportData -import weaviate -import pandas as pd -import requests -from datetime import datetime, timezone -import json -from weaviate.util import generate_uuid5 -from tqdm import tqdm -import os -import zipfile -from pathlib import Path -import base64 - -# END BatchImportData -headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")} -client = weaviate.connect_to_local( - port=8280, - grpc_port=50251, - headers=headers -) - -# BatchImportData -# Instantiate your client (not shown). 
e.g.: -# client = weaviate.connect_to_local() - -# END BatchImportData - -# BatchImportData -data_url = "https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024.json" -resp = requests.get(data_url) -df = pd.DataFrame(resp.json()) - -# Create a directory for the images -img_dir = Path("scratch/imgs") -img_dir.mkdir(parents=True, exist_ok=True) - -# Download images -posters_url = "https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024_posters.zip" -posters_path = img_dir / "movies_data_1990_2024_posters.zip" -posters_path.write_bytes(requests.get(posters_url).content) - -# Unzip the images -with zipfile.ZipFile(posters_path, 'r') as zip_ref: - zip_ref.extractall(img_dir) - -# Get the collection -movies = client.collections.use("MovieMM") - -# END BatchImportData - -df = df[:50] # Limit to 50 for testing purposes - -# BatchImportData -# Enter context manager -with movies.batch.fixed_size(50) as batch: - # Loop through the data - for i, movie in tqdm(df.iterrows()): - # Convert data types - # Convert a JSON date to `datetime` and add time zone information - release_date = datetime.strptime(movie["release_date"], "%Y-%m-%d").replace( - tzinfo=timezone.utc - ) - # Convert a JSON array to a list of integers - genre_ids = json.loads(movie["genre_ids"]) - # Convert image to base64 - img_path = (img_dir / f"{movie['id']}_poster.jpg") - with open(img_path, "rb") as file: - poster_b64 = base64.b64encode(file.read()).decode("utf-8") - - # Build the object payload - movie_obj = { - "title": movie["title"], - "overview": movie["overview"], - "vote_average": movie["vote_average"], - "genre_ids": genre_ids, - "release_date": release_date, - "tmdb_id": movie["id"], - "poster": poster_b64, - } - - # Add object to batch queue - batch.add_object( - properties=movie_obj, - uuid=generate_uuid5(movie["id"]) - # references=reference_obj # You can add references here - ) - # Batcher automatically sends batches - -# 
Check for failed objects -if len(movies.batch.failed_objects) > 0: - print(f"Failed to import {len(movies.batch.failed_objects)} objects") - for failed in movies.batch.failed_objects: - print(f"e.g. Failed to import object with error: {failed.message}") - -client.close() diff --git a/docs/academy/py/starter_multimodal_data/_snippets/103_searches.py b/docs/academy/py/starter_multimodal_data/_snippets/103_searches.py deleted file mode 100644 index 1c5884f23..000000000 --- a/docs/academy/py/starter_multimodal_data/_snippets/103_searches.py +++ /dev/null @@ -1,180 +0,0 @@ -# START-ANY -import weaviate -import weaviate.classes.query as wq -import os - -# END-ANY - -# FilteredSemanticSearch -from datetime import datetime - -# END FilteredSemanticSearch - -# START-ANY - -# END-ANY - -headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")} -client = weaviate.connect_to_local( - port=8280, - grpc_port=50251, - headers=headers -) - -# START-ANY -# Instantiate your client (not shown). e.g.: -# headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")} # Replace with your OpenAI API key -# client = weaviate.connect_to_local(headers=headers) - -# END-ANY - - -# MetadataMultimodalSearch - -def url_to_base64(url): - import requests - import base64 - - image_response = requests.get(url) - content = image_response.content - return base64.b64encode(content).decode("utf-8") - - -# Get the collection -movies = client.collections.use("MovieMM") - -# Perform query -src_img_path = "https://github.com/weaviate-tutorials/edu-datasets/blob/main/img/International_Space_Station_after_undocking_of_STS-132.jpg?raw=true" -query_b64 = url_to_base64(src_img_path) - -response = movies.query.near_image( - near_image=query_b64, - limit=5, - return_metadata=wq.MetadataQuery(distance=True), - return_properties=["title", "release_date", "tmdb_id", "poster"] # To include the poster property in the response (`blob` properties are not returned by default) -) - -# Inspect the response -for o in 
response.objects: - print( - o.properties["title"], o.properties["release_date"].year, o.properties["tmdb_id"] - ) # Print the title and release year (note the release date is a datetime object) - print( - f"Distance to query: {o.metadata.distance:.3f}\n" - ) # Print the distance of the object from the query - -client.close() -# END MetadataMultimodalSearch - - -print("\n\n") - -client.connect() - - -# MetadataSemanticSearch -# Get the collection -movies = client.collections.use("MovieMM") - -# Perform query -response = movies.query.near_text( - query="red", - limit=5, - return_metadata=wq.MetadataQuery(distance=True), - return_properties=["title", "release_date", "tmdb_id", "poster"] # To include the poster property in the response (`blob` properties are not returned by default) -) - -# Inspect the response -for o in response.objects: - print( - o.properties["title"], o.properties["release_date"].year, o.properties["tmdb_id"] - ) # Print the title and release year (note the release date is a datetime object) - print( - f"Distance to query: {o.metadata.distance:.3f}\n" - ) # Print the distance of the object from the query - -client.close() -# END MetadataSemanticSearch - - -print("\n\n") - -client.connect() - -# MetadataBM25Search -# Get the collection -movies = client.collections.use("MovieMM") - -# Perform query -response = movies.query.bm25( - query="history", limit=5, return_metadata=wq.MetadataQuery(score=True) -) - -# Inspect the response -for o in response.objects: - print( - o.properties["title"], o.properties["release_date"].year - ) # Print the title and release year (note the release date is a datetime object) - print( - f"BM25 score: {o.metadata.score:.3f}\n" - ) # Print the BM25 score of the object from the query - -client.close() -# END MetadataBM25Search - - -print("\n\n") - -client.connect() - -# MetadataHybridSearch -# Get the collection -movies = client.collections.use("MovieMM") - -# Perform query -response = movies.query.hybrid( - 
query="history", limit=5, return_metadata=wq.MetadataQuery(score=True) -) - -# Inspect the response -for o in response.objects: - print( - o.properties["title"], o.properties["release_date"].year - ) # Print the title and release year (note the release date is a datetime object) - print( - f"Hybrid score: {o.metadata.score:.3f}\n" - ) # Print the hybrid search score of the object from the query - -client.close() -# END MetadataHybridSearch - - -print("\n\n") - -client.connect() - -# FilteredSemanticSearch -# Get the collection -movies = client.collections.use("MovieMM") - -# Perform query -response = movies.query.near_text( - query="dystopian future", - limit=5, - return_metadata=wq.MetadataQuery(distance=True), - # highlight-start - filters=wq.Filter.by_property("release_date").greater_than(datetime(2020, 1, 1)) - # highlight-end -) - -# Inspect the response -for o in response.objects: - print( - o.properties["title"], o.properties["release_date"].year - ) # Print the title and release year (note the release date is a datetime object) - print( - f"Distance to query: {o.metadata.distance:.3f}\n" - ) # Print the distance of the object from the query - -client.close() -# END FilteredSemanticSearch diff --git a/docs/academy/py/starter_multimodal_data/_snippets/104_rag.py b/docs/academy/py/starter_multimodal_data/_snippets/104_rag.py deleted file mode 100644 index 68de4b4c4..000000000 --- a/docs/academy/py/starter_multimodal_data/_snippets/104_rag.py +++ /dev/null @@ -1,89 +0,0 @@ -# START-ANY -import os -import weaviate -import os - -# END-ANY - -headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")} -client = weaviate.connect_to_local( - port=8280, - grpc_port=50251, - headers=headers -) - -# START-ANY -# Instantiate your client (not shown). 
e.g.: -# headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")} # Replace with your OpenAI API key -# client = weaviate.connect_to_local(headers=headers) - - -def url_to_base64(url): - import requests - import base64 - - image_response = requests.get(url) - content = image_response.content - return base64.b64encode(content).decode("utf-8") - - -# END-ANY - -# SinglePromptGeneration -# Get the collection -movies = client.collections.use("MovieMM") - -# Perform query -src_img_path = "https://github.com/weaviate-tutorials/edu-datasets/blob/main/img/International_Space_Station_after_undocking_of_STS-132.jpg?raw=true" -query_b64 = url_to_base64(src_img_path) - -response = movies.generate.near_image( - near_image=query_b64, - limit=5, - # highlight-start - single_prompt="Translate this into French: {title}" - # highlight-end -) - -# Inspect the response -for o in response.objects: - # highlight-start - print(o.properties["title"]) # Print the title - # highlight-end - print(o.generated) # Print the generated text (the title, in French) - -client.close() -# END SinglePromptGeneration - - -print("\n\n") - -client.connect() - - -# GroupedTaskGeneration -# Get the collection -movies = client.collections.use("MovieMM") - -# Perform query -src_img_path = "https://github.com/weaviate-tutorials/edu-datasets/blob/main/img/International_Space_Station_after_undocking_of_STS-132.jpg?raw=true" -query_b64 = url_to_base64(src_img_path) - -response = movies.generate.near_image( - near_image=query_b64, - limit=5, - # highlight-start - grouped_task="What do these movies have in common?", - grouped_properties=["title", "overview"] # Optional parameter; for reducing prompt length - # highlight-end -) - -# Inspect the response -for o in response.objects: - print(o.properties["title"]) # Print the title -# highlight-start -print(response.generative.text) # Print the generated text (the commonalities between them) -# highlight-end - -client.close() -# END GroupedTaskGeneration diff --git 
a/docs/academy/py/starter_multimodal_data/index.md b/docs/academy/py/starter_multimodal_data/index.md deleted file mode 100644 index 55fc03bf1..000000000 --- a/docs/academy/py/starter_multimodal_data/index.md +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: "101M Work with: Multimodal data" -description: Learn to handle multimodal data in Weaviate for diverse data integrations. -sidebar_position: 102 ---- - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; -import CourseUnits from '/src/components/Academy/courseUnits.jsx'; -import { courseData } from '/src/components/Academy/courseData.js' - -## Course overview - -In this project-based course, you will learn how to work with multimodal data using Weaviate and a movie dataset. - -You will get hands-on experience on how to store and index text and image data to be searchable together by meaning, using Weaviate's vectorization capabilities. You will learn how to search through that data using multimodal search methods, as well as filters. You will also learn how to use Weaviate's retrieval augmented generation (RAG) capabilities to generate outputs based on the retrieved objects. - -## Learning objectives - - - -## Units - - - diff --git a/docs/academy/py/starter_text_data/101_setup_weaviate/10_client.mdx b/docs/academy/py/starter_text_data/101_setup_weaviate/10_client.mdx deleted file mode 100644 index 98b35da1d..000000000 --- a/docs/academy/py/starter_text_data/101_setup_weaviate/10_client.mdx +++ /dev/null @@ -1,36 +0,0 @@ ---- -title: Weaviate Python client -description: Client Setup for Text Data in Weaviate ---- - -## Installation - -The latest Weaviate Python client library can be installed using pip. The client library is tested on Python 3.8 and later. Install it using the following command: - -```bash -pip install -U weaviate-client -``` - -The latest major version is `v4` (e.g. `4.x.x`). 
You can check the version like so: - -```bash -pip show weaviate-client -``` - -## Basic usage - -From Python, you can load the Weaviate client library like so: - -```python -import weaviate -``` - -The client provides sets of helper classes (e.g. under `weaviate.classes`) and functions to make it easier to interact with Weaviate. - -Next, we'll show you how to create a Weaviate instance and connect to it. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_text_data/101_setup_weaviate/20_create_instance/10_create_wcs.mdx b/docs/academy/py/starter_text_data/101_setup_weaviate/20_create_instance/10_create_wcs.mdx deleted file mode 100644 index 7a0fa3d0b..000000000 --- a/docs/academy/py/starter_text_data/101_setup_weaviate/20_create_instance/10_create_wcs.mdx +++ /dev/null @@ -1,67 +0,0 @@ ---- -title: "Option 1: A cloud WCD instance" -description: Creating Weaviate Cloud Instance for Text Data ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../../_snippets/101_connect.py'; - -Here, you will create a Weaviate Cloud (WCD) instance. WCD is a fully managed Weaviate instance that runs in the cloud. It's a great way to get started with Weaviate, as it requires no installation or maintenance. - -### Log in to the WCD Console - -Go to the [WCD Console](https://console.weaviate.cloud/) and log in with your credentials. If you don't have an account yet, you can sign up by clicking on the Register here link from the login screen. - -### Create a Weaviate instance - -From the console, go to the Dashboard and click on the Create cluster button. From the following screen: - -- Select the "Free sandbox" tab -- Provide a cluster name -- Set "Enable authentication" to "Yes" - -Click on the Create button to create your Weaviate instance. 
The process will take a few minutes. - -### Retrieve your Weaviate instance details - -Once the instance is created, you will be able see its details by clicking on the Details button. Find the cluster URL and the API key. - -You will need these details to connect to your Weaviate instance. - -### Connect to your WCD instance - -To connect to the Weaviate Cloud (WCD) instance, you need to use the cluster URL and the API key. You can find these details in the WCD Console. - -Use the `connect_to_weaviate_cloud` function to connect to your WCD instance. - - - -#### Provide inference API keys - -Some Weaviate modules can use inference APIs for vectorizing data or large language model integration. You can provide the API keys for these services to Weaviate at instantiation. - -This course uses OpenAI, so you can provide the OpenAI API key to Weaviate through `headers={"X-OpenAI-Api-Key": }` as shown below: - - - -:::note What next? -If you have completed this, you can skip the next page [Option 2: A local Weaviate instance](./20_create_docker.mdx) and continue with [Communicate with Weaviate](../30_communicate.mdx). 
-::: - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_text_data/101_setup_weaviate/20_create_instance/20_create_docker.mdx b/docs/academy/py/starter_text_data/101_setup_weaviate/20_create_instance/20_create_docker.mdx deleted file mode 100644 index 964a98a3d..000000000 --- a/docs/academy/py/starter_text_data/101_setup_weaviate/20_create_instance/20_create_docker.mdx +++ /dev/null @@ -1,90 +0,0 @@ ---- -title: "Option 2: A local Docker instance" -description: Creating Docker Instance for Text Data ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../../_snippets/101_connect.py'; - -:::note Have you already created a Weaviate instance? -If you have [created a cloud instance](./10_create_wcs.mdx) of Weaviate, you can skip this page and continue with [Communicate with Weaviate](../30_communicate.mdx). -::: - -Here, you will create a Weaviate instance using Docker. - -### Download and run the docker-compose file - -Install Docker on your machine. We recommend following the [official Docker installation guide](https://docs.docker.com/get-docker/). - -Create a new directory and navigate to it in your terminal. Then, create a new file called `docker-compose.yml` and add the following content: - -```yaml ---- -services: - weaviate_anon: - command: - - --host - - 0.0.0.0 - - --port - - '8080' - - --scheme - - http - image: cr.weaviate.io/semitechnologies/weaviate:||site.weaviate_version|| - ports: - - 8080:8080 - - 50051:50051 - restart: on-failure:0 - environment: - QUERY_DEFAULTS_LIMIT: 25 - AUTHENTICATION_ANONYMOUS_ACCESS_ENABLED: 'true' - PERSISTENCE_DATA_PATH: '/var/lib/weaviate' - ENABLE_API_BASED_MODULES: 'true' - BACKUP_FILESYSTEM_PATH: '/var/lib/weaviate/backups' - CLUSTER_HOSTNAME: 'node1' -... 
- -``` - -### Create a Weaviate instance - -Run the following command to start Weaviate: - -```bash -docker compose up -``` - -### Your Weaviate instance details - -Once the instance is created, you can access it at `http://localhost:8080`. - -### Connect to your Weaviate instance - -To connect to the Weaviate instance, use the `connect_to_local` function. - - - -#### Provide inference API keys - -Some Weaviate modules can use inference APIs for vectorizing data or large language model integration. You can provide the API keys for these services to Weaviate at instantiation. - -This course uses OpenAI, so you can provide the OpenAI API key to Weaviate through `headers={"X-OpenAI-Api-Key": }` as shown below: - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_text_data/101_setup_weaviate/20_create_instance/index.mdx b/docs/academy/py/starter_text_data/101_setup_weaviate/20_create_instance/index.mdx deleted file mode 100644 index 939d42813..000000000 --- a/docs/academy/py/starter_text_data/101_setup_weaviate/20_create_instance/index.mdx +++ /dev/null @@ -1,19 +0,0 @@ ---- -title: Create a Weaviate instance -description: Creating a Text Data Instance in Weaviate ---- - -For this unit, you can choose to create a Weaviate Cloud (WCD) instance or a local Docker instance. - -- [Create a Weaviate Cloud (WCD) instance](./10_create_wcs.mdx) - - If you want a managed service and don't want to worry about installation and maintenance. -- [Create a local Docker instance](./20_create_docker.mdx) - - If you want to run Weaviate on your local machine, or want to have full control over the installation and maintenance. - -Either option is fine for this course. If you're not sure which to choose, we recommend starting with a WCD instance. 
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_text_data/101_setup_weaviate/30_communicate.mdx b/docs/academy/py/starter_text_data/101_setup_weaviate/30_communicate.mdx deleted file mode 100644 index bf0087122..000000000 --- a/docs/academy/py/starter_text_data/101_setup_weaviate/30_communicate.mdx +++ /dev/null @@ -1,65 +0,0 @@ ---- -title: Communicate with Weaviate -description: Communication Setup for Text Data in Weaviate ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/101_connect.py'; - -Here, we'll perform basic operations to communicate with Weaviate using the Python client library. - -### Check Weaviate status - -You can check whether the Weaviate instance is up using the `is_live` function. - - - -### Retrieve server meta information - -You can retrieve meta information about the Weaviate instance using the `meta` function. - - - -This will print the server meta information to the console. The output will look similar to the following: - -
- Example get_meta output - - -
- -### Close the connection - -After you have finished using the Weaviate client, you should close the connection. This frees up resources and ensures that the connection is properly closed. - -We suggest using a `try`-`finally` block as a best practice. For brevity, we will not include the `try`-`finally` blocks in the remaining code snippets. - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_text_data/101_setup_weaviate/index.mdx b/docs/academy/py/starter_text_data/101_setup_weaviate/index.mdx deleted file mode 100644 index 390568202..000000000 --- a/docs/academy/py/starter_text_data/101_setup_weaviate/index.mdx +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: Set up Weaviate -description: Step-by-step guide to setting up Weaviate with text data management. ---- - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_text_data/102_text_collections/10_preparation.mdx b/docs/academy/py/starter_text_data/102_text_collections/10_preparation.mdx deleted file mode 100644 index 20025c77d..000000000 --- a/docs/academy/py/starter_text_data/102_text_collections/10_preparation.mdx +++ /dev/null @@ -1,41 +0,0 @@ ---- -title: Preparation -description: Preparing Text Collections for Use ---- - -In this section you are going to populate your Weaviate instance with a movie dataset, using the OpenAI API to embed the text data. - -### Weaviate instance - -Make sure to have your Weaviate instance set up. You should have [created an instance](../101_setup_weaviate/20_create_instance/index.mdx) and be able to connect to it. - -### OpenAI key - -You will need an OpenAI API key to follow along. If you don't have one, go to the [OpenAI website](https://openai.com/) and sign up for an account and create an API key. 
- - - -### Source data - -We are going to use a movie dataset sourced from [TMDB](https://www.themoviedb.org/). The dataset can be found in this [GitHub repository](https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024.json), and it contains bibliographic information on ~700 movies released between 1990 and 2024. - -
- See sample data - -| | backdrop_path | genre_ids | id | original_language | original_title | overview | popularity | poster_path | release_date | title | video | vote_average | vote_count | -|---:|:---------------------------------|:----------------|-----:|:--------------------|:----------------------------|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-------------:|:---------------------------------|:---------------|:----------------------------|:--------|---------------:|-------------:| -| 0 | /3Nn5BOM1EVw1IYrv6MsbOS6N1Ol.jpg | [14, 18, 10749] | 162 | en | Edward Scissorhands | A small suburban town receives a visit from a castaway unfinished science experiment named Edward. | 45.694 | /1RFIbuW9Z3eN9Oxw2KaQG5DfLmD.jpg | 1990-12-07 | Edward Scissorhands | False | 7.7 | 12305 | -| 1 | /sw7mordbZxgITU877yTpZCud90M.jpg | [18, 80] | 769 | en | GoodFellas | The true story of Henry Hill, a half-Irish, half-Sicilian Brooklyn kid who is adopted by neighbourhood gangsters at an early age and climbs the ranks of a Mafia family under the guidance of Jimmy Conway. | 57.228 | /aKuFiU82s5ISJpGZp7YkIr3kCUd.jpg | 1990-09-12 | GoodFellas | False | 8.5 | 12106 | -| 2 | /6uLhSLXzB1ooJ3522ydrBZ2Hh0W.jpg | [35, 10751] | 771 | en | Home Alone | Eight-year-old Kevin McCallister makes the most of the situation after his family unwittingly leaves him behind when they go on Christmas vacation. But when a pair of bungling burglars set their sights on Kevin's house, the plucky kid stands ready to defend his territory. 
By planting booby traps galore, adorably mischievous Kevin stands his ground as his frantic mother attempts to race home before Christmas Day. | 3.538 | /onTSipZ8R3bliBdKfPtsDuHTdlL.jpg | 1990-11-16 | Home Alone | False | 7.4 | 10599 | -| 3 | /vKp3NvqBkcjHkCHSGi6EbcP7g4J.jpg | [12, 35, 878] | 196 | en | Back to the Future Part III | The final installment of the Back to the Future trilogy finds Marty digging the trusty DeLorean out of a mineshaft and looking for Doc in the Wild West of 1885. But when their time machine breaks down, the travelers are stranded in a land of spurs. More problems arise when Doc falls for pretty schoolteacher Clara Clayton, and Marty tangles with Buford Tannen. | 28.896 | /crzoVQnMzIrRfHtQw0tLBirNfVg.jpg | 1990-05-25 | Back to the Future Part III | False | 7.5 | 9918 | -| 4 | /3tuWpnCTe14zZZPt6sI1W9ByOXx.jpg | [35, 10749] | 114 | en | Pretty Woman | When a millionaire wheeler-dealer enters a business contract with a Hollywood hooker Vivian Ward, he loses his heart in the bargain. | 97.953 | /hVHUfT801LQATGd26VPzhorIYza.jpg | 1990-03-23 | Pretty Woman | False | 7.5 | 7671 | - -
- -Next, you will create a corresponding object collection and import the data. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_text_data/102_text_collections/20_create_collection.mdx b/docs/academy/py/starter_text_data/102_text_collections/20_create_collection.mdx deleted file mode 100644 index 1d040623d..000000000 --- a/docs/academy/py/starter_text_data/102_text_collections/20_create_collection.mdx +++ /dev/null @@ -1,87 +0,0 @@ ---- -title: Create a collection -description: Creating a Collection for Text Data ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/102_collection.py'; - -Weaviate stores data in "collections". A collection is a set of objects that share the same data structure. In our movie database, we might have a collection of movies, a collection of actors, and a collection of reviews. - -Here we will create a collection of movies. - -## Code - -This example creates a collection for the movie data: - - - -Each collection definition must have a name. Then, you can define additional parameters like we've done in this example. - -## Explain the code - -### Properties - -Properties are the object attributes that you want to store in the collection. Each property has a name and a data type. - -In our movie database, we have properties like `title`, `release_date` and `genre_ids`, with data types like `TEXT` (string), `DATE` (date), or `INT` (integer). It's also possible to have arrays of integers, like we have with `genre_ids`. - -#### Auto-schema - -Weaviate can automatically [infer the schema](/weaviate/config-refs/collections.mdx#auto-schema) from the data. However, it's a good practice to define the properties explicitly, for better control and to avoid surprises. 
- -### Vectorizer configuration - -If you do not specify the vector yourself, Weaviate will use a specified vectorizer to generate vector embeddings from your data. - -In this code example, we specify the `text2vec-openai` module with default options. - - - -### Generative configuration - -If you wish to use your collection with a generative model (e.g. a large language model), you must specify the generative module. - -In this code example, we specify the `openai` module (`generative-openai` is the full name) with default options. - - - -import MutableGenerativeConfig from '/_includes/mutable-generative-config.md'; - - - -### Python classes - -The code example makes use of classes such as `Property`, `DataType` and `Configure`. They are defined in the `weaviate.classes.config` submodule and are used to define the collection. - -For convenience, we import the submodule as `wc` and use classes from it. - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_text_data/102_text_collections/30_import_data.mdx b/docs/academy/py/starter_text_data/102_text_collections/30_import_data.mdx deleted file mode 100644 index 7af10d66e..000000000 --- a/docs/academy/py/starter_text_data/102_text_collections/30_import_data.mdx +++ /dev/null @@ -1,98 +0,0 @@ ---- -title: Import data -description: Importing Data to Text Collections ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/102_collection.py'; - -## Code - -This example imports the movie data into our collection. 
- - - -The code: -- Loads the source data & gets the collection -- Enters a context manager with a batcher (`batch`) object -- Loops through the data and adds objects to the batcher -- Prints out any import errors - -## Explain the code - -### Preparation - -We use the requests library to load the data from the source, in this case a JSON file. The data is then converted to a Pandas DataFrame for easier manipulation. - -Then, we create a collection object (with `client.collections.get`) so we can interact with the collection. - -### Batch context manager - -The `batch` object is a context manager that allows you to add objects to the batcher. This is useful when you have a large amount of data to import, as it abstracts away the complexity of managing the batch size and when to send the batch. - - - -This example uses the `.fixed_size()` method to create a batcher which sets the number of objects per batch. There are also other batcher types, like `.rate_limit()` for specifying the number of objects per minute and `.dynamic()` to create a dynamic batcher, which automatically determines and updates the batch size during the import process. - -### Add data to the batcher - -#### Convert data types - -The data is converted from a string to the correct data types for Weaviate. For example, the `release_date` is converted to a datetime object, and the `genre_ids` are converted to a list of integers. - - - -#### Add objects to the batcher - -Then we loop through the data and add each object to the batcher. The `batch.add_object` method is used to add the object to the batcher, and the batcher will send the batch according to the specified batcher type. - - - -### Error handling - -Because a batch includes multiple objects, it's possible that some objects will fail to import. The batcher saves these errors. - -You can print out the errors to see what went wrong, and then decide how to handle them, such as by raising an exception. 
In this example, we simply print out the errors. - - - -Note that the list of errors is cleared when a new context manager is entered, so you must handle the errors before initializing a new batcher. - -## Where do the vectors come from? - -When the batcher sends the queue to Weaviate, the objects are added to the collection. In our case, the movie collection. - -Recall that the collection has a vectorizer module, and we do not specify vectors here. So Weaviate uses the specified vectorizer to generate vector embeddings from the data. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_text_data/102_text_collections/index.mdx b/docs/academy/py/starter_text_data/102_text_collections/index.mdx deleted file mode 100644 index 97553fbdd..000000000 --- a/docs/academy/py/starter_text_data/102_text_collections/index.mdx +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: Populate the database -description: Organize and manage text collections effectively in Weaviate. ---- - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_text_data/103_text_searches/10_semantic.mdx b/docs/academy/py/starter_text_data/103_text_searches/10_semantic.mdx deleted file mode 100644 index e65191979..000000000 --- a/docs/academy/py/starter_text_data/103_text_searches/10_semantic.mdx +++ /dev/null @@ -1,67 +0,0 @@ ---- -title: Semantic search -description: Semantic Search in Text Data ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/103_searches.py'; - -With Weaviate, you can perform semantic searches to find similar items based on their meaning. 
This is done by comparing the vector embeddings of the items in the database. - -### Code - -This example finds entries in "Movie" based on their similarity to the query "dystopian future", and prints out the title and release year of the top 5 matches. - - - -## Explain the code - -The results are based on similarity of the vector embeddings between the query and the database object text. In this case, the embeddings are generated by the vectorizer module. - -The `limit` parameter here sets the maximum number of results to return. - -The `return_metadata` parameter takes an instance of the `MetadataQuery` class to set metadata to return in the search results. The current query returns the vector distance to the query. - -
- Example results - -```text -In Time 2011 -Distance to query: 0.179 - -Gattaca 1997 -Distance to query: 0.180 - -I, Robot 2004 -Distance to query: 0.182 - -Mad Max: Fury Road 2015 -Distance to query: 0.190 - -The Maze Runner 2014 -Distance to query: 0.193 -``` - -
- -### Response object - -The returned object is an instance of a custom class. Its `objects` attribute is a list of search results, each object being an instance of another custom class. - -Each returned object will: -- Include all properties and its UUID by default except those with blob data types. -- Not include any other information (e.g. references, metadata, vectors.) by default. - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_text_data/103_text_searches/20_keyword_hybrid.mdx b/docs/academy/py/starter_text_data/103_text_searches/20_keyword_hybrid.mdx deleted file mode 100644 index 5b1fa2fb2..000000000 --- a/docs/academy/py/starter_text_data/103_text_searches/20_keyword_hybrid.mdx +++ /dev/null @@ -1,105 +0,0 @@ ---- -title: Keyword & Hybrid search -description: Keyword Hybrid Searches in Text Collections ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/103_searches.py'; - -You can also perform keyword (BM25) searches to find items based on their keyword similarity, or hybrid searches that combine BM25 and semantic/vector searches. - -## Keyword search - -### Code - -This example finds entries in "Movie" with the highest keyword search scores for the term "history", and prints out the title and release year of the top 5 matches. - - - -### Explain the code - -The results are based on a keyword search score using what's called the [BM25f](https://en.wikipedia.org/wiki/Okapi_BM25) algorithm. - -The `limit` parameter here sets the maximum number of results to return. - -The `return_metadata` parameter takes an instance of the `MetadataQuery` class to set metadata to return in the search results. The current query returns the `score`, which is the BM25 score of the result. - -
- Example results - -```text -American History X 1998 -BM25 score: 2.707 - -A Beautiful Mind 2001 -BM25 score: 1.896 - -Legends of the Fall 1994 -BM25 score: 1.663 - -Hacksaw Ridge 2016 -BM25 score: 1.554 - -Night at the Museum 2006 -BM25 score: 1.529 -``` - -
- - -## Hybrid search - -### Code - -This example finds entries in "Movie" with the highest hybrid search scores for the term "history", and prints out the title and release year of the top 5 matches. - - - -### Explain the code - -The results are based on a hybrid search score. A hybrid search blends results of BM25 and semantic/vector searches. - -The `limit` parameter here sets the maximum number of results to return. - -The `return_metadata` parameter takes an instance of the `MetadataQuery` class to set metadata to return in the search results. The current query returns the `score`, which is the hybrid score of the result. - -
- Example results - -```text -Legends of the Fall 1994 -Hybrid score: 0.016 - -Hacksaw Ridge 2016 -Hybrid score: 0.016 - -A Beautiful Mind 2001 -Hybrid score: 0.015 - -The Butterfly Effect 2004 -Hybrid score: 0.015 - -Night at the Museum 2006 -Hybrid score: 0.012 -``` - -
- - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_text_data/103_text_searches/30_filters.mdx b/docs/academy/py/starter_text_data/103_text_searches/30_filters.mdx deleted file mode 100644 index 5736ce57b..000000000 --- a/docs/academy/py/starter_text_data/103_text_searches/30_filters.mdx +++ /dev/null @@ -1,55 +0,0 @@ ---- -title: Filters -description: Filters for Text Searches ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/103_searches.py'; - -Filters can be used to precisely refine search results. You can filter by properties as well as metadata, and you can combine multiple filters with `and` or `or` conditions to further narrow down the results. - -### Code - -This example finds entries in "Movie" based on their similarity to the query "dystopian future", only from those released after 2020. It prints out the title and release year of the top 5 matches. - - - -## Explain the code - -This query is identical to [that shown earlier](./10_semantic.mdx) for semantic search, but with the addition of a filter. The `filters` parameter here takes an instance of the `Filter` class to set the filter conditions. The current query filters the results to only include those with a release year after 2010. - -
- Example results - -```text -Dune 2021 -Distance to query: 0.199 - -Tenet 2020 -Distance to query: 0.200 - -Mission: Impossible - Dead Reckoning Part One 2023 -Distance to query: 0.207 - -Onward 2020 -Distance to query: 0.214 - -Jurassic World Dominion 2022 -Distance to query: 0.216 -``` - -
- - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_text_data/103_text_searches/index.mdx b/docs/academy/py/starter_text_data/103_text_searches/index.mdx deleted file mode 100644 index ae5e2b8a1..000000000 --- a/docs/academy/py/starter_text_data/103_text_searches/index.mdx +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: Perform searches -description: Execute efficient text-based searches using Weaviate's Python client. ---- - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_text_data/104_text_rag/10_setup.mdx b/docs/academy/py/starter_text_data/104_text_rag/10_setup.mdx deleted file mode 100644 index cacd2510b..000000000 --- a/docs/academy/py/starter_text_data/104_text_rag/10_setup.mdx +++ /dev/null @@ -1,40 +0,0 @@ ---- -title: "RAG: Overview" -description: Setting up Text RAG ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/102_collection.py'; - -### Motivation - -Retrieval augmented generation (RAG) is a way to combine the best of both worlds: the retrieval capabilities of semantic search and the generation capabilities of AI models such as large language models. This allows you to retrieve objects from a Weaviate instance and then generate outputs based on the retrieved objects. - -### Setup - -When we created a collection, we specified the `generative_module` parameter as shown here. - - - -This selects a generative module that will be used to generate outputs based on the retrieved objects. In this case, we're using the `openai` module, and the `GPT` family of large language models. 
- -As we did before with the vectorizer module, you will require an API key from the provider of the generative module. In this case, you will need an API key from OpenAI. - -### RAG queries - -RAG queries are also called 'generative' queries in Weaviate. You can access these functions through the `generate` submodule of the collection object. - -Each generative query works in addition to the regular search query, and will perform a RAG query on each retrieved object. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_text_data/104_text_rag/20_single_prompt.mdx b/docs/academy/py/starter_text_data/104_text_rag/20_single_prompt.mdx deleted file mode 100644 index 3365e2e20..000000000 --- a/docs/academy/py/starter_text_data/104_text_rag/20_single_prompt.mdx +++ /dev/null @@ -1,57 +0,0 @@ ---- -title: "'Single prompt' generation" -description: Single Prompt for Text RAG ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/104_rag.py'; - -A 'single prompt' generation will perform RAG queries on each retrieved object. This is useful when you want to transform each object separately, with the same prompt. - -### Code - -This example finds entries in "Movie" whose vector best matches the query vector (for "dystopian future"). Then, instructs the large language model to translate the title of each movie into French. - -Each of the results is then printed out to the console. - - - -## Explain the code - -You must pass on one or more properties to the `single_prompt` parameter through braces, as we've done here with `"... {title} ..."`. This will instruct Weaviate to pass on the `title` property from each retrieved object to the large language model. - -
- Example results - -```text -In Time -À temps -Looper -Boucleur -I, Robot -Je, Robot -The Matrix -La Matrice -Children of Men -Les enfants des hommes -``` - -
- -### Response object - -Each response object is similar to that from a regular search query, with an additional `generated` attribute. This attribute will contain the generated output for each object. - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_text_data/104_text_rag/30_grouped_task.mdx b/docs/academy/py/starter_text_data/104_text_rag/30_grouped_task.mdx deleted file mode 100644 index c24028393..000000000 --- a/docs/academy/py/starter_text_data/104_text_rag/30_grouped_task.mdx +++ /dev/null @@ -1,60 +0,0 @@ ---- -title: "'Grouped task' generation" -description: Grouped Task for Text RAG ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!../_snippets/104_rag.py'; - -A 'grouped task' generation will perform RAG queries on the set of retrieved objects. This is useful when you want to transform the set of objects as a whole, with one prompt. - -### Code - -This example finds entries in "Movie" whose vector best matches the query vector (for "dystopian future"). Then, instructs the large language model to find commonalities between them. - -Each of the results is then printed out to the console. - - - -## Explain the code - -For `grouped_task` queries, you simply pass on the prompt to the `grouped_task` parameter. This will instruct Weaviate to pass on the: -- text properties from all retrieved objects, and -- the prompt - -to the large language model. - -
- Example results - -```text -In Time -Looper -I, Robot -The Matrix -Children of Men -These movies all involve futuristic settings and explore themes related to the manipulation of time, technology, and the potential consequences of advancements in society. They also touch on issues such as inequality, control, and the impact of human actions on the future of humanity. -``` - -
- -### Optional parameters - -You can also pass on a list of properties to be used, as the `grouped_properties` parameter. This can be useful to reduce the amount of data passed on to the large language model and omit irrelevant properties. - -### Response object - -A RAG query with the `grouped_task` parameter will return a response with an additional `generated` attribute. This attribute will contain the generated output for the set of objects. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_text_data/104_text_rag/index.mdx b/docs/academy/py/starter_text_data/104_text_rag/index.mdx deleted file mode 100644 index 2097d571f..000000000 --- a/docs/academy/py/starter_text_data/104_text_rag/index.mdx +++ /dev/null @@ -1,24 +0,0 @@ ---- -title: LLMs and Weaviate (RAG) -description: Implement RAG for text data to improve retrieval accuracy in Weaviate. ---- - - - - - - -## Learning objectives - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/starter_text_data/900_next_steps.mdx b/docs/academy/py/starter_text_data/900_next_steps.mdx deleted file mode 100644 index aa424d650..000000000 --- a/docs/academy/py/starter_text_data/900_next_steps.mdx +++ /dev/null @@ -1,7 +0,0 @@ ---- -title: Next steps ---- - -import IntroNextSteps from '../_snippets/intro_next_steps.mdx'; - - diff --git a/docs/academy/py/starter_text_data/_snippets/101_connect.py b/docs/academy/py/starter_text_data/_snippets/101_connect.py deleted file mode 100644 index 82ca65e70..000000000 --- a/docs/academy/py/starter_text_data/_snippets/101_connect.py +++ /dev/null @@ -1,161 +0,0 @@ -# WCDInstantiation -import weaviate -from weaviate.classes.init import Auth -import os - -client = weaviate.connect_to_weaviate_cloud( - cluster_url=os.getenv("WEAVIATE_URL"), # Replace with your WCD 
URL - auth_credentials=Auth.api_key( - os.getenv("WEAVIATE_API_KEY") - ), # Replace with your WCD key -) -# END WCDInstantiation - -client.close() - -# WCDAPIKeyInstantiation -import weaviate -from weaviate.classes.init import Auth -import os - -headers = { - "X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY") -} # Replace with your OpenAI API key - -client = weaviate.connect_to_weaviate_cloud( - cluster_url=os.getenv("WEAVIATE_URL"), # Replace with your WCD URL - auth_credentials=Auth.api_key( - os.getenv("WEAVIATE_API_KEY") - ), # Replace with your WCD key - headers=headers, -) -# END WCDAPIKeyInstantiation - -client.close() - -# DockerInstantiation -import weaviate - -client = weaviate.connect_to_local() -# END DockerInstantiation - -# DockerAPIKeyInstantiation -import weaviate -import os - -headers = { - "X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY") -} # Replace with your OpenAI API key - -client = weaviate.connect_to_local(headers=headers) -# END DockerAPIKeyInstantiation - -# PollLiveness -assert client.is_live() # This will raise an exception if the client is not live -# END PollLiveness - - -# GetMeta -import json - -metainfo = client.get_meta() -print(json.dumps(metainfo, indent=2)) # Print the meta information in a readable format -# END GetMeta - - -""" -# OutputGetMeta -{ - "hostname": "http://[::]:8080", - "modules": { - "backup-gcs": { - "bucketName": "weaviate-wcs-prod-cust-europe-west2-workloads-backups", - "rootName": "8616b69e-f8d2-4547-ad92-70b9557591c0" - }, - "generative-aws": { - "documentationHref": "https://docs.aws.amazon.com/bedrock/latest/APIReference/welcome.html", - "name": "Generative Search - AWS" - }, - "generative-cohere": { - "documentationHref": "https://docs.cohere.com/reference/generate", - "name": "Generative Search - Cohere" - }, - "generative-openai": { - "documentationHref": "https://platform.openai.com/docs/api-reference/completions", - "name": "Generative Search - OpenAI" - }, - "generative-palm": { - "documentationHref": 
"https://cloud.google.com/vertex-ai/docs/generative-ai/chat/test-chat-prompts", - "name": "Generative Search - Google PaLM" - }, - "qna-openai": { - "documentationHref": "https://platform.openai.com/docs/api-reference/completions", - "name": "OpenAI Question & Answering Module" - }, - "ref2vec-centroid": {}, - "reranker-cohere": { - "documentationHref": "https://txt.cohere.com/rerank/", - "name": "Reranker - Cohere" - }, - "text2vec-aws": { - "documentationHref": "https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings", - "name": "AWS Module" - }, - "text2vec-cohere": { - "documentationHref": "https://docs.cohere.ai/embedding-wiki/", - "name": "Cohere Module" - }, - "text2vec-huggingface": { - "documentationHref": "https://huggingface.co/docs/api-inference/detailed_parameters#feature-extraction-task", - "name": "Hugging Face Module" - }, - "text2vec-jinaai": { - "documentationHref": "https://jina.ai/embeddings/", - "name": "JinaAI Module" - }, - "text2vec-openai": { - "documentationHref": "https://platform.openai.com/docs/guides/embeddings/what-are-embeddings", - "name": "OpenAI Module" - }, - "text2vec-palm": { - "documentationHref": "https://cloud.google.com/vertex-ai/docs/generative-ai/embeddings/get-text-embeddings", - "name": "Google PaLM Module" - } - }, - "version": "1.23.8" -} -# END OutputGetMeta -""" - - -client.close() - - -# TryFinallyCloseDemo -import weaviate -import os - -# END TryFinallyCloseDemo - -from weaviate.classes.init import Auth - -client = weaviate.connect_to_weaviate_cloud( - cluster_url=os.getenv("WEAVIATE_URL"), # Replace with your WCD URL - auth_credentials=Auth.api_key( - os.getenv("WEAVIATE_API_KEY") - ), # Replace with your WCD key -) - -# TryFinallyCloseDemo -# Instantiate your client (not shown). e.g.: -# client = weaviate.connect_to_weaviate_cloud(...) or -# client = weaviate.connect_to_local(...) 
- -try: - # Work with the client here - e.g.: - assert client.is_live() - pass - -finally: # This will always be executed, even if an exception is raised - client.close() # Close the connection & release resources -# END TryFinallyCloseDemo diff --git a/docs/academy/py/starter_text_data/_snippets/102_collection.py b/docs/academy/py/starter_text_data/_snippets/102_collection.py deleted file mode 100644 index 77666c42f..000000000 --- a/docs/academy/py/starter_text_data/_snippets/102_collection.py +++ /dev/null @@ -1,128 +0,0 @@ -# CreateMovieCollection -import weaviate - -# CreateMovieCollection # SubmoduleImport -import weaviate.classes.config as wc -import os - -# CreateMovieCollection # END SubmoduleImport - -# END CreateMovieCollection - -from weaviate.classes.init import Auth - -client = weaviate.connect_to_weaviate_cloud( - cluster_url=os.getenv("WEAVIATE_URL"), # Replace with your WCD URL - auth_credentials=Auth.api_key( - os.getenv("WEAVIATE_API_KEY") - ), # Replace with your WCD key -) - -# CreateMovieCollection -# Instantiate your client (not shown). 
e.g.: -# headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")} # Replace with your OpenAI API key -# client = weaviate.connect_to_weaviate_cloud(..., headers=headers) or -# client = weaviate.connect_to_local(..., headers=headers) - -# END CreateMovieCollection - -# Actual instantiation - -client.collections.delete("Movie") - -# CreateMovieCollection -client.collections.create( - name="Movie", - properties=[ - wc.Property(name="title", data_type=wc.DataType.TEXT), - wc.Property(name="overview", data_type=wc.DataType.TEXT), - wc.Property(name="vote_average", data_type=wc.DataType.NUMBER), - wc.Property(name="genre_ids", data_type=wc.DataType.INT_ARRAY), - wc.Property(name="release_date", data_type=wc.DataType.DATE), - wc.Property(name="tmdb_id", data_type=wc.DataType.INT), - ], - # Define the vectorizer module - vector_config=wc.Configure.Vectors.text2vec_openai(), - # Define the generative module - generative_config=wc.Configure.Generative.openai() - # END generativeDefinition # CreateMovieCollection -) - -client.close() -# END CreateMovieCollection - - -# BatchImportData -import weaviate -import pandas as pd -import requests -from datetime import datetime, timezone -import json -from weaviate.util import generate_uuid5 -from tqdm import tqdm -import os - -# END BatchImportData - -from weaviate.classes.init import Auth - -headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")} - -client = weaviate.connect_to_weaviate_cloud( - cluster_url=os.getenv("WEAVIATE_URL"), # Replace with your WCD URL - auth_credentials=Auth.api_key( - os.getenv("WEAVIATE_API_KEY") - ), # Replace with your WCD key - headers=headers, -) - -# BatchImportData -# Instantiate your client (not shown). e.g.: -# client = weaviate.connect_to_weaviate_cloud(...) or -# client = weaviate.connect_to_local(...) 
- -# END BatchImportData - -# BatchImportData -data_url = "https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/movies_data_1990_2024.json" -resp = requests.get(data_url) -df = pd.DataFrame(resp.json()) - -# Get the collection -movies = client.collections.use("Movie") - -# Enter context manager -with movies.batch.fixed_size(batch_size=200) as batch: - # Loop through the data - for i, movie in tqdm(df.iterrows()): - # Convert data types - # Convert a JSON date to `datetime` and add time zone information - release_date = datetime.strptime(movie["release_date"], "%Y-%m-%d").replace( - tzinfo=timezone.utc - ) - # Convert a JSON array to a list of integers - genre_ids = json.loads(movie["genre_ids"]) - - # Build the object payload - movie_obj = { - "title": movie["title"], - "overview": movie["overview"], - "vote_average": movie["vote_average"], - "genre_ids": genre_ids, - "release_date": release_date, - "tmdb_id": movie["id"], - } - - # Add object to batch queue - batch.add_object( - properties=movie_obj, - uuid=generate_uuid5(movie["id"]) - # references=reference_obj # You can add references here - ) - # Batcher automatically sends batches - -# Check for failed objects -if len(movies.batch.failed_objects) > 0: - print(f"Failed to import {len(movies.batch.failed_objects)} objects") - -client.close() diff --git a/docs/academy/py/starter_text_data/_snippets/103_searches.py b/docs/academy/py/starter_text_data/_snippets/103_searches.py deleted file mode 100644 index 7a32088d8..000000000 --- a/docs/academy/py/starter_text_data/_snippets/103_searches.py +++ /dev/null @@ -1,139 +0,0 @@ -# START-ANY -import weaviate -import weaviate.classes.query as wq -import os - -# END-ANY - -# FilteredSemanticSearch -from datetime import datetime - -# END FilteredSemanticSearch - -# START-ANY - -# END-ANY - -from weaviate.classes.init import Auth - -headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")} - -client = weaviate.connect_to_weaviate_cloud( - 
cluster_url=os.getenv("WEAVIATE_URL"), # Replace with your WCD URL - auth_credentials=Auth.api_key( - os.getenv("WEAVIATE_API_KEY") - ), # Replace with your WCD key - headers=headers, -) - -# START-ANY -# Instantiate your client (not shown). e.g.: -# headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")} # Replace with your OpenAI API key -# client = weaviate.connect_to_weaviate_cloud(..., headers=headers) or -# client = weaviate.connect_to_local(..., headers=headers) - -# END-ANY - -# MetadataSemanticSearch -# Get the collection -movies = client.collections.use("Movie") - -# Perform query -response = movies.query.near_text( - query="dystopian future", limit=5, return_metadata=wq.MetadataQuery(distance=True) -) - -# Inspect the response -for o in response.objects: - print( - o.properties["title"], o.properties["release_date"].year - ) # Print the title and release year (note the release date is a datetime object) - print( - f"Distance to query: {o.metadata.distance:.3f}\n" - ) # Print the distance of the object from the query - -client.close() -# END MetadataSemanticSearch - - -print("\n\n") - -client.connect() - -# MetadataBM25Search -# Get the collection -movies = client.collections.use("Movie") - -# Perform query -response = movies.query.bm25( - query="history", limit=5, return_metadata=wq.MetadataQuery(score=True) -) - -# Inspect the response -for o in response.objects: - print( - o.properties["title"], o.properties["release_date"].year - ) # Print the title and release year (note the release date is a datetime object) - print( - f"BM25 score: {o.metadata.score:.3f}\n" - ) # Print the BM25 score of the object from the query - -client.close() -# END MetadataBM25Search - - -print("\n\n") - -client.connect() - -# MetadataHybridSearch -# Get the collection -movies = client.collections.use("Movie") - -# Perform query -response = movies.query.hybrid( - query="history", limit=5, return_metadata=wq.MetadataQuery(score=True) -) - -# Inspect the response -for o in 
response.objects: - print( - o.properties["title"], o.properties["release_date"].year - ) # Print the title and release year (note the release date is a datetime object) - print( - f"Hybrid score: {o.metadata.score:.3f}\n" - ) # Print the hybrid search score of the object from the query - -client.close() -# END MetadataHybridSearch - - -print("\n\n") - -client.connect() - -# FilteredSemanticSearch -# Get the collection -movies = client.collections.use("Movie") - -# Perform query -response = movies.query.near_text( - query="dystopian future", - limit=5, - return_metadata=wq.MetadataQuery(distance=True), - # highlight-start - filters=wq.Filter.by_property("release_date").greater_than(datetime(2020, 1, 1)) - # highlight-end -) - -# Inspect the response -for o in response.objects: - print( - o.properties["title"], o.properties["release_date"].year - ) # Print the title and release year (note the release date is a datetime object) - print( - f"Distance to query: {o.metadata.distance:.3f}\n" - ) # Print the distance of the object from the query - -client.close() -# END FilteredSemanticSearch diff --git a/docs/academy/py/starter_text_data/_snippets/104_rag.py b/docs/academy/py/starter_text_data/_snippets/104_rag.py deleted file mode 100644 index b31adf951..000000000 --- a/docs/academy/py/starter_text_data/_snippets/104_rag.py +++ /dev/null @@ -1,78 +0,0 @@ -# START-ANY -import os -import weaviate - -# END-ANY - -from weaviate.classes.init import Auth - -headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")} - -client = weaviate.connect_to_weaviate_cloud( - cluster_url=os.getenv("WEAVIATE_URL"), # Replace with your WCD URL - auth_credentials=Auth.api_key( - os.getenv("WEAVIATE_API_KEY") - ), # Replace with your WCD key - headers=headers, -) - -# START-ANY -# Instantiate your client (not shown). 
e.g.: -# headers = {"X-OpenAI-Api-Key": os.getenv("OPENAI_APIKEY")} # Replace with your OpenAI API key -# client = weaviate.connect_to_weaviate_cloud(..., headers=headers) or -# client = weaviate.connect_to_local(..., headers=headers) - -# END-ANY - -# SinglePromptGeneration -# Get the collection -movies = client.collections.use("Movie") - -# Perform query -response = movies.generate.near_text( - query="dystopian future", - limit=5, - # highlight-start - single_prompt="Translate this into French: {title}" - # highlight-end -) - -# Inspect the response -for o in response.objects: - # highlight-start - print(o.properties["title"]) # Print the title - # highlight-end - print(o.generated) # Print the generated text (the title, in French) - -client.close() -# END SinglePromptGeneration - - -print("\n\n") - -client.connect() - - -# GroupedTaskGeneration -# Get the collection -movies = client.collections.use("Movie") - -# Perform query -response = movies.generate.near_text( - query="dystopian future", - limit=5, - # highlight-start - grouped_task="What do these movies have in common?", - # grouped_properties=["title", "overview"] # Optional parameter; for reducing prompt length - # highlight-end -) - -# Inspect the response -for o in response.objects: - print(o.properties["title"]) # Print the title -# highlight-start -print(response.generative.text) # Print the generated text (the commonalities between them) -# highlight-end - -client.close() -# END GroupedTaskGeneration diff --git a/docs/academy/py/starter_text_data/index.md b/docs/academy/py/starter_text_data/index.md deleted file mode 100644 index 990fe8ec3..000000000 --- a/docs/academy/py/starter_text_data/index.md +++ /dev/null @@ -1,23 +0,0 @@ ---- -title: "101T Work with: Text data" -description: Get started with text data in Weaviate for robust search and analytics. 
-sidebar_position: 100 ---- - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; -import CourseUnits from '/src/components/Academy/courseUnits.jsx'; -import { courseData } from '/src/components/Academy/courseData.js' - -## Course overview - -In this project-based course, you will learn how to work with any text data using Weaviate and a movie dataset. - -You will get hands-on experience on how to store and index text data by meaning, using Weaviate's vectorization capabilities. You will learn how to search through that data using semantic, keyword and hybrid searches, as well as filters. You will also learn how to use Weaviate's retrieval augmented generation (RAG) capabilities to generate outputs based on the retrieved objects. - -## Learning objectives - - - -## Units - - diff --git a/docs/academy/py/tmp_images/academy_placeholder.jpg b/docs/academy/py/tmp_images/academy_placeholder.jpg deleted file mode 100644 index 49a360a8b..000000000 Binary files a/docs/academy/py/tmp_images/academy_placeholder.jpg and /dev/null differ diff --git a/docs/academy/py/tokenization/100_basics.mdx b/docs/academy/py/tokenization/100_basics.mdx deleted file mode 100644 index ae2efb8bf..000000000 --- a/docs/academy/py/tokenization/100_basics.mdx +++ /dev/null @@ -1,56 +0,0 @@ ---- -title: Overview of tokenization ---- - -Tokenization is the process of breaking text into smaller units, called tokens. This is an important step that impacts how text is processed in a variety of contexts. - -Consider text like: - -```text -Ankh-Morpork's police captain -``` - -This text could be tokenized in a variety of ways. All of the following are perfectly valid tokenizations: - -1. `["Ankh-Morpork's", "police", "captain"]` -1. `["ankh", "morpork", "police", "captain"]` -1. `['An', '##kh', '-', 'Mo', '##rp', '##or', '##k', "'", 's', 'police', 'captain']` - -Methods 1 and 2 are examples of *word tokenization*, while method 3 is an example of *subword tokenization*. 
- -The choice of tokenization method will depend on the context in which the text is being used. - -### For keyword search & filtering - -The choice of tokenization method will significantly impact the result of keyword search and filtering. This can cause it to either meet or miss the user's expectations. - -In a database of television shows, you would expect a search for `"Superman"`, or `"Clark"` to include the show `"Lois & Clark: The New Adventures of Superman"`. Selecting the right tokenization method will ensure that this is the case. - -But, in a database of email addresses, you would not expect a search for `"john@example.com"` to include `"john.doe@example.com"`. In this case, your tokenization strategy might be different to above. - -And what about all the cases in between? Should a search for `"clark"`, or `"lois and clark"` include the show? That might depend on how you want the search to behave. - -Because of varying needs like these, Weaviate allows you to configure the tokenization method to suit your use case. From the next section onwards, we will discuss the different tokenization methods available in Weaviate, and how to configure them. - -### For language models - -Language models digest and work with the overall meaning of the text to embed or produce text. So, each token for a language model is designed to represent meaning. - -To balance the need for a manageable vocabulary size with the need to capture the meaning of the text, subword tokenization (method 3 above) is often used. This is a key part of the architecture of language models. - -At a user level, however, the choice of tokenization method is abstracted away. Because the tokenization method must be consistent between model development (training) and usage (inference), it is baked into the model. - -This means that as you use Weaviate to vectorize text, or perform retrieval augmented generation (RAG) tasks, you don't need to worry about the tokenization method. 
The chosen model will simply take care of this for you. - -As a result, this course does not go into detail on tokenization in the context of language models. - -:::info Interested in tokenization for language models? -This is a rich area of study. So if you would like to read more about tokenization in the context of language models, this [Hugging Face conceptual guide on the topic](https://huggingface.co/transformers/tokenizer_summary.html) is a great resource. Hugging Face also provides this guide on [using tokenizers](https://huggingface.co/learn/nlp-course/en/chapter2/4) in its `transformers` library. -::: - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - diff --git a/docs/academy/py/tokenization/200_options.mdx b/docs/academy/py/tokenization/200_options.mdx deleted file mode 100644 index 559a75868..000000000 --- a/docs/academy/py/tokenization/200_options.mdx +++ /dev/null @@ -1,143 +0,0 @@ ---- -title: Available tokenization options ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - -Weaviate offers a variety of tokenization options to choose from. These options allow you to configure how keyword searches and filters are performed in Weaviate for each property. - -The main options are: - -- `word`: alphanumeric, lowercased tokens -- `lowercase`: lowercased tokens -- `whitespace`: whitespace-separated, case-sensitive tokens -- `field`: the entire value of the property is treated as a single token - -Let's explore each of these options in more detail, including how they work and when you might want to use them. - -## Tokenization methods - -### `word` - -The `word` tokenization method splits the text by any non-alphanumeric characters, and then lowercases each token. 
- -Here are some examples of how the `word` tokenization method works: - -| Text | Tokens | -| ---- | ------ | -| `"Why, hello there!"` | `["why", "hello", "there"]` | -| `"Lois & Clark: The New Adventures of Superman"` | `["lois", "clark", "the", "new", "adventures", "of", "superman"]` | -| `"variable_name"` | `["variable", "name"]` | -| `"Email: john.doe@example.com"` | `["email", "john", "doe", "example", "com"]` | - -#### When to use `word` tokenization - -The `word` tokenization is the default tokenization method in Weaviate. - -Generally, if you are searching or filtering "typical" text data, `word` tokenization is a good starting point. - -But if symbols (such as `&`, `@` or `_`) are important to your data and search, or distinguishing between different cases is important, you may want to consider using a different tokenization method such as `lowercase` or `whitespace`. - -### `lowercase` - -The `lowercase` tokenization method splits the text by whitespace, and then lowercases each token. - -Here are some examples of how the `lowercase` tokenization method works: - -| Text | Tokens | -| ---- | ------ | -| `"Why, hello there!"` | `["why,", "hello", "there!"]` | -| `"Lois & Clark: The New Adventures of Superman"` | `["lois", "&", "clark:", "the", "new", "adventures", "of", "superman"]` | -| `"variable_name"` | `["variable_name"]` | -| `"Email: john.doe@example.com"` | `["email:", "john.doe@example.com"]` | - -#### When to use `lowercase` tokenization - -The `lowercase` tokenization can be thought of as `word`, but including symbols. A key use case for `lowercase` is when symbols such as `&`, `@` or `_` are significant for your data. - -This might include cases where your database contains code snippets, email addresses, or any other symbolic notations with meaning. 
- -As an example, consider filtering for objects containing `"database_address"`: - -| Text | Tokenization | Matched by `"database_address"` | -| ---- | ------------ | ------- | -| `"database_address"` | `word` | ✅ | -| `"database_address"` | `lowercase` | ✅ | -| `"database_company_address"` | `word` | ✅ | -| `"database_company_address"` | `lowercase` | ❌ | - -Note how the filtering behavior changes. A careful choice of tokenization method can ensure that the search results meet your and the users' expectations. - -### `whitespace` - -The `whitespace` tokenization method splits the text by whitespace. - -Here are some examples of how the `whitespace` tokenization method works: - -| Text | Tokens | -| ---- | ------ | -| `"Why, hello there!"` | `["Why,", "hello", "there!"]` | -| `"Lois & Clark: The New Adventures of Superman"` | `["Lois", "&", "Clark:", "The", "New", "Adventures", "of", "Superman"]` | -| `"variable_name"` | `["variable_name"]` | -| `"Email: john.doe@example.com"` | `["Email:", "john.doe@example.com"]` | - -#### When to use `whitespace` tokenization - -The `whitespace` tokenization method adds case-sensitivity to `lowercase`. This is useful when your data distinguishes between cases, such as for names of entities or acronyms. - -A risk of using `whitespace` tokenization is that it can be too strict. For example, a search for `"superman"` will not match `"Superman"`, as the tokens are case-sensitive. - -But this could be managed on a case-by-case basis. It would be possible to construct queries that are case-insensitive, such as by having the query create two versions of the search term: one in lowercase and one in uppercase. - -On the other hand, it will not be possible to construct case-sensitive queries using `word` or `lowercase` tokenization. - -### `field` - -The `field` tokenization method simply treats the entire value of the property as a single token. 
- -Here are some examples of how the `field` tokenization method works: - -| Text | Tokens | -| ---- | ------ | -| `"Why, hello there!"` | `["Why, hello there!"]` | -| `"Lois & Clark: The New Adventures of Superman"` | `["Lois & Clark: The New Adventures of Superman"]` | -| `"variable_name"` | `["variable_name"]` | -| `"Email: john.doe@example.com"` | `["Email: john.doe@example.com"]` | - -#### When to use `field` tokenization - -The `field` tokenization is useful when exact matches of strings in the exact order are important. Typically, this is useful for properties that contain unique identifiers, such as email addresses, URLs, or other unique strings. - -Generally, `field` tokenization should be used judiciously due to its strictness. - -For keyword searches, `field` tokenization has limited use. A keyword search for `"computer mouse"` will not match `"mouse for a computer"`, nor will it match `"computer mouse pad"` or even `"a computer mouse"`. - -## Stop words - -Weaviate supports [stop words](https://en.wikipedia.org/wiki/Stop_word). Stop words are common words which are often filtered out from search queries because they occur frequently and do not carry much meaning. - -By default, Weaviate uses a [list of English stop words](https://github.com/weaviate/weaviate/blob/main/adapters/repos/db/inverted/stopwords/presets.go). You can [configure your own list of stop words](../../../weaviate/config-refs/indexing/inverted-index.mdx#stopwords) in the schema definition. - -This means that after tokenization, any stop words in the text behave as if they were not present. For example, a filter for `"a computer mouse"` will behave identically to a filter for `"computer mouse"`. - -## Language-specific tokenization - -The above tokenization methods work well for English, or other languages that use spaces to separate words. - -However, not all languages rely on spaces to define natural semantic boundaries. 
For languages like Japanese, Chinese or Korean, where words are not separated by spaces, you may need to use a different tokenization method. - -Weaviate provides `gse` and `trigram` (from `v1.24`) and `kagome_kr` (from `v1.25.7`) tokenization methods for this reason. - -`gse` implements the "Jieba" algorithm, which is a popular Chinese text segmentation algorithm. `trigram` splits text into all possible trigrams, which can be useful for languages like Japanese. - -`kagome_ja` uses the [`Kagome` tokenizer](https://github.com/ikawaha/kagome?tab=readme-ov-file) with a Japanese [MeCab IPA](https://github.com/ikawaha/kagome-dict/) dictionary to split Japanese property text. - -`kagome_kr` uses the [`Kagome` tokenizer](https://github.com/ikawaha/kagome?tab=readme-ov-file) with a Korean MeCab ([mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic/src/master/)) dictionary to split Korean property text. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - diff --git a/docs/academy/py/tokenization/300_filters.mdx b/docs/academy/py/tokenization/300_filters.mdx deleted file mode 100644 index 53ef8564c..000000000 --- a/docs/academy/py/tokenization/300_filters.mdx +++ /dev/null @@ -1,220 +0,0 @@ ---- -title: Tokenization and filters ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCreateCollection from '!!raw-loader!./_snippets/310_create_collection.py'; -import PyAddObjects from '!!raw-loader!./_snippets/315_add_objects.py'; -import PyFilters from '!!raw-loader!./_snippets/320_filters.py'; - -Now that you've learned about different tokenization methods, let's put them into practice. In this section, you'll see how tokenization impacts filters. - -## Preparation - -For this section, we'll work with an actual Weaviate instance to see how different tokenization methods impact filtering results. 
- -We are going to use a very small, custom dataset for demonstration purposes. - - - -To follow along, you can use the following Python code to add this data to your Weaviate instance. - -
- Steps to create a collection - -We will create a simple object collection, with each object containing multiple properties. Each properties will contain the same text, but with different tokenization methods applied. - - - -Note that we do not add object vectors in this case, as we are only interested in the impact of tokenization on filters (and keyword searches). - -
- -
- Steps to add objects - -Now, we add objects to the collection, repeating text objects as properties. - - - -
- - -## Impact on filters - -Now that we have added a set of objects to Weaviate, let's see how different tokenization methods impact filtered retrieval. - -Each filtered query will look something like this, wherein we filter the objects for a set of query strings. - -We'll set up a reusable function to filter objects based on a set of query strings. Remember that a filter is binary: it either matches or it doesn't. - -The function will return a list of matched objects, and print the matched objects for us to see. - - - -### "**Clark:** "vs "**clark**" - messy text - -Typical text is often messy, with punctuations, mixed cases and other irregularities. Take a look at this example, where we filter for various combinations of substrings from the TV show title `"Lois & Clark: The New Adventures of Superman"`. - -The table shows whether the query matched the title: - -| | `word` | `lowercase` | `whitespace` | `field` | -|---------------|--------|-------------|--------------|---------| -| `"clark"` | ✅ | ❌ | ❌ | ❌ | -| `"Clark"` | ✅ | ❌ | ❌ | ❌ | -| `"clark:" ` | ✅ | ✅ | ❌ | ❌ | -| `"Clark:" ` | ✅ | ✅ | ✅ | ❌ | -| `"lois clark"` | ✅ | ❌ | ❌ | ❌ | -| `"clark lois"` | ✅ | ❌ | ❌ | ❌ | - -
- Python query & output - - - - - -
- -Note how the `word` tokenization was the only that consistently returned the matching title, unless the colon (`:`) was included in the query. This is because the `word` tokenization method treats the colon as a separator. - -Users may not be expected to include any punctuation in their queries, nor the exact capitalization. As a result, for a typical text filter usage, the `word` tokenization method is a good starting point. - -### "**A mouse**" vs "**mouse**" - stop words - -Here, we filter for variants of the phrase "computer mouse", where some queries include additional words. - -Now, take a look at the results. - -**Matches for `"computer mouse"`** - -| | `word` | `lowercase` | `whitespace` | `field` | -|------------------------------|-----------|-------------|--------------|---------| -| `"computer mouse"` | ✅ | ✅ | ✅ | ✅ | -| `"a computer mouse"` | ✅ | ✅ | ✅ | ❌ | -| `"the computer mouse:" ` | ✅ | ✅ | ✅ | ❌ | -| `"blue computer mouse" ` | ❌ | ❌ | ❌ | ❌ | - -**Matches for `"a computer mouse"`** - -| | `word` | `lowercase` | `whitespace` | `field` | -|------------------------------|-----------|-------------|--------------|---------| -| `"computer mouse"` | ✅ | ✅ | ✅ | ❌ | -| `"a computer mouse"` | ✅ | ✅ | ✅ | ✅ | -| `"the computer mouse:" ` | ✅ | ✅ | ✅ | ❌ | -| `"blue computer mouse" ` | ❌ | ❌ | ❌ | ❌ | - -
- Python query & output - - - - - -
- -The results indicate that adding the word "a" or "the" to the query does not impact the filter results for all methods except `field`. This is because at every tokenization method, the word "a" or "the" is considered a stop word and is ignored. - -With the `field` method, the difference is that stop word tokens like "a" or "the" are never produced. An input "a computer mouse" is tokenized to `["a computer mouse"]`, containing one token. - -Adding another word, such as "blue", that is not a stop word, causes the query to not match any objects. - -### "**variable_name**" vs "**variable name**" - symbols - -The `word` tokenization is a good default. However, it may not always be the best choice. Take a look at this example where we filter for different variants of `"variable_name"`, to see if they match the object with the exact string (`"variable_name"`). - -| | `word` | `lowercase` | `whitespace` | `field` | -|------------------------------|-----------|-------------|--------------|---------| -| `"variable_name"` | ✅ | ✅ | ✅ | ✅ | -| `"Variable_Name:" ` | ✅ | ✅ | ❌ | ❌ | -| `"Variable Name:" ` | ✅ | ❌ | ❌ | ❌ | -| `"a_variable_name"` | ✅ | ❌ | ❌ | ❌ | -| `"the_variable_name"` | ✅ | ❌ | ❌ | ❌ | -| `"variable_new_name" ` | ✅ | ❌ | ❌ | ❌ | - -
- Python query & output - - - - - -
- -What is the desired behavior here? Should a filter for `"variable name"` match the object with the property `"variable_name"`? - -What about a filter for `"variable_new_name"`? If the goal is to look through, say, a code base, the user might not expect a filter for `"variable_new_name"` to match `"variable_name"`. - -In cases such as these, where symbols are important to your data, you should consider using a tokenization method that preserves symbols, such as `lowercase` or `whitespace`. - -## Discussions - -We've discussed how different tokenization methods impact filters. - -For most filtering use, the `word` tokenization method is a good starting point. It is case-insensitive, and treats most symbols as separators. - -However, if symbols are important to your data, or if you need to distinguish between different cases, you may want to consider using a different tokenization method. - -And what about `field` tokenization? This method is most useful when you have text that should be treated as a single token. This is useful for properties like email addresses, URLs, or identifiers. - -A typical filtering strategy with a `field` tokenization method might involve exact matches, or partial matches with wildcards. Do note, however, that wildcard-based filters can be computationally expensive (slow) - so use them judiciously. - -Next, we'll discuss how tokenization impacts keyword searches. 
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/tokenization/400_searches.mdx b/docs/academy/py/tokenization/400_searches.mdx deleted file mode 100644 index 07838bbe0..000000000 --- a/docs/academy/py/tokenization/400_searches.mdx +++ /dev/null @@ -1,195 +0,0 @@ ---- -title: Tokenization and searches ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PySearches from '!!raw-loader!./_snippets/400_searches.py'; - -You saw how [tokenization affects filters](./300_filters.mdx). They impact keyword searches in a similar, but not identical, way. In this section, we'll see how different tokenization methods impact search results. - -:::info What about hybrid searches? -A hybrid search combines results from a keyword search and a vector search. Accordingly, tokenization impacts the keyword search part of a hybrid search, while the vector search part is not impacted by tokenization. - -We will not separately discuss hybrid searches in this course. However, the impact on keyword searches discussed here will apply to the keyword search part of a hybrid search. -::: - -## Impact on keyword searches - -### How tokenization impacts keyword searches - -We will use a similar method as in the previous section, with a difference being that we will now perform a keyword search instead of a filter. - -A keyword search ranks results using the [BM25f algorithm](https://en.wikipedia.org/wiki/Okapi_BM25). As a result, the impact of tokenization on keyword searches is twofold. - -Firstly, tokenization will determine whether a result is included in the search results at all. If none of the tokens in the search query match any tokens in the object, the object will not be included in the search results. - -Secondly, tokenization will impact the ranking of the search results. 
The BM25f algorithm takes into account the number of matching tokens, and the tokenization method will determine which tokens are considered matching. - -### Search setup - -Each keyword query will look something like this. - -We'll set up a reusable function to perform keyword searches, and display the top results along with their scores. - - - -### Examples - -#### "**Clark:** "vs "**clark**" - messy text - -Keyword searches are similarly impacted by tokenization as filters. However, there are subtle differences. - -Take a look at this example, where we search for various combinations of substrings from the TV show title `"Lois & Clark: The New Adventures of Superman"`. - -The table shows whether the query matched the title, and the score: - -| | `word` | `lowercase` | `whitespace` | `field` | -|---------------|--------|-------------|--------------|---------| -| `"clark"` | 0.613 | ❌ | ❌ | ❌ | -| `"Clark"` | 0.613 | ❌ | ❌ | ❌ | -| `"clark:" ` | 0.613 | 0.48 | ❌ | ❌ | -| `"Clark:" ` | 0.613 | 0.48 | 0.48 | ❌ | -| `"lois clark"` | 1.226 | 0.48 | ❌ | ❌ | -| `"clark lois"` | 1.226 | 0.48 | ❌ | ❌ | - -
- Python query & output - - - - - -
- -Here, the same results are returned as in the filter example. However, note differences in the scores. - -For example, a search for `"lois clark"` returns a higher score than a search for `"clark"`. This is because the BM25f algorithm considers the number of matching tokens. So, it would be beneficial to include as many matching tokens as possible in the search query. - -Another difference is that a keyword search will return objects that match any of the tokens in the query. This is different from a filter, which is sensitive to the filtering operator. Depending on the desired result, you could use an `"Equal"` operator, `"ContainsAny"`, or `"ContainsAll"`, for example. - -The next section will demonstrate this, as well as how stop words are treated. - -#### "**A mouse**" vs "**mouse**" - stop words - -Here, we search for variants of the phrase "computer mouse", where some queries include additional words. - -Now, take a look at the results. - -**Matches for `"computer mouse"`** - -| | `word` | `lowercase` | `whitespace` | `field` | -|------------------------------|-----------|-------------|--------------|---------| -| `"computer mouse"` | 0.889 | 0.819 | 1.01 | 0.982 | -| `"Computer Mouse"` | 0.889 | 0.819 | ❌ | ❌ | -| `"a computer mouse"` | 0.764 | 0.764 | 0.849 | ❌ | -| `"computer mouse pad" ` | 0.764 | 0.764 | 0.849 | ❌ | - -**Matches for `"a computer mouse"`** - -| | `word` | `lowercase` | `whitespace` | `field` | -|------------------------------|-----------|-------------|--------------|---------| -| `"computer mouse"` | 0.889 | 0.819 | 1.01 | ❌ | -| `"Computer Mouse"` | 0.889 | 0.819 | ❌ | ❌ | -| `"a computer mouse"` | 0.764 | 1.552 | 1.712 | 0.982 | -| `"computer mouse pad" ` | 0.764 | 0.688 | 0.849 | ❌ | - -
- Python query & output - - - - - -
- -The results here are similar to the filter example, but more nuanced and quite interesting! - -Under `word` tokenization, the search for `computer mouse` produces identical results to the search for `a computer mouse`. This is because the stop word `a` is not considered in the search. - -But note that the scores are different for returned objects where the only differences are stopwords, such as `"computer mouse"` and `"a computer mouse"`. This is because the BM25f algorithm does [index stopwords](../../../weaviate/config-refs/indexing/inverted-index.mdx#stopwords), and they do impact the score. - -As a user, you should keep this in mind, and you can configure the stop words in the collection definition to suit your desired behavior. - -Another interesting note is that the `lowercase` and `whitespace` tokenization methods do not remove stop words in the query. - -This behavior allows users who want to include stop words in their search queries to do so. - -#### "**variable_name**" vs "**variable name**" - symbols - -The table below shows keyword search results using the string `"variable_name"` and the resulting scores. - -| | `word` | `lowercase` | `whitespace` | `field` | -|------------------------------|-----------|-------------|--------------|---------| -| `"variable_name"` | 0.716 | 0.97 | 1.27 | 0.982 | -| `"Variable_Name:" ` | 0.716 | 0.97 | ❌ | ❌ | -| `"Variable Name:" ` | 0.716 | ❌ | ❌ | ❌ | -| `"a_variable_name"` | 0.615 | ❌ | ❌ | ❌ | -| `"the_variable_name"` | 0.615 | ❌ | ❌ | ❌ | -| `"variable_new_name" ` | 0.615 | ❌ | ❌ | ❌ | - -
- Python query & output - - - - - -
- -These results are once again similar to the filter example. If your data contains symbols that are important to your search, you should consider using a tokenization method that preserves symbols, such as `lowercase` or `whitespace`. - -### Discussions - -That's it for keyword searches and tokenization. Similarly to filters, the choice of tokenization method is a big part of your overall search strategy. - -Our generally advice for tokenization in keyword searching is similar to [our advice for filtering](./300_filters.mdx#-discussions). Start with `word`, and consider others such as `lowercase` or `whitespace` if symbols, or cases encode important information in your data. - -Using `field` tokenization may be too strict for keyword searches, as it will not match any -objects that do not contain the exact string in the exact order. - -Lastly, keep in mind that keyword searches produce ranked results. Therefore, tokenization will not only affect the results set but also their ranking within the set. - -With these considerations in mind, you can configure your tokenization strategy to best suit your data and your users' needs. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/tokenization/900_next_steps.mdx b/docs/academy/py/tokenization/900_next_steps.mdx deleted file mode 100644 index 0ad72fd08..000000000 --- a/docs/academy/py/tokenization/900_next_steps.mdx +++ /dev/null @@ -1,30 +0,0 @@ ---- -title: Next steps ---- - -Congratulations! You have completed this course on tokenization for the inverted index. We hope you found it helpful and informative. - -There are many more resources available to help you continue your learning journey. 
- -## Documentation - -- [Refereces: Configuration: Tokenization](/weaviate/config-refs/collections.mdx#tokenization) -- [Refereces: Configuration: Stopwords](/weaviate/config-refs/indexing/inverted-index.mdx#stopwords) -- [Concepts: Inverted index](/weaviate/concepts/indexing/inverted-index.md) -- [Concepts: Filtering](/weaviate/concepts/filtering.md) - -:::note -As a reminder, for non-English texts, especially those which do not rely on spaces between words, try the `trigram` or `gse` tokenization methods which were added in Weaviate `v1.24` for such cases. -::: - -import CTASocials from '../_snippets/cta_socials.mdx'; - - - -See you soon! 👋 - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/tokenization/_snippets/310_create_collection.py b/docs/academy/py/tokenization/_snippets/310_create_collection.py deleted file mode 100644 index d5d616271..000000000 --- a/docs/academy/py/tokenization/_snippets/310_create_collection.py +++ /dev/null @@ -1,44 +0,0 @@ -# CreateDemoCollection -import weaviate -from weaviate.classes.config import Property, DataType, Tokenization, Configure - -# END CreateDemoCollection - -client = weaviate.connect_to_local() - -# CreateDemoCollection -# Instantiate your client (not shown). e.g.: -# client = weaviate.connect_to_weaviate_cloud(...) 
or -# client = weaviate.connect_to_local() - -# END CreateDemoCollection - -client.collections.delete("TokenizationDemo") - -# CreateDemoCollection -tkn_options = [ - Tokenization.WORD, - Tokenization.LOWERCASE, - Tokenization.WHITESPACE, - Tokenization.FIELD, -] - -# Create a property for each tokenization option -properties = [] -for tokenization in tkn_options: - prop = Property( - name=f"text_{tokenization.replace('.', '_')}", - data_type=DataType.TEXT, - tokenization=tokenization - ) - properties.append(prop) - - -client.collections.create( - name="TokenizationDemo", - properties=properties, - vector_config=Configure.Vectors.self_provided() -) - -client.close() -# END CreateDemoCollection diff --git a/docs/academy/py/tokenization/_snippets/315_add_objects.py b/docs/academy/py/tokenization/_snippets/315_add_objects.py deleted file mode 100644 index a1da9155d..000000000 --- a/docs/academy/py/tokenization/_snippets/315_add_objects.py +++ /dev/null @@ -1,55 +0,0 @@ -# AddObjects -import weaviate - -# END AddObjects - -client = weaviate.connect_to_local() - -# AddObjects -# Instantiate your client (not shown). e.g.: -# client = weaviate.connect_to_weaviate_cloud(...) 
or -# client = weaviate.connect_to_local() - -# END AddObjects - -# AddObjects # StringsToAdd -collection = client.collections.use("TokenizationDemo") - -# END AddObjects # END StringsToAdd -# AddObjects -# Get property names -property_names = [p.name for p in collection.config.get().properties] - -# AddObjects # StringsToAdd -phrases = [ - # string with special characters - "Lois & Clark: The New Adventures of Superman", - - # strings with stopwords & varying orders - "computer mouse", - "Computer Mouse", - "mouse computer", - "computer mouse pad", - "a computer mouse", - - # strings without spaces - "variable_name", - "Variable_Name", - "Variable Name", - "a_variable_name", - "the_variable_name", - "variable_new_name", -] -# END AddObjects # END StringsToAdd - -# AddObjects - -for phrase in phrases: - obj_properties = {} - for property_name in property_names: - obj_properties[property_name] = phrase - print(obj_properties) - collection.data.insert(properties=obj_properties) - -client.close() -# END AddObjects diff --git a/docs/academy/py/tokenization/_snippets/320_filters.py b/docs/academy/py/tokenization/_snippets/320_filters.py deleted file mode 100644 index f07a67656..000000000 --- a/docs/academy/py/tokenization/_snippets/320_filters.py +++ /dev/null @@ -1,210 +0,0 @@ -# FilterExampleBasic -import weaviate -from weaviate.classes.query import Filter -from weaviate.collections import Collection -from typing import List - -# END FilterExampleBasic - -client = weaviate.connect_to_local() - -# FilterExampleBasic -# Instantiate your client (not shown). e.g.: -# client = weaviate.connect_to_weaviate_cloud(...) 
or -# client = weaviate.connect_to_local() - -# END FilterExampleBasic - -# FilterExampleBasic -collection = client.collections.use("TokenizationDemo") - -# END FilterExampleBasic - - -# FilterExampleBasic -# Get property names -property_names = list() -for p in collection.config.get().properties: - property_names.append(p.name) - -query_strings = [""] - - -def filter_demo(collection: Collection, property_names: List[str], query_strings: List[str]): - from weaviate.classes.query import Filter - - for query_string in query_strings: - print("\n" + "=" * 40 + f"\nHits for: '{query_string}'" + "\n" + "=" * 40) - for property_name in property_names: - # highlight-start - response = collection.query.fetch_objects( - filters=Filter.by_property(property_name).equal(query_string), - ) - # highlight-end - if len(response.objects) > 0: - print(f">> '{property_name}' matches") - for obj in response.objects: - print(obj.properties[property_name]) - - -filter_demo(collection, property_names, query_strings) -# END FilterExampleBasic - -client.connect() - -# ClarkExample -filter_demo(collection, property_names, ["clark", "Clark", "clark:", "Clark:", "lois clark", "clark lois"]) -# END ClarkExample - -""" -# ClarkResults -======================================== -Hits for: 'clark' -======================================== ->> 'text_word' matches -Lois & Clark: The New Adventures of Superman - -======================================== -Hits for: 'Clark' -======================================== ->> 'text_word' matches -Lois & Clark: The New Adventures of Superman - -======================================== -Hits for: 'clark:' -======================================== ->> 'text_word' matches -Lois & Clark: The New Adventures of Superman ->> 'text_lowercase' matches -Lois & Clark: The New Adventures of Superman - -======================================== -Hits for: 'Clark:' -======================================== ->> 'text_word' matches -Lois & Clark: The New Adventures of Superman 
->> 'text_lowercase' matches -Lois & Clark: The New Adventures of Superman ->> 'text_whitespace' matches -Lois & Clark: The New Adventures of Superman - -======================================== -Hits for: 'lois clark' -======================================== ->> 'text_word' matches -Lois & Clark: The New Adventures of Superman - -======================================== -Hits for: 'clark lois' -======================================== ->> 'text_word' matches -Lois & Clark: The New Adventures of Superman -# END ClarkResults -""" - -# MouseExample -filter_demo(collection, property_names, ["computer mouse", "a computer mouse", "the computer mouse", "blue computer mouse"]) -# END MouseExample - -""" -# MouseResults -======================================== -Hits for: 'computer mouse' -======================================== ->> 'text_word' matches -computer mouse -Computer Mouse -mouse computer -computer mouse pad -a computer mouse ->> 'text_lowercase' matches -computer mouse -Computer Mouse -mouse computer -computer mouse pad -a computer mouse ->> 'text_whitespace' matches -computer mouse -mouse computer -computer mouse pad -a computer mouse ->> 'text_field' matches -computer mouse - -======================================== -Hits for: 'a computer mouse' -======================================== ->> 'text_word' matches -computer mouse -Computer Mouse -mouse computer -computer mouse pad -a computer mouse ->> 'text_lowercase' matches -computer mouse -Computer Mouse -mouse computer -computer mouse pad -a computer mouse ->> 'text_whitespace' matches -computer mouse -mouse computer -computer mouse pad -a computer mouse ->> 'text_field' matches -a computer mouse - -======================================== -Hits for: 'the computer mouse' -======================================== ->> 'text_word' matches -computer mouse -Computer Mouse -mouse computer -computer mouse pad -a computer mouse ->> 'text_lowercase' matches -computer mouse -Computer Mouse -mouse computer -computer 
mouse pad -a computer mouse ->> 'text_whitespace' matches -computer mouse -mouse computer -computer mouse pad -a computer mouse - -======================================== -Hits for: 'blue computer mouse' -======================================== -# END MouseResults -""" - -# UnderscoreExample -filter_demo(collection, property_names, ["variable_name"]) -# END UnderscoreExample - -""" -# UnderscoreResults -======================================== -Hits for: 'variable_name' -======================================== ->> 'text_word' matches -variable_name -Variable_Name -Variable Name -a_variable_name -the_variable_name -variable_new_name ->> 'text_lowercase' matches -variable_name -Variable_Name ->> 'text_whitespace' matches -variable_name ->> 'text_field' matches -variable_name -# END UnderscoreResults -""" - -client.close() diff --git a/docs/academy/py/tokenization/_snippets/400_searches.py b/docs/academy/py/tokenization/_snippets/400_searches.py deleted file mode 100644 index bb3d1ac7d..000000000 --- a/docs/academy/py/tokenization/_snippets/400_searches.py +++ /dev/null @@ -1,234 +0,0 @@ -# FilterExampleBasic -import weaviate -from weaviate.classes.query import MetadataQuery -from weaviate.collections import Collection -from typing import List - -# END FilterExampleBasic - -client = weaviate.connect_to_local() - -# FilterExampleBasic -# Instantiate your client (not shown). e.g.: -# client = weaviate.connect_to_weaviate_cloud(...) 
or -# client = weaviate.connect_to_local() - -# END FilterExampleBasic - -# FilterExampleBasic -collection = client.collections.use("TokenizationDemo") - -# END FilterExampleBasic - - -# FilterExampleBasic -# Get property names -property_names = list() -for p in collection.config.get().properties: - property_names.append(p.name) - -query_strings = [""] - - -def search_demo(collection: Collection, property_names: List[str], query_strings: List[str]): - from weaviate.classes.query import MetadataQuery - - for query_string in query_strings: - print("\n" + "=" * 40 + f"\nBM25 search results for: '{query_string}'" + "\n" + "=" * 40) - for property_name in property_names: - # highlight-start - response = collection.query.bm25( - query=query_string, - return_metadata=MetadataQuery(score=True), - query_properties=[property_name] - ) - # highlight-end - if len(response.objects) > 0: - print(f">> '{property_name}' search results") - for obj in response.objects: - print(obj.properties[property_name], round(obj.metadata.score, 3)) - - -search_demo(collection, property_names, query_strings) -# END FilterExampleBasic - -client.connect() - -# ClarkExample -search_demo(collection, property_names, ["clark", "Clark", "clark:", "Clark:", "lois clark", "clark lois"]) -# END ClarkExample - -""" -# ClarkResults -======================================== -BM25 search results for: 'clark' -======================================== ->> 'text_word' search results -Lois & Clark: The New Adventures of Superman 0.613 - -======================================== -BM25 search results for: 'Clark' -======================================== ->> 'text_word' search results -Lois & Clark: The New Adventures of Superman 0.613 - -======================================== -BM25 search results for: 'clark:' -======================================== ->> 'text_word' search results -Lois & Clark: The New Adventures of Superman 0.613 ->> 'text_lowercase' search results -Lois & Clark: The New Adventures of 
Superman 0.48 - -======================================== -BM25 search results for: 'Clark:' -======================================== ->> 'text_word' search results -Lois & Clark: The New Adventures of Superman 0.613 ->> 'text_lowercase' search results -Lois & Clark: The New Adventures of Superman 0.48 ->> 'text_whitespace' search results -Lois & Clark: The New Adventures of Superman 0.48 - -======================================== -BM25 search results for: 'lois clark' -======================================== ->> 'text_word' search results -Lois & Clark: The New Adventures of Superman 1.226 ->> 'text_lowercase' search results -Lois & Clark: The New Adventures of Superman 0.48 - -======================================== -BM25 search results for: 'clark lois' -======================================== ->> 'text_word' search results -Lois & Clark: The New Adventures of Superman 1.226 ->> 'text_lowercase' search results -Lois & Clark: The New Adventures of Superman 0.48 -# END ClarkResults -""" - -# MouseExample -search_demo(collection, property_names, ["computer mouse", "a computer mouse", "the computer mouse", "blue computer mouse"]) -# END MouseExample - -""" -# MouseResults -======================================== -BM25 search results for: 'computer mouse' -======================================== ->> 'text_word' search results -mouse computer 0.889 -Computer Mouse 0.889 -computer mouse 0.889 -a computer mouse 0.764 -computer mouse pad 0.764 ->> 'text_lowercase' search results -mouse computer 0.819 -Computer Mouse 0.819 -computer mouse 0.819 -a computer mouse 0.688 -computer mouse pad 0.688 ->> 'text_whitespace' search results -mouse computer 1.01 -computer mouse 1.01 -a computer mouse 0.849 -computer mouse pad 0.849 ->> 'text_field' search results -computer mouse 0.982 - -======================================== -BM25 search results for: 'a computer mouse' -======================================== ->> 'text_word' search results -mouse computer 0.889 -Computer 
Mouse 0.889 -computer mouse 0.889 -a computer mouse 0.764 -computer mouse pad 0.764 ->> 'text_lowercase' search results -a computer mouse 1.552 -mouse computer 0.819 -Computer Mouse 0.819 -computer mouse 0.819 -computer mouse pad 0.688 ->> 'text_whitespace' search results -a computer mouse 1.712 -mouse computer 1.01 -computer mouse 1.01 -computer mouse pad 0.849 ->> 'text_field' search results -a computer mouse 0.982 - -======================================== -BM25 search results for: 'the computer mouse' -======================================== ->> 'text_word' search results -mouse computer 0.889 -Computer Mouse 0.889 -computer mouse 0.889 -a computer mouse 0.764 -computer mouse pad 0.764 ->> 'text_lowercase' search results -mouse computer 0.819 -Computer Mouse 0.819 -computer mouse 0.819 -a computer mouse 0.688 -computer mouse pad 0.688 -Lois & Clark: The New Adventures of Superman 0.48 ->> 'text_whitespace' search results -mouse computer 1.01 -computer mouse 1.01 -a computer mouse 0.849 -computer mouse pad 0.849 - -======================================== -BM25 search results for: 'blue computer mouse' -======================================== ->> 'text_word' search results -mouse computer 0.889 -Computer Mouse 0.889 -computer mouse 0.889 -a computer mouse 0.764 -computer mouse pad 0.764 ->> 'text_lowercase' search results -mouse computer 0.819 -Computer Mouse 0.819 -computer mouse 0.819 -a computer mouse 0.688 -computer mouse pad 0.688 ->> 'text_whitespace' search results -mouse computer 1.01 -computer mouse 1.01 -a computer mouse 0.849 -computer mouse pad 0.849 -# END MouseResults -""" - -# UnderscoreExample -search_demo(collection, property_names, ["variable_name"]) -# END UnderscoreExample - -""" -# UnderscoreResults -======================================== -BM25 search results for: 'variable_name' -======================================== ->> 'text_word' search results -Variable Name 0.716 -Variable_Name 0.716 -variable_name 0.716 -variable_new_name 
0.615 -the_variable_name 0.615 -a_variable_name 0.615 ->> 'text_lowercase' search results -Variable_Name 0.97 -variable_name 0.97 ->> 'text_whitespace' search results -variable_name 1.27 ->> 'text_field' search results -variable_name 0.982 -# END UnderscoreResults -""" - -client.close() diff --git a/docs/academy/py/tokenization/index.md b/docs/academy/py/tokenization/index.md deleted file mode 100644 index 86a75ab16..000000000 --- a/docs/academy/py/tokenization/index.md +++ /dev/null @@ -1,29 +0,0 @@ ---- -title: "275 (Keyword) Tokenization" -description: Dive into tokenization in Weaviate to enhance text search accuracy. -sidebar_position: 275 ---- - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; -import CourseUnits from '/src/components/Academy/courseUnits.jsx'; -import { courseData } from '/src/components/Academy/courseData.js'; - -## Course overview - -:::info Pre-requisites -This course is self-contained. However, we recommend that you go through one of the 101-level courses, such as that for working with [text](../starter_text_data/index.md), [your own vectors](../starter_custom_vectors/index.md), or [multimodal data](../starter_multimodal_data/index.md). -::: - -This course will introduce you to tokenization, and how it relates to Weaviate. Specifically, it will discuss what it is, how it relates to search and how to configure it. - -Note that tokenization is a concept that applies to keyword search and filtering, as well as in the context of language models. - -**This course focuses on the keyword aspect**, but will briefly discuss how tokenization impacts language models. 
- -## Learning objectives - - - -## Units - - diff --git a/docs/academy/py/vector_index/100_overview.mdx b/docs/academy/py/vector_index/100_overview.mdx deleted file mode 100644 index 38cc48f26..000000000 --- a/docs/academy/py/vector_index/100_overview.mdx +++ /dev/null @@ -1,95 +0,0 @@ ---- -title: "Vector index: Overview" ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!./_snippets/100_config.py'; - -A database index is a data structure that organizes data to make searches more efficient. Think of it as a table of contents in a book, or an index in a library, that helps you find the information you need quickly. - -
- Card catalog from page 167 of 'Manual of library classification and shelf arrangement' (1898) -
- -A [vector index](/weaviate/concepts/indexing/vector-index.md) is a specialized type of index that is designed to store and search vectors. - -The choice and configuration of your vector index can significantly impact the performance of your imports as well as searches, and the resource requirements of your Weaviate instance. - -For this reason, the vector index a critical component in Weaviate. - -## Why use a vector index? - -Databases can quickly grow very large, to the point where time to search for a specific item can become unacceptably long, or the resources required to search become too high. - -A vector index is designed to improve users' experience of searching for items in a large database. - -It usually makes a trade-off to balance three key factors: search speed, accuracy, and resource requirements. - -## Vector index types - -Many different types of vector indexes exist. A majority of them are designed to speed up searches by reducing the number of vectors that need to be compared. However, they do this in different ways, and each has its own strengths and weaknesses. - -### Graph indexes - -Graph indexes form a network of vectors, such that similar vectors are connected to each other. This allows for fast "traversal" of the graph to find similar vectors to a query vector. - -import HNSWDiagram from './_snippets/hnsw_diagram.png'; - -
- Outline of HNSW graph, showing nodes connected in multiple layers -
- -HNSW, or "Hierarchical Navigable Small World", is the most common graph index type. It creates a set of "layers" of vectors, to enable fast traversal of the graph. - -They are very scalable, allow incremental updates, and efficient for high-dimensional vectors. - -This is the default index type in Weaviate. - -### Tree-based indexes - -Tree-based indexes divide the vectors into a tree structure. - -
- Complete binary tree (Wikipedia) -
- -ANNOY, or "Approximate Nearest Neighbors Oh Yeah", is a well-known tree-based index. It divides the vectors into a binary tree structure. - -They can be memory-efficient, and are good for low-dimensional vectors. - -Given its nature as a tree, it may be costly to update the index over time. This would depend on whether the tree needs to be rebuilt, or if it can be updated incrementally. - -ANNOY itself does not support incremental updates. - -### Cluster-based indexes - -Cluster-based indexes group vectors based on their similarity. As a result, the search space is reduced to only the cluster(s) that is most likely to contain the nearest neighbors. - -Their search accuracy (recall and precision) may generally be lower than graph-based indexes, but they can be more memory-efficient. - -### Flat index - -A flat index is the simplest type of index. It stores all vectors in a single list, and searches through all of them to find the nearest neighbors. - -This is extremely memory-efficient, but does not scale well at all, as the search time grows linearly with the number of vectors. - -### Available vector indexes in Weaviate - -Weaviate supports multiple types of vector indexes - namely, `hnsw`, `flat`, and `dynamic`. - -We will discuss these in more detail in the following sections. 
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - diff --git a/docs/academy/py/vector_index/200_hnsw.mdx b/docs/academy/py/vector_index/200_hnsw.mdx deleted file mode 100644 index 30e9368cd..000000000 --- a/docs/academy/py/vector_index/200_hnsw.mdx +++ /dev/null @@ -1,195 +0,0 @@ ---- -title: HNSW index in depth ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!./_snippets/100_config.py'; - -HNSW, or "Hierarchical Navigable Small World", is a powerful and flexible vector index type that allows for fast and accurate searches of high-dimensional vectors. - -It also has the advantage of being very scalable, while being tunable to allow for different trade-offs between search speed, accuracy, and resource requirements. - -HNSW is the default index type in Weaviate, and if you are not sure which index type to use, you should start with HNSW. - -## Key ideas - -HNSW is all about creating connections between vectors in a way that allows for both fast global traversal of the graph and accurate search of similar vectors. - -It does this by creating multiple layers of vectors, where each layer includes a subset of the vectors in the layer below. This means that as you move up the layers, finding the correct general area of the graph becomes faster. - -And once the general area is found, the search then becomes more local, by moving down the layers, where more of the vectors are included. - -Ultimately, the search reaches the bottom layer, which includes all the available vectors. - -import HNSWDiagram from './_snippets/hnsw_diagram.png'; - -
- Outline of HNSW graph, showing nodes connected in multiple layers -
- -This method allows a search to quickly find the right general area of the graph before carrying out a comprehensive search of the dense bottom layer. - -Intuitively, an HNSW graph can be thought of as a high-dimensional skip list of sorts, where the higher layers are used to global search, and the lower layers are used for local search. - -### Trade-offs - -HNSW is a very flexible index type that allows for a wide range of trade-offs. - -The key trade-offs are between search speed, accuracy, and resource requirements. These trade-offs can be made by tuning the parameters of the HNSW index, as well as by [quantizing the vectors](../compression/index.md). - -### Resource requirements - -HNSW is an in-memory index, where each node in the graph as well as each edge between nodes are stored in memory. - -This means that the size of the index in memory is directly proportional to the number of vectors in the index, as well as the number of connections between vectors. - -The size of an HNSW index is dominated by the number of vectors; take a look at the table below for an example: - -| Component | Size derivation | Typical size | Size @1M vectors | Size @100M vectors | -| --- | --- | --- | --- | --- | -| Node | 4B (float) x N dimensions | 2-12kB | 2-12GB | 200-1200GB | -| Edge | 10B x 20 connections | 200B | 200MB | 20GB | - -As you can see, the memory requirements of an HNSW index can quickly become a bottleneck. This is where [quantization](../compression/index.md) can be used to reduce the size of the index in memory. - -### Distance metric - -![Vector Distance Calculations](./img/vector_distance.png) - -The distance metric used in the index determines how the distance between vectors is calculated. In an HNSW index, it impacts where each vector is placed in the graph. - -You must choose a metric that suits the vectors in your collection. To find this, consult the documentation of the model that generated your vectors. 
- -Weaviate's default metric is cosine, but you can also use any number of [other available metrics](/weaviate/config-refs/distances.md). - -If you are unsure, the cosine distance is a good, robust, default choice that is used by a majority of models. - -### Specify HNSW as the index type - -HNSW is Weaviate's default vector index type. So, if you do not specify a collection to use a specific index type, it will use HNSW. - -But you can explicitly specify it as follows: - - - -## Tuning HNSW - -An HNSW index can be tuned to achieve different trade-offs between search speed, accuracy, and resource requirements. - -The key aspects to tune are: -- The number of connections between nodes, -- The size of a "dynamic list", and -- Quantization - -### Number of connections - -import maxConnectionsDiagram from './_snippets/maxConnections.png'; - -
- Outline of HNSW graph, highlighting connections -
- -The maximum number of connections between nodes (`maxConnections`) determine how densely the graph is connected. - -A higher number of connections will allow for more accurate searches, but will also slow down searches, and require more memory. - -The default value is `32`. Note that on the bottom layer of the graph each node can have up to (`2 * maxConnections`) connections. - -### Dynamic list size - -The "dynamic list" in HNSW refers to the list of nodes that are considered by the algorithm. Note that dynamic lists are used in two different contexts: - -During search, the dynamic list is used to keep track of the nodes that are being considered, and to ensure that the search is comprehensive. - -During index construction, the dynamic list is used to keep track of candidate nodes that are being considered for connection. The HNSW algorithm will then choose the best `maxConnections` connections from the dynamic list, taking into account not only proximity but also aspects such as overall connectivity of the graph. - -#### Search dynamic list size - -import EfDiagram from './_snippets/ef.png'; - -
- Outline of HNSW graph, with a hypothetical dynamic list -
- -You can set the dynamic list size for search statically or dynamically. - -To set it statically, provide the `ef` parameter when creating the collection. The default value is `-1`, which defers this to a dynamic setting. - -To set it dynamically, provide a combination of `dynamicEfMin`, `dynamicEfMax` and `dynamicEfFactor`. - -The dynamic list size will be set as the query limit multiplied by `dynamicEfFactor`, modified by a minimum of `dynamicEfMin` and a maximum of `dynamicEfMax`. - -In code, this can be expressed as: - -```python -ef = min(max(dynamicEfMin, queryLimit * dynamicEfFactor), dynamicEfMax) -``` - -The default values are `dynamicEfMin=100`, `dynamicEfMax=500`, and `dynamicEfFactor=8`. - -#### Index construction dynamic list size - -import EfConstructionDiagram from './_snippets/efConstruction.png'; - -
- Outline of HNSW graph, with a note for Ef -
- -To set the dynamic list size for index construction, provide the `efConstruction` parameter when creating the collection. - -This will improve your search performance, at a cost of the index construction process. The default value is `128`. - -### Quantization - -Enabling quantization with HNSW reduces the size of the index in memory by using compressed vectors. Note that the full vector is still stored on disk, which is used to rescore the vectors after they are fetched from the index. - -This can be a powerful way to reduce the memory requirements of your Weaviate instance, especially if you have a large number of vectors. - -Learn more about quantization [with this Weaviate Academy course](../compression/index.md). - -## Configure HNSW in Weaviate - -Each of these parameters can be provided when creating a collection in Weaviate. Note that out of the discussed parameters, only the `dynamicEf` related parameters are mutable. - -### Code example - - - -### Further options - -There are more, advanced HNSW parameters that can be set in Weaviate. These are not typically needed for most use cases, but can be useful in specific situations. - -#### Collection-level parameters - -- `cleanup_interval_seconds`: Sets the interval at which a cleanup process is triggered for deleted nodes. -- `flat_search_cutoff`: Sets the number below which a brute-force search is used instead of the HNSW index. -- `vector_cache_max_objects` : Sets the maximum number of vectors that can be cached in memory. - -#### Environment variables - -- `PERSISTENCE_HNSW_MAX_LOG_SIZE`: Maximum size of the HNSW [write-ahead-log](/weaviate/concepts/storage.md#hnsw-vector-index-storage). Increase this to improve log compaction efficiency, or decrease to reduce memory requirements. 
- -## Further resources - -- [Concepts: Vector index](/weaviate/concepts/indexing/vector-index.md) -- [References: Vector index parameters](/weaviate/config-refs/indexing/vector-index.mdx) -- [How-to manage collections](../../../weaviate/manage-collections/index.mdx) -- [Weaviate Academy: Compression](../compression/index.md) - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/vector_index/220_flat.mdx b/docs/academy/py/vector_index/220_flat.mdx deleted file mode 100644 index a71f9236e..000000000 --- a/docs/academy/py/vector_index/220_flat.mdx +++ /dev/null @@ -1,75 +0,0 @@ ---- -title: Flat index in depth ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!./_snippets/100_config.py'; - -The "flat" index is a simple and efficient vector index type that is best suited for small collections of vectors. - -## Key ideas - -The flat index is a very simple vector index that mimics a "map" data type. It simply stores the location of each vector, such that a search can be done by comparing the query vector to each vector in the collection. As you might expect, this leads to very low resource requirements, at the cost of search speed as the number of vectors increases. - -Where this index type shines is in large use cases where there are a high number of small collections, such as one for each end-user in a multi-tenant environment. A basic example may be a notes application, where each end user has their own collection of notes. - -In such an environment, each end user will be treated as a "tenant", in a multi-tenant collection, and each tenant will have their own vector index. This is a perfect use case for the flat index. - -### Trade-offs - -The key compromise with the flat index is that it is not scalable. 
As the number of vectors in the collection increases, the search time will increase linearly, as each vector must be compared to the query vector. - -### Resource requirements - -The flat index has very low memory requirements, as it only needs to store the location to the vector, and not the vector itself. - -This can be very beneficial as the overall database grows, especially if the growth is primarily in the number of indexes (tenants), rather than the number of vectors in each index. - -### Distance metric - -![Vector Distance Calculations](./img/vector_distance.png) - -The distance metric used in the index determines how the distance between vectors is calculated. - -You must choose a metric that suits the vectors in your collection. To find this, consult the documentation of the model that generated your vectors. - -Weaviate's default metric is cosine, but you can also use any number of [other available metrics](/weaviate/config-refs/distances.md). - -If you are unsure, the cosine distance is a good, robust, default choice that is used by a majority of models. - -### Quantization - -Enabling quantization with the flat reduces the search time by using compressed vectors. Note that the full vector is still stored, which is used to rescore the vectors after they are fetched during initial search. - -This can improve the search speed to mitigate the linear increase in search time as the number of vectors increases. However, the scalability of the flat index is still limited. - -Learn more about quantization [with this Weaviate Academy course](../compression/index.md). - -## Configure flat index in Weaviate - -Each of these parameters can be provided when creating a collection in Weaviate. Note that the `vector_cache_max_objects` is only used if quantization is enabled with vector caching enabled within it. 
- -### Code example - - - -## Further resources - -- [Concepts: Vector index](/weaviate/concepts/indexing/vector-index.md) -- [References: Vector index parameters](/weaviate/config-refs/indexing/vector-index.mdx) -- [How-to manage collections](../../../weaviate/manage-collections/index.mdx) -- [Weaviate Academy: Compression](../compression/index.md) - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - diff --git a/docs/academy/py/vector_index/250_dynamic.mdx b/docs/academy/py/vector_index/250_dynamic.mdx deleted file mode 100644 index 15849e020..000000000 --- a/docs/academy/py/vector_index/250_dynamic.mdx +++ /dev/null @@ -1,80 +0,0 @@ ---- -title: Dynamic index in depth ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PyCode from '!!raw-loader!./_snippets/100_config.py'; - -The "dynamic" index is a "best of both worlds" approach that combines the benefits of the `hnsw` and `flat` indexes. - -import DynamicAsyncRequirements from '/_includes/dynamic-index-async-req.mdx'; - - - -## Key ideas - -Simply put, the `dynamic` index is a `flat` index that is automatically converted to an `hnsw` index when the number of vectors in the collection exceeds a predetermined threshold (10,000 by default). - -The motivation for this is that the `flat` index is very efficient for small collections, but its search time increases linearly with the number of vectors in the collection. The `hnsw` index, on the other hand, is more efficient for large collections, but includes a memory overhead with little benefit for small collections. - -The `dynamic` index is a good choice if you do not know how big the size of each collection will be, or if you expect some tenants to grow much larger than others. 
- -In a multi-tenancy configuration, this will mean that all tenants will start with the `flat` index, but will automatically switch to the `hnsw` index when the number of vectors in the collection exceeds the threshold. - -Currently, this is a one-way conversion, meaning that once the index is converted to `hnsw`, it will not be converted back to `flat` if it subsequently falls below the threshold. - -### Distance metric - -![Vector Distance Calculations](./img/vector_distance.png) - -The distance metric used in the index determines how the distance between vectors is calculated. In an HNSW index, it impacts where each vector is placed in the graph. - -You must choose a metric that suits the vectors in your collection. To find this, consult the documentation of the model that generated your vectors. - -Weaviate's default metric is cosine, but you can also use any number of [other available metrics](/weaviate/config-refs/distances.md). - -If you are unsure, the cosine distance is a good, robust, default choice that is used by a majority of models. - -## Configure dynamic index in Weaviate - -Each of these parameters can be provided when creating a collection in Weaviate. Note that the `vector_cache_max_objects` is only used if quantization is enabled with vector caching enabled within it. - -### Basic configuration - -Set a collection to use the dynamic index as shown below. - - - -### Custom configuration - -You can set the threshold at which the `flat` index will be converted to `hnsw`. - -Additionally, you can specify any of the `flat` and `hnsw` index parameters that will be used depending on the state of the index. 
- - - -## Further resources - -- [Concepts: Vector index](/weaviate/concepts/indexing/vector-index.md) -- [References: Vector index parameters](/weaviate/config-refs/indexing/vector-index.mdx) -- [How-to manage collections](../../../weaviate/manage-collections/index.mdx) -- [Weaviate Academy: Compression](../compression/index.md) - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - diff --git a/docs/academy/py/vector_index/900_next_steps.mdx b/docs/academy/py/vector_index/900_next_steps.mdx deleted file mode 100644 index 304bec442..000000000 --- a/docs/academy/py/vector_index/900_next_steps.mdx +++ /dev/null @@ -1,37 +0,0 @@ ---- -title: Wrap-up / Next steps ---- - -Congratulations! You have completed this course on vector indexes. We hope you found it helpful and informative. - -## Index selection - -When choosing an index type, use the following as a guide: - -- **Flat index**: Use for small collections with a known size. -- **HNSW index**: Use for large collections with a known size. -- **Dynamic index**: Use for collections with an unknown size or collections that may grow over time. - -Also, in a multi-tenant environment, the "dynamic" index may be a good default choice, as it will allow some tenants to remain in the `flat` index while others are automatically converted to `hnsw` when they grow. - -## Further resources - -These resources will help you continue your learning journey: - -- [Concepts: Vector index](/weaviate/concepts/indexing/vector-index.md) -- [References: Vector index parameters](/weaviate/config-refs/indexing/vector-index.mdx) -- [How-to manage collections](../../../weaviate/manage-collections/index.mdx) -- [Weaviate Academy: Compression](../compression/index.md) -- [Weaviate Academy: Named vectors](../named_vectors/index.md) - -import CTASocials from '../_snippets/cta_socials.mdx'; - - - -See you soon! 
👋 - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/vector_index/_snippets/100_config.py b/docs/academy/py/vector_index/_snippets/100_config.py deleted file mode 100644 index c267700f4..000000000 --- a/docs/academy/py/vector_index/_snippets/100_config.py +++ /dev/null @@ -1,136 +0,0 @@ -import weaviate - -client = weaviate.connect_to_local() - -collection_name = "CollectionWithHNSW" - -client.collections.delete(name=collection_name) - -# START ConfigHNSW -from weaviate.classes.config import Configure - -client.collections.create( - name=collection_name, - # ... other parameters - # highlight-start - vector_config=Configure.Vectors.text2vec_openai( - vector_index_config=Configure.VectorIndex.hnsw() - ), - # highlight-end -) -# END ConfigHNSW - -client.collections.delete(name=collection_name) - -# START CustomConfigHNSW -from weaviate.classes.config import Configure, VectorDistances - -client.collections.create( - name=collection_name, - # ... other parameters - # highlight-start - vector_config=Configure.Vectors.text2vec_openai( - vector_index_config=Configure.VectorIndex.hnsw( - # Distance metric - distance_metric=VectorDistances.COSINE, - # Parameters for HNSW index construction - ef_construction=256, # Dynamic list size during construction - max_connections=128, # Maximum number of connections per node - quantizer=Configure.VectorIndex.Quantizer.bq(), # Quantizer configuration - # Parameters for HNSW search - ef=-1, # Dynamic list size during search; -1 enables dynamic Ef - dynamic_ef_factor=15, # Multiplier for dynamic Ef - dynamic_ef_min=200, # Minimum threshold for dynamic Ef - dynamic_ef_max=1000, # Maximum threshold for dynamic Ef - ) - ), - # highlight-end -) -# END CustomConfigHNSW - -client.collections.delete(name=collection_name) - -# START ConfigFlat -from weaviate.classes.config import Configure - -client.collections.create( - name=collection_name, - # ... 
other parameters - # highlight-start - vector_config=Configure.Vectors.text2vec_openai( - vector_index_config=Configure.VectorIndex.flat() - ), - # highlight-end -) -# END ConfigFlat - -client.collections.delete(name=collection_name) - -# START CustomConfigFlat -from weaviate.classes.config import Configure, VectorDistances - -client.collections.create( - name=collection_name, - # ... other parameters - # highlight-start - vector_config=Configure.Vectors.text2vec_openai( - vector_index_config=Configure.VectorIndex.flat( - distance_metric=VectorDistances.COSINE, # Distance metric - quantizer=Configure.VectorIndex.Quantizer.bq(cache=True), # Quantizer configuration - vector_cache_max_objects=1000000, # Maximum number of objects in the cache - ) - ), - # highlight-end -) -# END CustomConfigFlat - -client.collections.delete(name=collection_name) - -# START ConfigDynamic -from weaviate.classes.config import Configure - -client.collections.create( - name=collection_name, - # ... other parameters - # highlight-start - vector_config=Configure.Vectors.text2vec_openai( - vector_index_config=Configure.VectorIndex.dynamic() - ), - multi_tenancy_config=Configure.multi_tenancy(enabled=True), # Dyanmic index works well with multi-tenancy set-ups - # highlight-end -) -# END ConfigDynamic - -client.collections.delete(name=collection_name) - -# START CustomConfigDynamic -from weaviate.classes.config import Configure, VectorDistances - -client.collections.create( - name=collection_name, - # ... 
other parameters - # highlight-start - vector_config=Configure.Vectors.text2vec_openai( - vector_index_config=Configure.VectorIndex.dynamic( - distance_metric=VectorDistances.COSINE, # Distance metric - threshold=25000, # Threshold for switching to dynamic index - hnsw=Configure.VectorIndex.hnsw( - # Your preferred HNSW configuration - ), - flat=Configure.VectorIndex.flat( - # Your preferred flat configuration - ), - ) - ), - multi_tenancy_config=Configure.multi_tenancy( # Dyanmic index works well with multi-tenancy set-ups - enabled=True, - auto_tenant_creation=True, - auto_tenant_activation=True, - ), - # highlight-end -) -# END CustomConfigDynamic - -client.collections.delete(name=collection_name) - -client.close() diff --git a/docs/academy/py/vector_index/_snippets/ef.png b/docs/academy/py/vector_index/_snippets/ef.png deleted file mode 100644 index a5c14cd21..000000000 Binary files a/docs/academy/py/vector_index/_snippets/ef.png and /dev/null differ diff --git a/docs/academy/py/vector_index/_snippets/efConstruction.png b/docs/academy/py/vector_index/_snippets/efConstruction.png deleted file mode 100644 index f132d1843..000000000 Binary files a/docs/academy/py/vector_index/_snippets/efConstruction.png and /dev/null differ diff --git a/docs/academy/py/vector_index/_snippets/hnsw_diagram.png b/docs/academy/py/vector_index/_snippets/hnsw_diagram.png deleted file mode 100644 index 51ffb0b77..000000000 Binary files a/docs/academy/py/vector_index/_snippets/hnsw_diagram.png and /dev/null differ diff --git a/docs/academy/py/vector_index/_snippets/maxConnections.png b/docs/academy/py/vector_index/_snippets/maxConnections.png deleted file mode 100644 index 92171af7c..000000000 Binary files a/docs/academy/py/vector_index/_snippets/maxConnections.png and /dev/null differ diff --git a/docs/academy/py/vector_index/img/vector_distance.png b/docs/academy/py/vector_index/img/vector_distance.png deleted file mode 100644 index e64fb2859..000000000 Binary files 
a/docs/academy/py/vector_index/img/vector_distance.png and /dev/null differ diff --git a/docs/academy/py/vector_index/index.md b/docs/academy/py/vector_index/index.md deleted file mode 100644 index 48f7f4e64..000000000 --- a/docs/academy/py/vector_index/index.md +++ /dev/null @@ -1,29 +0,0 @@ ---- -title: "230 Vector indexing" -description: Implement vector indexing in Weaviate to speed up your search queries. -sidebar_position: 230 ---- - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; -import CourseUnits from '/src/components/Academy/courseUnits.jsx'; -import { courseData } from '/src/components/Academy/courseData.js' - -## Course overview - -:::info Pre-requisites -This course is self-contained. However, we recommend that you go through one of the 101-level courses, such as that for working with [text](../starter_text_data/index.md), [your own vectors](../starter_custom_vectors/index.md), or [multimodal data](../starter_multimodal_data/index.md). -::: - -The vector index is a key component of Weaviate's search capabilities. It allows you to search for vectors based on their similarity to a query vector, and to retrieve the objects that are associated with those vectors. - -Weaviate offers multiple types of vector indexes, each with its own strengths and weaknesses. Each index is also configurable, allowing you to tune its performance to your specific use case. - -This course will introduce you to the different types of vector indexes available in Weaviate, and how to configure them to best suit your use case. 
- -## Learning objectives - - - -## Units - - diff --git a/docs/academy/py/zero_to_mvp/101_hello_weaviate/10_intro_weaviate.mdx b/docs/academy/py/zero_to_mvp/101_hello_weaviate/10_intro_weaviate.mdx deleted file mode 100644 index 7a919dc86..000000000 --- a/docs/academy/py/zero_to_mvp/101_hello_weaviate/10_intro_weaviate.mdx +++ /dev/null @@ -1,290 +0,0 @@ ---- -title: Introduction to Weaviate -description: Introduction to Weaviate ---- - -## What is Weaviate? - -import ReactPlayer from 'react-player/lazy' - - -
- -Weaviate is an open-source [vector database](https://weaviate.io/blog/what-is-a-vector-database). But what does that mean? Let's unpack it here. - -### Vector database - -Weaviate is a fantastic tool for retrieving the information you need, quickly and accurately. It does this by being an amazing **vector database**. - -You may be familiar with traditional databases such as relational databases that use SQL. A database can catalog, store and retrieve information. A **vector** database can carry out these tasks also, with the key difference being that they can perform these tasks based on similarity. - -#### How traditional searches work - -Imagine that you are searching a relational database containing articles on cities, to retrieve a list of "major" European cities. Using SQL, you might construct a query like this: - -```sql -SELECT city_name wiki_summary -FROM wiki_city -WHERE (wiki_summary LIKE '%major European city%' OR - wiki_summary LIKE '%important European city%' OR - wiki_summary LIKE '%prominent European city%' OR - wiki_summary LIKE '%leading European city%' OR - wiki_summary LIKE '%significant European city%' OR - wiki_summary LIKE '%top European city%' OR - wiki_summary LIKE '%influential European city%' OR - wiki_summary LIKE '%notable European city%') - (… and so on) -``` - -Which would return cities that contained any of these strings (`major`, `important`, `prominent`, ... etc) in the `wiki_summary` column. - -This works well in many circumstances. However, there are two significant limitations with this approach. - -#### Limitations of traditional search - -Using this type of search requires you to identify terms that *may* have been used to describe the concept, which is no easy feat. - -What's more, this doesn't solve the problem of how to rank the list of resulting objects. - -With the above search query, an entry merely containing a mention of a different European city (i.e. 
not very relevant) would be given equal weighting to an entry for Paris, or Rome, which would be highly relevant. - -A vector database makes this job simpler by enabling searches based on similarity. - -#### Examples of vector search - -So, you could perform a query like this in Weaviate: - -```graphql -{ - Get { - WikiCity ( - nearText: { concepts: ["Major European city"] } - ) { city_name wiki_summary } - } -} -``` - -And it would return a list of entries that are *ranked by their similarity* to the query - the idea of "Major European city". - -What's more, Weaviate "indexes" the data based on their similarity, making this type of data retrieval lightning-fast. - -Weaviate can help you to do all this, and actually a lot more. Another way to think about Weaviate is that it supercharges the way you use information. - -:::info Vector vs semantic search -A vector search is also referred to as a "semantic search" because it returns results based on the similarity of meaning (therefore "semantic"). -::: - -### Open-source - -Weaviate is open-source. In other words, its [codebase is available online](https://github.com/weaviate/weaviate) for anyone to see and use[(1)](#-notes). - -And that is *the* codebase, regardless of how you use it. So whether you run Weaviate on your own computer, on a cloud computing environment, or through our managed service [Weaviate Cloud (WCD)](https://console.weaviate.cloud/), you are using the exact same technology. - -So, if you want, you can run Weaviate for free on your own device, or use our managed service for convenience. You can also take comfort in that you can see exactly what you are running, and be a part of the open-source community, as well as to shape its development. - -It also means that your knowledge about Weaviate is fungible, between local, cloud, or managed instances of Weaviate. So anything you learn here about Weaviate using WCD will be equally applicable to running it locally, and vice versa. 
😉 - -### Information, made dynamic - -We are used to thinking of information as static, like a book. But with Weaviate and modern AI-driven language models, we can do much more than just retrieve static information but easily build on top of it. Take a look at these examples: - -#### Question answering - -Given a list of Wikipedia entries, you could ask Weaviate: - -:::note We asked Weaviate: -When was Lewis Hamilton born? -::: - -And it would answer with: - -:::note Weaviate responded: -Lewis Hamilton was born on January 7, 1985. ([check for yourself](https://en.wikipedia.org/wiki/Lewis_Hamilton)) -::: - -
- See the full query & response - -#### Query - -```graphql -{ - Get { - WikiArticle ( - ask: { - question: "When was Lewis Hamilton born?", - properties: ["wiki_summary"] - }, - limit: 1 - ) { - title - _additional { - answer { - result - } - } - } - } -} -``` - -#### Response - -```json -{ - "data": { - "Get": { - "WikiArticle": [ - { - "_additional": { - "answer": { - "result": " Lewis Hamilton was born on January 7, 1985." - } - }, - "title": "Lewis Hamilton" - } - ] - } - } -} -``` - -
- -#### Generative search - -Or you can synthesize passages using retrieved information with Weaviate: - -Here is one, where we searched Weaviate for an entry on a "racing driver", and produce the result in the format of: - -:::note We asked Weaviate: -Write a fun tweet encouraging people to read about this: ## \{title} by summarizing highlights from: ## \{wiki_summary} -::: - -Which produces: - -:::note Weaviate responded: -Check out the amazing story of Lewis Hamilton, the 7-time Formula One World Drivers' Championship winner! From his humble beginnings to becoming one of the world's most influential people, his journey is an inspiring one. #LewisHamilton #FormulaOne #Motorsport #Racing -::: - -
- See the full query & response - -#### Query - -```graphql -{ - Get { - WikiArticle( - nearText: { - concepts: ["Racing Driver"] - } - limit: 1 - ) { - title - wiki_summary - _additional { - generate( - singleResult: { - prompt: """ - Write a fun tweet encouraging people to read about this: ## {title} - by summarizing highlights from: ## {wiki_summary} - """ - } - ) { - singleResult - error - } - } - } - } -} -``` - -#### Response - -```json -{ - "data": { - "Get": { - "WikiArticle": [ - { - "_additional": { - "generate": { - "error": null, - "singleResult": "Check out the amazing story of Lewis Hamilton, the 7-time Formula One World Drivers' Championship winner! From his humble beginnings to becoming a global icon, his journey is an inspiring one. #LewisHamilton #FormulaOne #Motorsport #Racing #Inspiration" - } - }, - "title": "Lewis Hamilton", - "wiki_summary": "Sir Lewis Carl Davidson Hamilton (born 7 January 1985) is a British racing driver currently competing in Formula One, driving for Mercedes-AMG Petronas Formula One Team. In Formula One, Hamilton has won a joint-record seven World Drivers' Championship titles (tied with Michael Schumacher), and holds the records for the most wins (103), pole positions (103), and podium finishes (191), among others.\nBorn and raised in Stevenage, Hertfordshire, Hamilton joined the McLaren young driver programme in 1998 at the age of 13, becoming the youngest racing driver ever to be contracted by a Formula One team. This led to a Formula One drive with McLaren for six years from 2007 to 2012, making Hamilton the first black driver to race in the series. In his inaugural season, Hamilton set numerous records as he finished runner-up to Kimi R\u00e4ikk\u00f6nen by one point. The following season, he won his maiden title in dramatic fashion\u2014making a crucial overtake at the last corner on the last lap of the last race of the season\u2014to become the then-youngest Formula One World Champion in history. 
After six years with McLaren, Hamilton signed with Mercedes in 2013.\nChanges to the regulations for 2014 mandating the use of turbo-hybrid engines saw the start of a highly successful period for Hamilton, during which he won six further drivers' titles. Consecutive titles came in 2014 and 2015 during an intense rivalry with teammate Nico Rosberg. Following Rosberg's retirement in 2016, Ferrari's Sebastian Vettel became Hamilton's closest rival in two championship battles, in which Hamilton twice overturned mid-season point deficits to claim consecutive titles again in 2017 and 2018. His third and fourth consecutive titles followed in 2019 and 2020 to equal Schumacher's record of seven drivers' titles. Hamilton achieved his 100th pole position and race win during the 2021 season. \nHamilton has been credited with furthering Formula One's global following by appealing to a broader audience outside the sport, in part due to his high-profile lifestyle, environmental and social activism, and exploits in music and fashion. He has also become a prominent advocate in support of activism to combat racism and push for increased diversity in motorsport. Hamilton was the highest-paid Formula One driver from 2013 to 2021, and was ranked as one of the world's highest-paid athletes by Forbes of twenty-tens decade and 2021. He was also listed in the 2020 issue of Time as one of the 100 most influential people globally, and was knighted in the 2021 New Year Honours. Hamilton was granted honorary Brazilian citizenship in 2022.\n\n" - } - ] - } - } -} -``` - -
- - -We will cover these and many more capabilities, such as vectorization, summarization and classification, in our units. - -For now, keep in mind that Weaviate is a vector database at its core which can also leverage AI tools to do more with the retrieved information. - -## Review - -In this section, you learned about what Weaviate is and how it works at a very high level. You have also been introduced to what vector search is at a high level, that it is a similarity-based search method. - -### Review exercises - - - - - - - -### Key takeaways - -- Weaviate is an open source vector database. -- The core Weaviate library is the same whether you run it locally, on the cloud, or with WCD. -- Vector searches are similarity-based searches. -- Weaviate can also transform your data after retrieving it before returning it to you. - -## Notes - -(1) Subject to terms of its license, of course. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - -import Quiz from '/src/components/Academy/quiz.js' -export const weaviateOpenSource = [ - { - questionText: 'What is the difference in the Weaviate codebase between local and cloud deployments?', - answerOptions: [ - { answerText: 'Cloud deployments always include additional modules.', isCorrect: false, feedback: 'Cloud deployments of Weaviate do not include any special, or additional, modules.'}, - { answerText: 'Local deployments are optimized for GPU use.', isCorrect: false, feedback: 'GPU usage can be enabled for inference whether locally or remotely deployed.'}, - { answerText: 'Cloud deployments are optimized for scalability.', isCorrect: false, feedback: 'We agree that cloud deployments should be optimized for scalability. 
But the Weaviate codebase is built for scalability regardless of deployment location.'}, - { answerText: 'None, they are the same.', isCorrect: true, feedback: 'They are the same, open-source codebase available on GitHub.'}, - ], - }, -]; -export const vectorSearchDefinition = [ - { - questionText: 'What is the best description of vector search?', - answerOptions: [ - { answerText: 'Vector search is a directional search.', isCorrect: false, feedback: 'The definition of "vector" in this context is not direction-related.'}, - { answerText: 'Vector search is a similarity-based search.', isCorrect: true, feedback: 'It searches a data collection or database for proximity in its representation of "meaning".'}, - { answerText: 'Vector search is a number-based search.', isCorrect: false, feedback: 'This is partially true, but not the best answer. While there are numbers involved, that description does not quite capture the key concept of vector searches.'}, - ], - }, -]; diff --git a/docs/academy/py/zero_to_mvp/101_hello_weaviate/15_overview_vectors.mdx b/docs/academy/py/zero_to_mvp/101_hello_weaviate/15_overview_vectors.mdx deleted file mode 100644 index 1ceef3297..000000000 --- a/docs/academy/py/zero_to_mvp/101_hello_weaviate/15_overview_vectors.mdx +++ /dev/null @@ -1,110 +0,0 @@ ---- -title: Vectors - An overview -description: Vector Overview in Weaviate ---- - -## What is a vector? - -import ReactPlayer from 'react-player/lazy' - - -
- -We've covered that Weaviate is a vector database, and that a vector search is similarity-based. But, what is a vector? - -A vector in this context is just a series of numbers like `[1, 0]` or `[0.513, 0.155, 0.983, ..., 0.001, 0.932]`. Vectors like these are used to capture meaning. - -This might seem like an odd concept. But in fact, you may have already used vectors to capture meaning without realizing it. If you have tried photo editing, or used MS Paint you might have encountered the RGB color system. - -### How do numbers represent meaning? - -The RGB system uses groups of three numbers to represent colors. For example: - -- (255, 0, 0) = red -- (80, 200, 120) = emerald - -In these examples, each number can be thought of as a dial for how red, green or blue a color is. - -Now, imagine having hundreds, or even thousands, of these dials. That’s how vectors are used to represent meaning. Modern machine learning models such as GPT-x, or those used with Weaviate, use vectors to represent some "essence", or "meaning" of objects. This can be done for any object type, such as text, code, images, videos and more. - -## Vector embeddings in Weaviate - - - -The vector representation of an object's meaning is called a "vector embedding". - -Weaviate enables vector searches by indexing and storing data objects and their corresponding vector embeddings. The vector embeddings come from machine learning models. - -In plain terms, Weaviate processes and organizes your data in such a way that objects can be retrieved based on their similarity to a query. To perform these tasks at speed, Weaviate does two things that traditional databases do not. Weaviate: - -- Quantifies similarity -- Indexes vector data - -These operations enable Weaviate to do what it does. - -### Quantifying similarity - -As we've mentioned, vector searches are similarity-based, but what does that actually mean? How do we determine that two pieces of data are "similar"? 
What does it mean for two pieces of text, two images, or two objects in general, to be similar? - -This is a relatively simple idea that is actually incredibly interesting and intricate once we start to dive into the details. - -But for now, you should know that machine learning (ML) models are the key to this whole process. The ML models that power vector searches share similarities with those that generate text responses from prompts. Instead of generating new text, these (vectorizer) models capture the "meaning" of text or other media. We will cover this in more detail later on. - -### Indexing (vector) data - -Vector searches can be very intensive computationally. - -To overcome this problem, Weaviate uses a combination of indexes including an approximate nearest neighbor (ANN) index and an inverted index. The ANN index lets Weaviate perform extremely fast vector searches. The inverted index lets Weaviate filter data using Boolean criteria. - -We will get into this in more detail later - but for now, it's enough to know that Weaviate can perform fast vector searches as well as filtering. - -## Review - -In this section, you learned about what vectors are and how Weaviate utilizes them at a very high level. You have also been introduced to two of Weaviate's key capabilities that help it to enable vector search at speed. - -### Review exercise - -:::tip Self-reflection -Can you describe, in your own words, what vectors are? -::: - - - -### Key takeaways - -- A vector is a series of numbers that capture the meaning or essence of objects. -- Machine learning models help quantify similarity between different objects, which is essential for vector searches. -- Weaviate uses a combination of approximate nearest neighbor (ANN) index and an inverted index to perform fast vector searches with filtering. 
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - -import Quiz from '/src/components/Academy/quiz.js' -export const howWeaviateWorks = [{ - questionText: 'Which of these statements are true?', - answerOptions: [ - { - answerText: 'Weaviate has no way of quantifying similarity between objects.', - isCorrect: false, - feedback: 'Weaviate performs vector searches, which is similarity-based.', - }, - { - answerText: 'The only type of index in Weaviate is the vector index.', - isCorrect: false, - feedback: 'In addition to the vector index, Weaviate uses an inverted index.', - }, - { - answerText: 'Weaviate is a machine learning model.', - isCorrect: false, - feedback: 'While Weaviate can be used with a variety of different models which help it determine object similarity, it is itself not a machine learning model. Weaviate is a vector database.', - }, - { - answerText: 'None of the above', - isCorrect: true, - feedback: 'All of these are false!', - }, - ] -}]; diff --git a/docs/academy/py/zero_to_mvp/101_hello_weaviate/20_examples_1.mdx b/docs/academy/py/zero_to_mvp/101_hello_weaviate/20_examples_1.mdx deleted file mode 100644 index 8bdb7db6a..000000000 --- a/docs/academy/py/zero_to_mvp/101_hello_weaviate/20_examples_1.mdx +++ /dev/null @@ -1,226 +0,0 @@ ---- -title: Examples 1 - Queries -description: Hello Weaviate - Examples Part 1 ---- - -## Vectors in action - -import ReactPlayer from 'react-player/lazy' - - -
- -Let's take a look at a few more examples of what you can do with Weaviate. - -First, we will try vector searches by searching through our demo database. You will learn how to use Weaviate to retrieve objects based on their similarity, using various query types such as an input text, vector, or object. - -You will also compare vector search with keyword search to compare and contrast the two techniques, before learning how to combine the two techniques through the use of filters. - -### Vector search demo - -For our first example, let's search our demo dataset. It contains a small sample of questions from the quiz show *Jeopardy!*. - -Imagine that you're running a quiz night, and you want to get some questions about "animals in movies". In a traditional database you could look for word matches, perhaps something like: - -```sql -SELECT question, answer -FROM jeopardy_questions -WHERE ( - question LIKE '%animal%' - OR question LIKE '%creature%' - OR question LIKE '%beast%' - ) -AND ( - question LIKE '%movie%' - OR question LIKE '%film%' - OR question LIKE '%picture%' - OR question LIKE '%cinema%' - ) - -``` - -This is a difficult query to write. Even worse, you would probably have to add the names of specific animals to the query as well. - -The Weaviate query is much more intuitive. See what happens when we run the following query: - -:::note We searched Weaviate for: -animals in movies -::: - -
- See the full query - -```graphql -{ - Get { - JeopardyQuestion ( - nearText: { - concepts: ["animals in movies"] - } - limit: 3 - ) { - question - answer - } - } -} -``` - -
- -Weaviate retrieved these as the top answers: - -:::note Weaviate retrieved: -- **meerkats**: Group of mammals seen here like Timon in *The Lion King* -- **dogs**: Scooby-Doo, Goofy & Pluto are cartoon versions -- **The Call of the Wild Thornberrys**: Jack London story about the dog Buck who joins a Nick cartoon about Eliza, who can talk to animals -::: - -
- See the JSON response - -```json -{ - "data": { - "Get": { - "JeopardyQuestion": [ - { - "answer": "meerkats", - "question": "Group of mammals seen here: [like Timon in The Lion King]" - }, - { - "answer": "dogs", - "question": "Scooby-Doo, Goofy & Pluto are cartoon versions" - }, - { - "answer": "The Call of the Wild Thornberrys", - "question": "Jack London story about the dog Buck who joins a Nick cartoon about Eliza, who can talk to animals" - } - ] - } - } -} -``` - -
- -Note how relevant the results are, despite none of them including the word "animal" or the word "movie", let alone both! - -This is exactly why vector searches are so useful. They can identify related objects without the need to match exact text. - -### Vector similarities demo - -If we run *this* query, you might expect to see responses like the ones we saw earlier. - -```graphql -{ - Get { - JeopardyQuestion ( - nearText: { - concepts: ["European geography"] - } - limit: 3 - ) { - question - answer - _additional { - distance - } - } - } -} -``` - -But, take a look at this response. Do you notice any additional information? - -```json -{ - "data": { - "Get": { - "JeopardyQuestion": [ - { - "_additional": { - "distance": 0.15916324 - }, - "answer": "Bulgaria", - "question": "A European republic: Sofia" - }, - ... - ] - } - } -} - -``` - -
- See the full JSON response from Weaviate - -```json -{ - "data": { - "Get": { - "JeopardyQuestion": [ - { - "_additional": { - "distance": 0.15916324 - }, - "answer": "Bulgaria", - "question": "A European republic: Sofia" - }, - { - "_additional": { - "distance": 0.16247147 - }, - "answer": "Balkan Peninsula", - "question": "The European part of Turkey lies entirely on this peninsula" - }, - { - "_additional": { - "distance": 0.16832423 - }, - "answer": "Mediterranean Sea", - "question": "It's the only body of water with shores on the continents of Asia, Africa & Europe" - } - ] - } - } -} - -``` - -
- -The difference is that the response contains a `distance` value. - -A `distance` is indicative of the degree of similarity between the returned object and the query. - -If you're wondering exactly what that means, and who decides how similar any two objects or concepts are, those are great questions! We will cover those in more detail later. - -For now, just keep in mind that smaller distances mean two objects are more similar to each other. - -## Review - - - - - -### Key takeaways - -- Vector searches can identify related objects without the need for exact text matches. -- In vector searches, distance values indicate the degree of similarity between the returned object and the query. -- Smaller distances indicate greater similarity. -- Vector searches can be combined with keyword searches and filtering techniques for more refined search results. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/zero_to_mvp/101_hello_weaviate/25_examples_2.mdx b/docs/academy/py/zero_to_mvp/101_hello_weaviate/25_examples_2.mdx deleted file mode 100644 index 452ef18f0..000000000 --- a/docs/academy/py/zero_to_mvp/101_hello_weaviate/25_examples_2.mdx +++ /dev/null @@ -1,146 +0,0 @@ ---- -title: Examples 2 - More than search -description: Hello Weaviate - Examples Part 2 -sidebar_position: 25 ---- - -## Beyond vector searches - -import ReactPlayer from 'react-player/lazy' - - -
- -You can do a lot more with Weaviate than simply retrieve static information. - -Let's take a look at a couple of examples where we do more than simply retrieve objects from the database. - -We will extract information from this Wikipedia entry. - -
- "The Sydney Opera House" Wikipedia summary - -The Sydney Opera House is a multi-venue performing arts centre in Sydney. Located on the foreshore of Sydney Harbour, it is widely regarded as one of the world's most famous and distinctive buildings and a masterpiece of 20th-century architecture. Designed by Danish architect Jørn Utzon, but completed by an Australian architectural team headed by Peter Hall, the building was formally opened by Queen Elizabeth II on 20 October 1973 after a gestation beginning with Utzon's 1957 selection as winner of an international design competition. The Government of New South Wales, led by the premier, Joseph Cahill, authorised work to begin in 1958 with Utzon directing construction. The government's decision to build Utzon's design is often overshadowed by circumstances that followed, including cost and scheduling overruns as well as the architect's ultimate resignation. The building and its surrounds occupy the whole of Bennelong Point on Sydney Harbour, between Sydney Cove and Farm Cove, adjacent to the Sydney central business district and the Royal Botanic Gardens, and near to the Sydney Harbour Bridge. - -The building comprises multiple performance venues, which together host well over 1,500 performances annually, attended by more than 1.2 million people. Performances are presented by numerous performing artists, including three resident companies: Opera Australia, the Sydney Theatre Company and the Sydney Symphony Orchestra. As one of the most popular visitor attractions in Australia, the site is visited by more than eight million people annually, and approximately 350,000 visitors take a guided tour of the building each year. The building is managed by the Sydney Opera House Trust, an agency of the New South Wales State Government. 
- -On 28 June 2007, the Sydney Opera House became a UNESCO World Heritage Site, having been listed on the (now defunct) Register of the National Estate since 1980, the National Trust of Australia register since 1983, the City of Sydney Heritage Inventory since 2000, the New South Wales State Heritage Register since 2003, and the Australian National Heritage List since 2005. The Opera House was also a finalist in the New7Wonders of the World campaign list. - -
- -Weaviate creates [data objects](/weaviate/concepts/data) when it processes the Wikipedia entry. The data objects are stored in classes. A class is roughly analogous to a table in a relational database. An object is similar to an entry in that table. - -### Question-answering demo - -Weaviate can extract knowledge from the text. - -```graphql -{ - Get { - WikiArticle ( - ask: { - question: "When did construction for the Sydney Opera House start?", - properties: ["wiki_summary"] - }, - limit: 1 - ) { - title - _additional { - answer { - hasAnswer - property - result - startPosition - endPosition - } - } - } - } -} -``` - -Given this query, Weaviate doesn't just identify the most relevant data object. Weaviate also answers the question based on the textual information in the Wikipedia article. - -
- See response - -:::note Weaviate says: -Construction for the Sydney Opera House started in 1958. -::: - -
- -### Generative search - -Weaviate can do even more with these entries. You can ask Weaviate to grab an object from its data store and use that object to generate new text. For example, Weaviate can use the object that contains the entry for the Sydney Opera House to derive new text. - -```graphql -{ - Get { - WikiArticle( - nearText: { - concepts: ["Sydney Opera House"] - } - limit: 1 - ) { - title - wiki_summary - _additional { - generate( - singleResult: { - prompt: """ - Write a fun tweet encouraging people to read about this: ## {title} - by summarizing highlights from: ## {wiki_summary} - """ - } - ) { - singleResult - error - } - } - } - } -} -``` - -The sample code generates a Tweet based on the Wikipedia entry! - -
- See response - -:::note Weaviate says: -Explore the world-famous Sydney Opera House and its incredible architecture! From the iconic design to the amazing performances, there's something for everyone to enjoy. #SydneyOperaHouse #Explore #Architecture #Performances #Experience -::: - -
- -This process is an example of `generative search`. In a generative search, Weaviate retrieves information, and then leverages a large language model (LLM) to re-shape it. This is a powerful feature that can transform how you deal with information. - -You can vary the prompt to generate different results. - -### What next? - -Tools like Q&A and generative search really start to bring your information to life. In the next sections, you will set up Weaviate and run your own queries. - -## Review - - - -### Key takeaways - -- Weaviate can extract knowledge from text using question-answering capabilities, identifying the most relevant object and the actual answer based on the provided text. -- Generative search allows you to retrieve information and reshape or repurpose the content, such as generating a tweet based on a Wikipedia entry. -- These advanced capabilities of Weaviate transform how you interact with and utilize information in your data. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/zero_to_mvp/101_hello_weaviate/40_set_up.mdx b/docs/academy/py/zero_to_mvp/101_hello_weaviate/40_set_up.mdx deleted file mode 100644 index 7b30e7c02..000000000 --- a/docs/academy/py/zero_to_mvp/101_hello_weaviate/40_set_up.mdx +++ /dev/null @@ -1,212 +0,0 @@ ---- -title: Database & client setup -description: Learn to set up Weaviate for quick and efficient data search. -sidebar_position: 40 ---- - -import registerImg from '../../../../cloud/img/register.png'; -import WCDcreate from '../../../../cloud/img/wcs-create.png'; -import WCDcreationProgress from '../../../../cloud/img/wcs-creation-progress.png'; - -## Overview - -import ReactPlayer from 'react-player/lazy' - - -
- -## Options for running Weaviate - -[Weaviate Cloud (WCD)](https://console.weaviate.cloud/) is a managed service that runs Weaviate for you. You can also manage your own Weaviate instances using Docker, Kubernetes, or Embedded Weaviate. - -No matter how you run Weaviate, the underlying code is identical. However, there are some operational differences to be aware of. - -### WCD (Recommended) - -[Weaviate Cloud (WCD)](https://console.weaviate.cloud/), is a managed SaaS service that requires no maintenance at your end. - -As it is managed by Weaviate (the company - the software is not sentient... *yet*). WCD is the fastest way to create a new instance of Weaviate and requires the least amount of effort for users. - -WCD has Weaviate instances that are pre-configured for your convenience. They include a number of Weaviate modules by default. They also have built-in support for user authentication. - -WCD includes a free "sandbox" tier. The WCD sandbox is our recommended method of running Weaviate in this course. - -### Docker and Kubernetes - -You can run Weaviate instances using containerization solutions such as [Docker](https://docs.docker.com/) and [Kubernetes](https://kubernetes.io/docs/home/). - -Running your own instance provides you with the same Weaviate code base as a WCD instance, but you have to manage configuration and deployment yourself. - -This course doesn't cover self-managed instances. We will cover Docker and Kubernetes in separate course units in the future. - -If you are familiar with either solution and want to use them to install Weaviate now, see the documentation for [Docker-Compose](/deploy/installation-guides/docker-installation.md) or [Kubernetes](/deploy/installation-guides/k8s-installation.md). - -### Embedded Weaviate - -We also have an experimental feature called [Embedded Weaviate](/deploy/installation-guides/embedded.md), where you can directly instantiate a Weaviate database from a client library. 
- -Currently, Embedded Weaviate is only recommended for evaluation purposes. - -## Get started with WCD - -### Sign in to WCD - -1. To access WCD, go to the [Weaviate Cloud Console](https://console.weaviate.cloud) - -1. If you don't have a WCD account, click on the "Register" button to create a new account. - -1. Click on "Sign in with the Weaviate Cloud" and sign in with your WCD username and password. - -### Create a Weaviate Cluster - -To create a new Weaviate Cluster, click the "Create cluster" button. - -On the Create a Cluster page: - -1. Select the **Free sandbox** plan tier. -1. Provide a *cluster name*. The sandbox URL is based on the cluster-name. WCD adds a suffix to ensure uniqueness. -1. Set the `Enable Authentication?` option to `YES`. -1. Press **Create** to create your sandbox instance. Note that the sandbox will expire after a set number of days. - -Create instance - -This starts the process to create a new instance. WCD displays a progress indicator while the sandbox builds. - -Creation in progress - -Instance creation takes a minute or two. WCD displays a checkmark (✔️) next to your sandbox when the instance is ready. - -
- Sandbox expiration & options - -import SandBoxExpiry from '/_includes/sandbox.expiry.mdx'; - - - -
- -There are several ways to work with your sandbox. - -- [Client libraries](/weaviate/client-libraries/index.mdx) -- [RESTful API](/weaviate/api/rest) -- [GraphQL API](/weaviate/api/index.mdx) - -## Install Weaviate client - -:::info Academy material in Python -For the initial release of Weaviate Academy units, our materials are written around Python examples.

-We are working to add examples for other client languages, starting with TypeScript. We appreciate your patience as we build up our educational material. -::: - -### Available clients - -Currently, Weaviate clients are available in: - -- Python -- TypeScript -- Java -- Go - -### Client capabilities - -import ClientCapabilitiesOverview from '/_includes/client.capabilities.mdx' - - - -### Installation - -Install your preferred client by following the relevant instructions below: - -import CodeClientInstall from '/_includes/code/quickstart/clients.install.mdx'; - - - -## Review - -### Review exercise - - - - - - - -### Key takeaways - -- There are multiple ways to run Weaviate. -- The recommended, easiest way to run a Weaviate instance is with WCD. -- Weaviate clients are available in multiple languages. -- Currently, the Academy material is available in Python only. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - -import Quiz from '/src/components/Academy/quiz.js' -export const instanceOptions = [{ - questionText: 'Which of the following is not true?', - answerOptions: [ - { - answerText: 'We recommend using WCD for Weaviate Academy.', - isCorrect: false, - feedback: 'WCD *is* our recommended option for running Weaviate.', - }, - { - answerText: 'The newest versions of Weaviate are available for WCD and locally hosted instances.', - isCorrect: false, - feedback: 'You can run the latest version of Weaviate anywhere, on Docker/Kubernetes or WCD.', - }, - { - answerText: 'This unit will cover Docker/Kubernetes deployment.', - isCorrect: true, - feedback: 'That topic will be discussed in a later unit.', - }, - ] -}]; -export const wcdSetup = [{ - questionText: 'Which of the following is necessary to configure a Weaviate instance for Weaviate Academy exercises?', - answerOptions: [ - { - answerText: 'A paid instance of WCD.', - isCorrect: false, - feedback: 'A free (sandbox) tier is sufficient for Weaviate Academy.', - }, - 
{ - answerText: 'OpenID Connect (OIDC) authentication.', - isCorrect: false, - feedback: 'You are welcome to use OIDC, but it is not necessary.', - }, - { - answerText: 'A self-hosted Docker or Kubernetes instance.', - isCorrect: false, - feedback: 'You are welcome to use a self-hosted instance, but it is not necessary.', - }, - { - answerText: 'None of the above are necessary.', - isCorrect: true, - feedback: 'Looks like you are ready to move on 😊.', - }, - ] -}]; -export const clientCapabilities = [{ - questionText: 'Which of the following is not true about Weaviate clients?', - answerOptions: [ - { - answerText: 'Weaviate clients are available for Python, TypeScript/JavaScript, Go and Java.', - isCorrect: false, - feedback: 'Clients are currently available for each of these languages.', - }, - { - answerText: 'There is only a small subset of GraphQL queries that Weaviate clients cannot perform.', - isCorrect: false, - feedback: 'These clients can perform all RESTful and GraphQL requests.', - }, - { - answerText: 'Weaviate clients come bundled with Weaviate.', - isCorrect: true, - feedback: 'The appropriate client must be installed separately for each language.', - }, - ] -}]; diff --git a/docs/academy/py/zero_to_mvp/101_hello_weaviate/50_hands_on.mdx b/docs/academy/py/zero_to_mvp/101_hello_weaviate/50_hands_on.mdx deleted file mode 100644 index 5f2c653e6..000000000 --- a/docs/academy/py/zero_to_mvp/101_hello_weaviate/50_hands_on.mdx +++ /dev/null @@ -1,281 +0,0 @@ ---- -title: Getting hands-on ---- - -### Preparation - -:::note Obtain a free trial OpenAI API key -This section includes queries using the OpenAI inference endpoint. If you don't already have an OpenAI account, we recommend creating one. At the time of writing, OpenAI provides trial credits which should be sufficient for these exercises. -::: - -## Overview - -import ReactPlayer from 'react-player/lazy' - - -
- -Now that you've set up your own Weaviate instance and installed a client, let's get hands-on with Weaviate. - -### Client instantiation - -Create a `client` object for persistent access to your Weaviate instance. You can set the following parameters: - -- **Host URL (required)** - - This is the location of your Weaviate instance. The URL resembles - - ``` - https://your-sandbox-uo2vgb1z.weaviate.network - ``` - -- **Authentication details (optional)** - - If authentication is enabled, you MUST provide your authentication information here. Otherwise the Weaviate instance will not provide access. - - ``` - AbCdEfGhIjKlMnOpAXB6gbBIaxHDuuwWj5OD - ``` -- **Additional headers (optional)** - - You can provide additional headers. These headers provide API keys for inference services such as Cohere, Hugging Face or OpenAI. - -A fully configured client resembles this sample code, edited to match your Weaviate instance. - - - -import ClientInstantiationCode from './_snippets/academy.hello.client.instantiation.mdx'; - - - -### Try it out! - -Now, connect to your Weaviate instance. - -1. Copy the client code below to a file called `connection_example.py`. -1. Edit the file to use your Weaviate URL -1. Edit the API key to use the key for your sandbox. - - You can find the API keys for your sandbox by clicking the "Details" button and looking for "API keys" in the Authentication tab. The key resembles this one. -1. Run the file. - - ``` - python3 connection_example.py - ``` - -import ClientCheckConnection from './_snippets/academy.hello.check.connection.mdx'; - - - -Congratulations! You've made your first request to a Weaviate API! The `meta` REST endpoint returns configuration details for your instance. - -The Weaviate API allows you to do quite a bit. We will try out some examples in the next sections. For more details, see the [API documentation](/weaviate/api/index.mdx). 
- -## Weaviate API and the client - -### Available APIs - -Weaviate uses two API types - REST APIs and GraphQL APIs. They work together to provide a rich set of functionality. - -:::tip REST APIs vs GraphQL APIs -The **REST API** provides: -- CRUD (Create, Read, Update and Delete) operations -- Metadata about the database - -The **GraphQL API** provides: -- Data searches -- Data object retrieval -- Information aggregation -- Vector space exploration -::: - -You will learn about these capabilities over the course of these units. - -
- What is REST? - -REST is an acronym for **RE**presentational **S**tate **T**ransfer. - -A REST API provides multiple endpoints, each with its own URL, that can be used to interact with the application. - -The endpoints are organized into a hierarchy, with each endpoint representing a resource. The client can then request information about these resources by sending a request to the server. - -
- -
- What is GraphQL? - -GraphQL is a query language for APIs. - -First released by Facebook in 2015, it is now maintained by the GraphQL Foundation. - -GraphQL is a specification for a query language that can be used to request information from a server. GraphQL is a **strongly typed** language, which means that the client must specify the type of data that it wants to receive. - -
- -### Access the REST API - -You can access the REST API and the GraphQL API with the Weaviate client libraries, or with other tools as long as the tool formats the request properly. - -In these examples, the code shown in each set of tabs is functionally the same. Both REST calls request meta information. Both GraphQL calls ask for the same data. - -#### Example 1: REST vs client requests - -import ClientRESTObjectGet from './_snippets/academy.clients.rest.object.get.mdx'; - - - -#### Example 2: GraphQL vs client requests - -import ClientGraphqlGet from './_snippets/academy.clients.graphql.get.mdx'; - - - -Now, let's try out more substantive queries. - -## Running queries - - - -### Connect to our demo instance - -Now let's try to connect to a Weaviate demo instance and run some queries on the sample data. The instance has these details: - -:::info Demo instance details -- `url`: `https://edu-demo.weaviate.network` -- `Weaviate API key`: `readonly-demo` -::: - -Use these instance details and see if you can: -- Instantiate a Weaviate client -- Check that the connection by fetching the metadata as we did above. - -Bonus points if you can do it without looking at the snippet below: - -import ClientCheckConnectionEdudemo from './_snippets/academy.hello.check.connection.edudemo.mdx'; - -
- Connect to the demo instance - - - -
- -### Vector searches - -The next query searches the `WikiCity` objects for the cities that are closest to the specified text, which in this case is simply, "Major European city". - -To run this query, update the connection file you just created. - -- Comment out any lines that request meta information -- Add the OpenAI authorization key you created earlier -- Add the following code snippet - -import QueryVectorSearch from './_snippets/academy.hello.query.vector.search.mdx'; - - - -:::tip Exercise -Try varying the query concept from "Major European city" to another - what do you see? Is it in line with what you expected? -::: - -### Question answering - -This example searches the `WikiCity` objects to answer the question, "When was the London Olympics?" Update your code from the last example, and try it out yourself. - -import QueryQuestionAnswering from './_snippets/academy.hello.query.question.answering.mdx'; - - - -:::tip Exercise -Try varying the question from, "When was the London Olympics?" to another, city-related, question. What do you see? - -Try to see what types of questions work better than others. Do you notice any patterns? -::: - -### Generative search - -This example also searches the `WikiCity` objects, but this one uses the Weaviate `generative-openai` module to transform the results. In this case, the module produces tweets about cities in Southeast Asia. - -Update your demo code once again, and try it out: - -import QueryGenerativeSearch from './_snippets/academy.hello.query.generative.search.mdx'; - - - -:::tip Exercise -Try varying the prompt from: - -``` -"Write a tweet with a potentially surprising fact from {wiki_summary}" -``` - -What happens if you remove \{wiki_summary}? -::: - -## Review - -### Review exercise - - - - - -### Key takeaways - -- Weaviate uses two API types, REST and GraphQL. REST is used for CRUD operations and metadata, while GraphQL is used for data searches, retrieving data objects, and exploring vector spaces. 
-- Client libraries are used to access both REST and GraphQL APIs, providing a convenient way to interact with Weaviate instances. -- You have connected to a demo Weaviate instance to run vector searches, question-answering queries, and generative searches. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - -import Quiz from '/src/components/Academy/quiz.js' -export const apiRecap = [{ - questionText: 'Which of the following is not true about the Weaviate API?', - answerOptions: [ - { - answerText: 'Weaviate users can use both REST and GraphQL.', - isCorrect: false, - feedback: 'They are both available for all users, and serve complementary roles in communicating with Weaviate.', - }, - { - answerText: 'The REST API can be used to retrieve instance configuration data.', - isCorrect: false, - feedback: 'the `meta` endpoint is available for this purpose.', - }, - { - answerText: 'Both GraphQL and REST APIs can be used in Weaviate to perform vector searches.', - isCorrect: true, - feedback: 'Only the GraphQL API performs vector searches.', - }, - { - answerText: 'None of the above.', - isCorrect: false, - feedback: 'The truth is out there!', - }, - ] -}]; -export const clientLimits = [{ - questionText: 'What can\'t Weaviate clients do?', - answerOptions: [ - { - answerText: 'Analyze the retrieved results.', - isCorrect: true, - feedback: 'They cannot perform any data analysis.', - }, - { - answerText: 'Communicate with the Weaviate REST API.', - isCorrect: false, - feedback: 'They can all perform REST API requests.', - }, - { - answerText: 'Communicate with the Weaviate GraphQL API.', - isCorrect: false, - feedback: 'They can all perform GraphQL API requests.', - }, - ] -}]; diff --git a/docs/academy/py/zero_to_mvp/101_hello_weaviate/90_wrap_up.mdx b/docs/academy/py/zero_to_mvp/101_hello_weaviate/90_wrap_up.mdx deleted file mode 100644 index c6572259e..000000000 --- 
a/docs/academy/py/zero_to_mvp/101_hello_weaviate/90_wrap_up.mdx +++ /dev/null @@ -1,28 +0,0 @@ ---- -title: Wrap-up -description: Wrap-up on Hello Weaviate -sidebar_position: 90 ---- - -## Unit review - -In this unit, we aimed to provide you with an overview of Weaviate. - -We did this by covering what Weaviate is, and what it can do, before discussing what vector database and vector search are before going on to run Weaviate and perform vector searches yourself. - -Now, you should have a foundation of knowledge from which we can begin to learn more about Weaviate, including more details on how to build a database and perform queries. Before long, you will be creating your own projects using Weaviate. - -### Learning outcomes - -Having finished this unit, you should be able to: -- Broadly describe what Weaviate is. -- Outline what vector search is. -- Create a Weaviate instance on WCD. -- Install your preferred Weaviate client (Python for Weaviate Academy). -- Describe some of Weaviate's capabilities. 
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.clients.graphql.get.mdx b/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.clients.graphql.get.mdx deleted file mode 100644 index 34a590b62..000000000 --- a/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.clients.graphql.get.mdx +++ /dev/null @@ -1,96 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```graphql -{ - Get { - WikiArticle { - title - wiki_summary - } - } -} -``` - - - - -```python -result = client.query.get("WikiArticle", ["title", "wiki_summary"]).do() -``` - - - - -```go -package main - -import ( - "context" - "fmt" - - "github.com/weaviate/weaviate-go-client/v5/weaviate" - "github.com/weaviate/weaviate-go-client/v5/weaviate/graphql" -) - -func main() { - cfg := weaviate.Config{ - Host: "WEAVIATE_INSTANCE_URL", // Replace WEAVIATE_INSTANCE_URL with your instance URL - Scheme: "http", - } - client := weaviate.New(cfg) - fields := []graphql.Field{ - {Name: "title"}, - {Name: "wiki_summary"}, - } - ctx := context.Background() - result, err := client.GraphQL().Get(). - WithClassName("WikiArticle"). - WithFields(fields...). 
- Do(ctx) - if err != nil { - panic(err) - } - fmt.Printf("%v", result) -} -``` - - - - -```java -package technology.semi.weaviate; - -import technology.semi.weaviate.client.Config; -import technology.semi.weaviate.client.WeaviateClient; -import technology.semi.weaviate.client.base.Result; -import technology.semi.weaviate.client.v1.graphql.model.GraphQLResponse; -import technology.semi.weaviate.client.v1.graphql.query.fields.Field; - -public class App { - public static void main(String[] args) { - Config config = new Config("https", "WEAVIATE_INSTANCE_URL"); - // Replace WEAVIATE_INSTANCE_URL with your instance URL - WeaviateClient client = new WeaviateClient(config); - - Field title = Field.builder().name("title").build(); - Field url = Field.builder().name("wiki_summary").build(); - - Result result = client.graphQL().get() - .withClassName("WikiArticle") - .withFields(title, url, wordCount) - .run(); - if (result.hasErrors()) { - System.out.println(result.getError()); - return; - } - System.out.println(result.getResult()); - } -} -``` - - - diff --git a/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.clients.rest.object.get.mdx b/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.clients.rest.object.get.mdx deleted file mode 100644 index db47ee32d..000000000 --- a/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.clients.rest.object.get.mdx +++ /dev/null @@ -1,87 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```shell -curl http://localhost:8080/v1/meta -``` - - - - -```python -import weaviate - -client = weaviate.Client("http://localhost:8080") - -print(client.get_meta()) -``` - - - - -```go -package main - -import ( - "context" - "fmt" - - "github.com/weaviate/weaviate-go-client/v5/weaviate" -) - -func main() { - cfg := weaviate.Config{ - Host: "WEAVIATE_INSTANCE_URL", - Scheme: "https", - } - client := weaviate.New(cfg) - - data, err := client.Data().ObjectsGetter(). 
- WithClassName("WikiArticle"). - WithID("36ddd591-2dee-4e7e-a3cc-eb86d30a4303"). - Do(context.Background()) - - if err != nil { - panic(err) - } - fmt.Printf("%v", data) -} -``` - - - - -```java -package technology.semi.weaviate; - -import java.util.List; -import technology.semi.weaviate.client.Config; -import technology.semi.weaviate.client.WeaviateClient; -import technology.semi.weaviate.client.base.Result; -import technology.semi.weaviate.client.v1.data.model.WeaviateObject; - -public class App { - public static void main(String[] args) { - Config config = new Config("https", "WEAVIATE_INSTANCE_URL"); - // Replace WEAVIATE_INSTANCE_URL with your instance URL - WeaviateClient client = new WeaviateClient(config); - - Result> result = client.data().objectsGetter() - .withClassName("WikiArticle") - .withID("36ddd591-2dee-4e7e-a3cc-eb86d30a4303") - .run(); - - if (result.hasErrors()) { - System.out.println(result.getError()); - return; - } - System.out.println(result.getResult()); - } -} -``` - - - diff --git a/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.hello.check.connection.edudemo.mdx b/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.hello.check.connection.edudemo.mdx deleted file mode 100644 index f1f23bc17..000000000 --- a/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.hello.check.connection.edudemo.mdx +++ /dev/null @@ -1,62 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -import weaviate -import json - -auth_config = weaviate.auth.AuthApiKey(api_key="readonly-demo") - -# Instantiate the client -client = weaviate.Client( - url="https://edu-demo.weaviate.network", - auth_client_secret=auth_config, - additional_headers={ - "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY", # Replace with your OpenAI key - } -) - -meta_info = client.get_meta() -print(json.dumps(meta_info, indent=2)) -``` - - - - -
- Expected output - -```json -{ - "hostname": "http://[::]:8080", - "modules": { - "generative-openai": { - "documentationHref": "https://beta.openai.com/docs/api-reference/completions", - "name": "Generative Search - OpenAI" - }, - "qna-openai": { - "documentationHref": "https://beta.openai.com/docs/api-reference/completions", - "name": "OpenAI Question & Answering Module" - }, - "ref2vec-centroid": {}, - "text2vec-cohere": { - "documentationHref": "https://docs.cohere.com/docs/embeddings", - "name": "Cohere Module" - }, - "text2vec-huggingface": { - "documentationHref": "https://huggingface.co/docs/api-inference/detailed_parameters#feature-extraction-task", - "name": "Hugging Face Module" - }, - "text2vec-openai": { - "documentationHref": "https://beta.openai.com/docs/guides/embeddings/what-are-embeddings", - "name": "OpenAI Module" - } - }, - "version": "1.18.2" -} -``` - -
\ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.hello.check.connection.mdx b/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.hello.check.connection.mdx deleted file mode 100644 index 52fa4f803..000000000 --- a/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.hello.check.connection.mdx +++ /dev/null @@ -1,59 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -import weaviate -import json - -auth_config = weaviate.auth.AuthApiKey(api_key="YOUR-WEAVIATE-API-KEY") # Replace with your Weaviate API key - -# Instantiate the client -client = weaviate.Client( - url="https://your-endpoint.weaviate.network", # Replace with your sandbox URL - auth_client_secret=auth_config, -) - -meta_info = client.get_meta() -print(json.dumps(meta_info, indent=2)) -``` - - - - -
- Expected output - -```json -{ - "hostname": "http://[::]:8080", - "modules": { - "generative-openai": { - "documentationHref": "https://beta.openai.com/docs/api-reference/completions", - "name": "Generative Search - OpenAI" - }, - "qna-openai": { - "documentationHref": "https://beta.openai.com/docs/api-reference/completions", - "name": "OpenAI Question & Answering Module" - }, - "ref2vec-centroid": {}, - "text2vec-cohere": { - "documentationHref": "https://docs.cohere.com/docs/embeddings", - "name": "Cohere Module" - }, - "text2vec-huggingface": { - "documentationHref": "https://huggingface.co/docs/api-inference/detailed_parameters#feature-extraction-task", - "name": "Hugging Face Module" - }, - "text2vec-openai": { - "documentationHref": "https://beta.openai.com/docs/guides/embeddings/what-are-embeddings", - "name": "OpenAI Module" - } - }, - "version": "1.18.2" -} -``` - -
\ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.hello.client.instantiation.mdx b/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.hello.client.instantiation.mdx deleted file mode 100644 index f06a773d9..000000000 --- a/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.hello.client.instantiation.mdx +++ /dev/null @@ -1,26 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -import weaviate - -# Only if authentication enabled; assuming API key authentication -auth_config = weaviate.auth.AuthApiKey(api_key="YOUR-WEAVIATE-API-KEY") # Replace with your Weaviate instance API key - -# Instantiate the client -client = weaviate.Client( - url="https://example.weaviate.network", - auth_client_secret=auth_config, # Only necessary if authentication enabled - additional_headers={ - "X-Cohere-Api-Key": "YOUR-COHERE-API-KEY", # Replace with your Cohere key - "X-HuggingFace-Api-Key": "YOUR-HUGGINGFACE-API-KEY", # Replace with your Hugging Face key - "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY", # Replace with your OpenAI key - } -) -``` - - - diff --git a/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.hello.query.generative.search.mdx b/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.hello.query.generative.search.mdx deleted file mode 100644 index 8a59bbc7a..000000000 --- a/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.hello.query.generative.search.mdx +++ /dev/null @@ -1,48 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -res = client.query.get( - "WikiCity", ["city_name", "wiki_summary"] -).with_near_text({ - "concepts": ["Popular Southeast Asian tourist destination"] -}).with_limit(3).with_generate( - single_prompt=\ - "Write a tweet with a potentially surprising fact from {wiki_summary}" -).do() - -for city_result in 
res["data"]["Get"]["WikiCity"]: - print(json.dumps(city_result["_additional"], indent=2)) -``` - - - - -
- Expected output - -```json -{ - "generate": { - "error": null, - "singleResult": " #FunFact: Bangkok is the world's most visited city, with over 22 million visitors in 2019! #Bangkok #Thailand #Travel" - } -} -{ - "generate": { - "error": null, - "singleResult": "Did you know that Ho Chi Minh City is home to many multinational companies and generates nearly a quarter of Vietnam's total GDP? #HCMC #Vietnam #Economy" - } -} -{ - "generate": { - "error": null, - "singleResult": "Surprising fact: Singapore is the only country in Asia with a AAA sovereign credit rating from all major rating agencies. #Singapore #AAA #CreditRating" - } -} -``` - -
\ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.hello.query.question.answering.mdx b/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.hello.query.question.answering.mdx deleted file mode 100644 index f5d3bc889..000000000 --- a/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.hello.query.question.answering.mdx +++ /dev/null @@ -1,54 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -ask = { - "question": "When was the London Olympics?", - "properties": ["wiki_summary"] -} - -res = ( - client.query - .get("WikiCity", [ - "city_name", - "_additional {answer {hasAnswer property result} }" - ]) - .with_ask(ask) - .with_limit(1) - .do() -) - -print(json.dumps(res, indent=2)) -``` - - - - -
- Expected output - -```json -{ - "data": { - "Get": { - "WikiCity": [ - { - "_additional": { - "answer": { - "hasAnswer": true, - "property": "wiki_summary", - "result": " 2012" - } - }, - "city_name": "London" - } - ] - } - } -} -``` - -
\ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.hello.query.vector.search.mdx b/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.hello.query.vector.search.mdx deleted file mode 100644 index 312ec9d69..000000000 --- a/docs/academy/py/zero_to_mvp/101_hello_weaviate/_snippets/academy.hello.query.vector.search.mdx +++ /dev/null @@ -1,64 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -res = client.query.get( - "WikiCity", ["city_name", "country", "lng", "lat"] -).with_near_text({ - "concepts": ["Major European city"] -}).with_limit(5).do() - -print(json.dumps(res, indent=2)) -``` - - - - -
- Expected output - -```json -{ - "data": { - "Get": { - "WikiCity": [ - { - "city_name": "Paris", - "country": "France", - "lat": 48.8566, - "lng": 2.3522 - }, - { - "city_name": "London", - "country": "United Kingdom", - "lat": 51.5072, - "lng": -0.1275 - }, - { - "city_name": "Madrid", - "country": "Spain", - "lat": 40.4167, - "lng": -3.7167 - }, - { - "city_name": "Berlin", - "country": "Germany", - "lat": 52.5167, - "lng": 13.3833 - }, - { - "city_name": "Budapest", - "country": "Hungary", - "lat": 47.4983, - "lng": 19.0408 - } - ] - } - } -} -``` - -
\ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/101_hello_weaviate/index.mdx b/docs/academy/py/zero_to_mvp/101_hello_weaviate/index.mdx deleted file mode 100644 index 4c3fce6ca..000000000 --- a/docs/academy/py/zero_to_mvp/101_hello_weaviate/index.mdx +++ /dev/null @@ -1,46 +0,0 @@ ---- -title: P3_101 Hello, Weaviate -description: "Get started with Weaviate: an introductory guide to set up and query data." -sidebar_position: 101 # Like a subject number (e.g. CS101) ---- - -import ZeroToMvpDeprecationNote from '/docs/academy/py/zero_to_mvp/_snippets/deprecation.md'; - - - -## Unit overview - -import ReactPlayer from 'react-player/lazy' - - -
- - - -Welcome! - -This is the beginning of your journey through the world of vectors with Weaviate. This unit will provide you with an overview of the fundamentals of Weaviate. - -You'll first gain an understanding of what Weaviate is, and what it can do. You will then learn about what vector database and vector search are before going on to run Weaviate and perform vector searches yourself. - -By the end of this unit, you will have a strong foundation of knowledge that will help you to effectively navigate the rest of the course, and for using Weaviate in your own projects. - -### Prerequisites - -- None - -## Learning objectives - -import LearningGoalsExp from '/src/components/Academy/learningGoalsExp.mdx'; - - - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/10_get.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/10_get.mdx deleted file mode 100644 index ef2d37850..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/10_get.mdx +++ /dev/null @@ -1,298 +0,0 @@ ---- -title: Get{} objects ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PythonCodeExample from '!!raw-loader!./_snippets/10_get.py'; - -:::info Make sure you complete Weaviate Academy Preparation -Make sure to complete the [Weaviate Academy Preparation](../setup.mdx) mini-unit before starting this unit to make sure that you can run the client library and connect to the demo Weaviate instance without issues. -


- -Below, you will see code snippets that do not include client instantiation details. Before running these snippets, make sure to instantiate the client as shown below. - - - - - - -::: - -## Overview - -import ReactPlayer from 'react-player/lazy' - - -
- -## About `Get` queries - -In many use cases, retrieving objects from a Weaviate instance may be the most common operation. - -For example, a user may want to retrieve a list of passages most closely related to the input query, or they may wish to retrieve a list of images which are most similar to another image. It is even possible to retrieve a set of images that best match a given passage. - -In Weaviate, such operations to retrieve objects are performed using the `Get` function. - -### `Get` function syntax - -A basic `Get` function looks as follows: - -import GetSyntax from './_snippets/academy.queries.get.syntax.mdx'; - - - -- The `Class` field specifies the name of the class of objects to be retrieved. -- The `arguments` argument specifies the search criteria to be used to retrieve the objects. -- The `properties` argument specifies the properties of the objects to be retrieved, including any `_additional` properties. - -Now let's try out some concrete `Get` queries. - -## Standalone `Get` queries - -A basic, standalone, `Get` query might look as follows: - -### Example - -import GetStandalone from './_snippets/academy.queries.get.standalone.mdx'; - - - - - - - - - - - - -What results do you expect? See if you can correspond each field to the syntax. - -Now, try it out yourself. The query should return something like this: - -
- See the JSON response - -```json -{ - "data": { - "Get": { - "JeopardyQuestion": [ - { - "answer": "Amazon.com", - "question": "On July 16, 1995 this company made its first sale, a science textbook" - }, - { - "answer": "Parfait", - "question": "The French name of this layered ice cream dessert means \"perfect\"" - } - ] - } - } -} -``` - -
- -### Response object from Weaviate - -As you see above, the response object from Weaviate is in JSON format, where the results are obtained via the `data` field. It is then arranged in a corresponding manner to reflect the query as well as the queried object. - -In the above example, the `Get` field reflects the query function, and the `JeopardyQuestion` field reflects the queried object class, containing returned objects. Each object contains `answer` and `question` fields, reflecting the requested properties. - -
- Explain this query - -In this case, Weaviate will return two objects due to the `.with_limit(2)` argument. Without this limit, Weaviate would return a maximum number according to its configuration. - -:::tip Is this query useful? -As this does not apply any user-specific criteria to the search, the specific results will likely not be very useful. - -However, this may be a viable method for "sanity" checks, such as checking that you can connect to a Weaviate instance, or that at least some objects have been imported successfully. -::: - -
- -### `Class` and `properties` - -In the above example, we specify a `Class` of `JeopardyQuestion` and `properties` of `questions` and `answer`. - -That is possible because those follow the structure of our data in Weaviate. To see the available data classes and properties, you can take a look at the Weaviate schema, as shown below: - -
- How do I see the schema? - -You can fetch the schema like this - try it out! - -```python -client.schema.get() -``` - -
- -import ShortSchema from './_snippets/academy.queries.schema.short.mdx'; - - - -The `Class` and `properties` fields must correspond to collections of objects that have been defined in the Weaviate schema. - -The `Class` must be the name of a data object collection, and the `properties` a list of properties to be retrieved. - -The schema contains the `JeopardyQuestion` class, with properties: `question`, `answer` and `points`. - -So, a query retrieving objects from the `Question` class could specify any of its properties such as `question`, `answer`, and `points`. - -:::note Exercise -Try out the above query again, with these changes. -- Can you get the `points` property as well? -- What happens if you don't specify any properties? -::: - -:::info Can I search multiple classes at once? -No. You can only search one class at a time. -


- -This is because each class constitutes a single `vector space`. If you want to search multiple collections of objects, you will have to perform multiple searches, or consider putting them into one class and using a filter to distinguish between them as required. -


- -We will consider this topic in more detail in a later unit, including what it means for each class to constitute a distinct `vector space`, and how to think about building a schema in Weaviate. -::: - -## `Get` with `additional` properties - -You can retrieve additional properties that are not defined in the schema. These properties may be inherent to the object, or relate to the query performed. - -### Example - -In this example, we've built on the previous example to add the `.with_additional` method. - -import GetWithAdditional from './_snippets/academy.queries.get.with.additional.mdx'; - - - -Again, consider what the response might look like. What would have changed in the response? - -Now, try it out yourself. The query should return something like this: - -
- See the JSON response - -```json -{ - "data": { - "Get": { - "JeopardyQuestion": [ - { - "_additional": { - "distance": 0.1791926, - "id": "b1645a32-0c22-5814-8f35-58f142eadf7e" - }, - "answer": "escaping the Earth's gravity (and go off into outer space, on your way to the moon, for instance)", - "question": "It takes approximately 24,840 MPH to achieve this" - }, - { - "_additional": { - "distance": 0.18123823, - "id": "ef263438-b152-5540-97f7-99f4076bd124" - }, - "answer": "the Milky Way", - "question": "This is the name of our own galaxy" - } - ] - } - } -} -``` - -
- -### Response object from Weaviate - -In this response, you see that the `_additional` field has been added to the response object, with `distance` and `id` nested under it. - -
- Explain this query - -Here, the `distance` and `id` properties contain the object's distance to the query and its unique ID respectively. - -
- -### `_additonal` properties - -As the name suggests, `_additional` properties are separate to those explicitly created as class properties. - -The above query included the `distance` property in the `_additional` field. This represents the degree of similarity (or, in this case, the *dissimilarity*) between the input vector and the vector of the object. - -:::info Vectors available through `_additional` - -The object vector can also be retrieved through an `_additional` field, by specifying `vector` in its sub-field. - -Note that the returned vector in many cases will be a very long list of numbers. - -::: - -:::note Exercise -Try out the above query again, with these changes. -- Can you get the `vector` property as well? -- Earlier, providing an empty list as the main properties under the `.get()` method will have caused an error. What happens if you try it again, now that you are requesting additional properties? -::: - -## Review - -### Review exercise - -See in-line exercises. - -### Key takeaways - -- The 'Get' function can be used to retrieve objects in Weaviate. -- The 'Get' function syntax requires specifying the class, properties, and any additional arguments related to the search criteria. -- Weaviate responses are in JSON format. -- Class and properties fields must correspond to the objects and properties defined in the Weaviate schema. -- 'Get' queries can retrieve additional properties not defined in the schema, which can be inherent to the object or related to the query performed. -- '_additional' properties can include distance, id, and vector information, providing more context and information about the retrieved objects. 
- -import Quiz from '/src/components/Academy/quiz.js' -export const nearText = [{ - questionText: 'questionText', - answerOptions: [ - { - answerText: 'answerOne', - isCorrect: false, - feedback: 'feedbackOne', - }, - { - answerText: 'answerTwo', - isCorrect: false, - feedback: 'feedbackTwo', - }, - { - answerText: 'answerThree', - isCorrect: false, - feedback: 'feedbackThree', - }, - ] -}]; - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/20_vector_parameters.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/20_vector_parameters.mdx deleted file mode 100644 index bea69b77c..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/20_vector_parameters.mdx +++ /dev/null @@ -1,299 +0,0 @@ ---- -title: Search operators -description: Learn to adjust vector parameters for optimized data search in Weaviate. ---- - -## Overview - -import ReactPlayer from 'react-player/lazy' - - -
- -Weaviate offers multiple vector search "operators", through which you can carry out vector searches. Typically, our users use one of `nearVector`, `nearObject` or `near` methods (e.g. `nearText`). We will review those methods one by one in this section. - -## `nearVector` - -The `nearVector` operator can be used to look for objects which are the most similar to an explicitly provided vector. - -The vector value is passed using the `vector` argument as shown below. - -### Example - -:::note Vector truncated for brevity -The vector shown below is truncated. If you would like to run the cell yourself, you can see the full vector below. -::: - -
- See the whole vector - -[0.023932384327054024, -0.014095712453126907, 0.013304559513926506, -0.01155742909759283, -0.01147831417620182, 0.015321999788284302, -0.025013625621795654, -0.04198386147618294, 0.0006061387248337269, -0.008940030820667744, 0.013475975953042507, 0.0021558923181146383, 0.008148877881467342, -0.0022696207743138075, 0.014623147435486317, 0.0010969009017571807, 0.032199934124946594, -0.016746075823903084, 0.007700557820498943, 0.014293501153588295, -0.004793069791048765, 0.009223527275025845, 0.01840749755501747, -0.007628035265952349, -0.0017108687898144126, -0.001233704504556954, 0.01867121458053589, -0.030696744099259377, 0.012150794267654419, -0.003728309413418174, 0.030512141063809395, -0.004667803645133972, -0.005402917042374611, -0.013779250904917717, -0.013344116508960724, -0.026727791875600815, 0.020596355199813843, -0.008043390698730946, -0.0035865609534084797, -0.007120378781110048, 0.0073379455134272575, 0.031145064160227776, -0.009137819521129131, -0.00028988346457481384, -0.009823485277593136, 0.007482990622520447, 0.0011644785990938544, -0.012856239452958107, 0.001305402722209692, 0.007911532185971737, 0.011432163417339325, 0.033914100378751755, -0.0353645458817482, -0.013845180161297321, -0.009896007366478443, -0.009717998094856739, -0.022824769839644432, 0.008280736394226551, 0.029299039393663406, -0.02130839228630066, 0.008485117927193642, 0.004756808280944824, -0.021071046590805054, 0.03462613746523857, -0.021756712347269058, 0.0076082563027739525, 0.0020471089519560337, -0.013205665163695812, 0.00041803380008786917, 0.0013144679833203554, 0.019475553184747696, 0.0022762136068195105, -0.001709220465272665, -0.0005315560265444219, 0.0039063189178705215, -0.027110181748867035, 0.008676312863826752, -0.009843263775110245, -0.02209954522550106, 0.012394732795655727, 0.016772447153925896, -0.016179082915186882, -0.012078272178769112, 0.018684400245547295, 0.02439389005303383, 0.021321577951312065, -0.012282652780413628, 
0.013726507313549519, -0.030169308185577393, -0.007898345589637756, 0.0166669599711895, 0.0005167219205759466, -0.0011076144874095917, 0.028085937723517418, -0.004760105162858963, 0.0016292810905724764, -0.0074698044918477535, 0.02565973438322544, 0.0024525749031454325, -0.036393046379089355, -0.01155742909759283, 0.013937481679022312, -0.025527875870466232, -0.023549992591142654, -0.014201199635863304, -0.005650152452290058, 0.011267339810729027, 0.022152289748191833, 0.025686105713248253, -0.01213101577013731, 0.011418977752327919, 0.021743526682257652, -6.031512748450041e-05, -0.010172910988330841, -0.010120167396962643, -0.026450887322425842, 0.020042547956109047, 0.006981926970183849, -0.012295839376747608, -0.04338156431913376, 0.023958755657076836, 0.014675891026854515, 0.022442378103733063, -0.00022127565171103925, 0.012427697889506817, 0.03164612874388695, -0.020464494824409485, -0.01867121458053589, 0.0167856328189373, -0.02187538519501686, 0.009632290340960026, 0.022297333925962448, 0.007318167015910149, -0.0076148491352796555, -0.01660103164613247, 0.018499799072742462, -0.004489794373512268, 0.007779672741889954, -0.0037349022459238768, -0.022112730890512466, 0.0036524904426187277, 0.012691415846347809, -0.015638461336493492, 0.0141352703794837, -0.017220767214894295, 0.035944726318120956, 0.01574394851922989, 0.005607298109680414, 0.009625696577131748, -0.021611668169498444, 0.013581463135778904, -0.004196408204734325, 0.013541905209422112, -0.007113785482943058, 0.0005550433997996151, 0.012889203615486622, -0.010060830973088741, 0.02295662835240364, -0.0015551104443147779, -0.007786266040056944, -0.003810721216723323, 0.003098683198913932, -0.0011685991194099188, -0.01275075227022171, 0.02200724370777607, 0.03061762824654579, 0.0014512715861201286, -0.004245855379849672, -0.01964697055518627, -0.01104977261275053, -0.018104221671819687, 0.006741284392774105, -0.038871992379426956, 0.011122294701635838, -0.017181210219860077, 0.0328855998814106, 
7.489171548513696e-05, 0.004971079062670469, -0.031303294003009796, -0.023906011134386063, -0.005834754556417465, 0.003945876378566027, 0.0012196945026516914, 0.009137819521129131, -0.01106955111026764, -0.028666116297245026, 0.008768614381551743, -0.0029849549755454063, 0.010153132490813732, -0.015308814123272896, -0.0037085304502397776, -0.002592674922198057, -0.010252026841044426, 0.0013853422133252025, -0.6903074979782104, -0.03264825418591499, 0.00757529167458415, -0.021769898012280464, 0.005010636989027262, 0.013528719544410706, 0.006810510065406561, 0.008940030820667744, -0.009856450371444225, -0.00023652183881495148, -0.02009529061615467, 0.023906011134386063, -0.023958755657076836, -0.005762232467532158, -0.011689288541674614, -0.014504474587738514, 0.007080820854753256, -0.018051479011774063, 0.014385801739990711, 0.012856239452958107, -0.02067546918988228, 0.02763761766254902, 0.005946834571659565, 0.020345821976661682, 0.015335185453295708, 0.00432497076690197, 0.03797535225749016, -0.013284780085086823, -0.012803495861589909, -0.00042730511631816626, -0.007094006985425949, 0.015651647001504898, -0.012645265087485313, -0.001946566510014236, 0.05511700361967087, -0.005521589890122414, -0.003125054994598031, 0.019225021824240685, 0.009071889333426952, 0.023009371012449265, -0.008300515823066235, -0.0010746497428044677, 0.021954501047730446, 0.010100388899445534, -0.016970235854387283, 0.008485117927193642, 0.04230032488703728, -0.019541483372449875, -0.009038925170898438, -0.030643999576568604, 0.0005055963410995901, 0.03618207201361656, -0.004559020046144724, 0.014807750470936298, 0.009329014457762241, 0.019000861793756485, 0.020279893651604652, 0.019501926377415657, 0.0006320984102785587, 0.0028465031646192074, -0.000699264055583626, 0.028797974810004234, -0.02056998200714588, 0.001127393334172666, -0.014794564805924892, 0.020464494824409485, -0.0181305930018425, 0.020016174763441086, -0.016653774306178093, -0.01939643919467926, 0.029773730784654617, 
0.021071046590805054, -0.013845180161297321, 0.030643999576568604, 0.014952794648706913, 0.026543188840150833, 0.025198228657245636, -0.007535734213888645, -0.017827318981289864, 0.022086359560489655, 0.0005212545511312783, -0.0073313526809215546, -0.04918335750699043, -0.024090614169836044, 0.03494259715080261, -0.004084328189492226, -0.028507886454463005, 0.009137819521129131, 0.022152289748191833, 0.009203748777508736, 0.0019432699773460627, 0.027558501809835434, -0.023760966956615448, -0.02416973002254963, 0.00514249550178647, -0.03014293685555458, -0.018486613407731056, -0.007304980885237455, 0.005656745284795761, -0.008827950805425644, 0.008623569272458553, -0.016587844118475914, 0.028929835185408592, 0.01545385830104351, -0.004028288181871176, -0.012849646620452404, 0.004885370843112469, 0.017576785758137703, 0.012988097965717316, -0.021638039499521255, -0.009553174488246441, -0.008227992802858353, -0.00912463292479515, 0.020583167672157288, -0.0167856328189373, -0.02674097754061222, 0.007858788594603539, -0.014174827374517918, 0.017352625727653503, -0.017154838889837265, 0.04003234952688217, -0.007852194830775261, 0.01170247420668602, 0.004951300099492073, -0.005396323744207621, -0.005828161723911762, 0.007970868609845638, -0.03336029127240181, -0.016930678859353065, -0.0029025431722402573, -0.0031563714146614075, 0.019739272072911263, 0.009797113947570324, -0.0014908292796462774, 0.004908446222543716, 0.008274143561720848, 0.014504474587738514, -0.019409624859690666, -0.022982999682426453, -0.021980872377753258, -0.03172524273395538, -0.0033986622001975775, -0.0009329014574177563, -0.0034052550327032804, -0.008544454351067543, -0.006929183378815651, 0.0042656343430280685, 0.0009007608750835061, -0.006569867953658104, -0.029562756419181824, -0.0016894417349249125, 0.006151216104626656, -0.0074698044918477535, 0.013528719544410706, 0.011966192163527012, -0.018776701763272285, -0.003073959844186902, -0.027743104845285416, -0.02500043995678425, 
-0.017339440062642097, 0.023892825469374657, 0.0060753971338272095, -0.008775207214057446, -0.0004223604337312281, -0.007219272665679455, -0.010252026841044426, -0.016640588641166687, 0.015124212019145489, -0.007713743485510349, -0.010014680214226246, 0.0026569559704512358, -0.007041263394057751, -0.014913237653672695, 0.018565727397799492, -0.017405370250344276, -0.0010021273046731949, 0.0036030435003340244, 0.00573915708810091, -0.0007532437448389828, -3.229512003599666e-05, -0.010898134671151638, -0.007839009165763855, 0.001443030429072678, 0.004476608242839575, 0.015137397684156895, 0.01714165136218071, 0.03380861133337021, 0.016113152727484703, -0.009447687305510044, 0.022020429372787476, -0.005152385216206312, 0.03586561232805252, 0.006688540801405907, -0.002876171376556158, -0.012816681526601315, -0.0023124748840928078, -0.001210629241541028, -0.020978745073080063, 0.0031217585783451796, 0.018750330433249474, 0.009487245231866837, -0.022152289748191833, 0.007806044537574053, -0.02025352045893669, 0.004793069791048765, -0.016139524057507515, -0.019950246438384056, -0.029773730784654617, 0.01933050900697708, -0.012955132871866226, 0.008115912787616253, 0.0029651762451976538, 0.002904191380366683, 0.012928761541843414, 0.034837111830711365, 0.019198650494217873, -0.017919618636369705, 0.010977250523865223, -0.02038538083434105, -0.005564444232732058, -0.023892825469374657, 0.01181455422192812, 0.01020587608218193, -0.007812637835741043, -0.008181842043995857, 0.02079414203763008, -0.01039707101881504, -0.011564021930098534, 0.002449278486892581, -0.038212697952985764, 0.0007363493205048144, 0.003395365783944726, -0.00129963387735188, 0.027083810418844223, 0.029615500941872597, 0.024631235748529434, -0.008794985711574554, -0.006915997248142958, 0.029167180880904198, 0.007529140915721655, -0.0042854128405451775, 0.005640262737870216, 0.029905589297413826, -0.006101768929511309, -0.004753511864691973, 0.011267339810729027, 0.01545385830104351, 
-0.004868888296186924, -0.012388139963150024, 0.0011809609131887555, -0.04034881293773651, -0.010627823881804943, -0.026503631845116615, 0.00977733451873064, 0.012394732795655727, -0.0054820324294269085, -0.01822289451956749, 0.006319336127489805, 0.02391919679939747, 0.024723537266254425, 0.005158978048712015, -0.0021575407590717077, 0.007456618826836348, 0.0139111103489995, 0.007502769120037556, -0.008834543637931347, -0.0024707054253667593, 0.01596810854971409, 0.002129520522430539, 0.015823064371943474, 0.00874224305152893, -0.032991088926792145, 0.011155259795486927, -0.012183759361505508, 0.020279893651604652, 0.004845813382416964, -0.0030442913994193077, -0.0077335224486887455, 0.01327818725258112, 0.006016060709953308, 0.0021047971677035093, -0.03892473503947258, 0.007212679833173752, 0.014517661184072495, -0.014082526788115501, -0.027690360322594643, 0.0006259175133891404, 0.02604212611913681, -0.0019498629262670875, 0.0015748892910778522, -0.02111060358583927, 0.001626808661967516, 0.0029388044495135546, -0.020055733621120453, -0.008794985711574554, -0.017102094367146492, 0.008485117927193642, -0.012295839376747608, 0.017985548824071884, 0.011735438369214535, -0.0017636122647672892, 0.020055733621120453, -0.0020108476746827364, -0.0040579563938081264, 0.03449427708983421, 0.020055733621120453, -0.013416639529168606, -0.016930678859353065, 0.006180884316563606, -0.009427908807992935, 0.0046381354331970215, 0.0022531382273882627, -0.013977039605379105, -0.013113363645970821, 0.0014776433818042278, -0.004338156431913376, -0.02455211989581585, -0.02500043995678425, 0.021928129717707634, -0.005890794564038515, -0.00726542342454195, -0.014174827374517918, -0.02203361690044403, -0.018842631950974464, 0.06730076670646667, -0.010416850447654724, -0.008636755868792534, 0.021044675260782242, 0.008320294320583344, -0.004657914396375418, -0.012869425117969513, -0.013871552422642708, -0.008590605109930038, -0.004275523591786623, 0.006902811583131552, 
0.003566782223060727, -0.0010655844816938043, 0.005353469867259264, 0.0153747433796525, 0.019185464829206467, 0.010172910988330841, -0.011155259795486927, 0.018869003280997276, -0.014069341123104095, 0.005894090980291367, -0.027584875002503395, 0.01314632873982191, 0.018644843250513077, 0.016047224402427673, -0.029272668063640594, 0.01675926148891449, -0.000627565779723227, -0.005386434495449066, -0.03278011456131935, -0.0004099986399523914, 0.008715870790183544, -0.008214807137846947, -0.007759894244372845, 0.002576192608103156, -0.007581884507089853, 0.001857561757788062, 0.008300515823066235, 0.005468846298754215, -0.027004694566130638, 0.014412174001336098, 0.014623147435486317, -0.0011933227069675922, -0.02038538083434105, 0.0038634645752608776, -0.006042432505637407, -0.019792014732956886, 0.021980872377753258, -0.010489372536540031, -0.0018839335534721613, 0.026991508901119232, 0.009783927351236343, 0.001620215829461813, -0.03183072805404663, 0.006981926970183849, 0.0184734258800745, 0.016099967062473297, -0.0034085516817867756, -0.01962059922516346, -0.03262188285589218, -0.01043003611266613, -0.024130171164870262, 0.02410379983484745, 0.016179082915186882, -0.02067546918988228, -0.03557552024722099, -0.0009081779280677438, -0.008373037911951542, -0.0137396939098835, 0.008986181579530239, -0.008669720031321049, -0.030327538028359413, -0.004924928303807974, -0.019449181854724884, 0.007792858872562647, 0.005449067335575819, 0.02101830206811428, -0.02079414203763008, 0.019805200397968292, 0.0044172718189656734, -0.0031992257572710514, -0.0012485386105254292, 0.004555723629891872, -0.009164190851151943, -0.0008335952297784388, 0.021638039499521255, 0.0077401152811944485, 0.0009032331872731447, -0.0077401152811944485, 0.0019877722952514887, 0.014280314557254314, -0.005017229821532965, 0.007515955250710249, -0.0070148915983736515, 0.015361557714641094, 0.02041175216436386, 0.01213101577013731, -0.0013721563154831529, 0.0015946681378409266, 0.02387963980436325, 
0.0044172718189656734, -0.015664832666516304, -0.002815186744555831, -0.0028184831608086824, 0.003744791727513075, 0.01179477572441101, -0.01685156300663948, 0.008946623653173447, -0.010799241252243519, -0.017497671768069267, 0.009216934442520142, -0.02919355221092701, 0.003922800999134779, -0.01774820312857628, 0.0009922379394993186, 0.008234585635364056, 0.0011100867995992303, 0.01084539107978344, -0.010528930462896824, -0.03185710310935974, -0.007311573717743158, -0.02969461679458618, 0.0046315426006913185, 0.03750066086649895, 0.006006170995533466, -0.008267550729215145, -0.009058703668415546, -0.0047699944116175175, 0.0002550644858274609, 0.0018460240680724382, -0.002813538536429405, 0.012335396371781826, -0.009520210325717926, 0.0054523637518286705, -0.03122417815029621, -0.007107192650437355, -0.007080820854753256, -0.0042788200080394745, -0.0025069667026400566, -0.007186308037489653, -0.02092600241303444, 0.0075950706377625465, 0.01189366914331913, -0.015585717745125294, -0.006797324400395155, -0.019515112042427063, -0.0021839123219251633, 0.0021377617958933115, -0.025580618530511856, 0.03014293685555458, 0.00684347515925765, 0.0008265902288258076, -0.019515112042427063, -0.028481515124440193, -0.03541729226708412, -0.0385291613638401, -0.027400271967053413, -0.026622304692864418, 0.033281177282333374, 0.02610805444419384, 0.022152289748191833, -0.017959177494049072, 0.03349215164780617, -0.0019729381892830133, 0.005725970957428217, 0.003622822230681777, 0.022666538134217262, -0.008794985711574554, -0.024288402870297432, 0.010819019749760628, 0.005656745284795761, 0.001946566510014236, 0.010759683325886726, 0.003340973984450102, 0.0007503593224100769, 0.029905589297413826, -0.016482356935739517, 0.002592674922198057, 0.0033063609153032303, -0.027769476175308228, -0.011517872102558613, 0.0021328171715140343, -0.007542327046394348, -0.0033162503968924284, -0.012249688617885113, -0.01758997142314911, 0.03354489430785179, 0.025738850235939026, 
-0.005623780656605959, -0.0001497834309702739, 0.0182492658495903, -0.011735438369214535, 0.041772887110710144, -0.012506812810897827, 0.02153255231678486, -0.008834543637931347, -0.0038239071145653725, -0.002765739569440484, 0.011616765521466732, -0.0022383041214197874, 0.015823064371943474, 0.002080073580145836, 0.0016836728900671005, 0.02219184674322605, 0.0013350709341466427, 0.0074698044918477535, 0.0013152922037988901, 0.011854112148284912, 0.004334860015660524, -0.022125916555523872, -0.006457787938416004, -0.021888570860028267, -0.00907848309725523, -0.03889836370944977, -0.01592855155467987, -0.011715659871697426, 0.009731183759868145, 0.038397300988435745, -0.03934668377041817, -0.011339861899614334, 0.011821147054433823, -0.011102516204118729, 0.021123789250850677, 0.0019317322876304388, 0.02385326847434044, 0.032068073749542236, -0.011801368556916714, -0.014267128892242908, -0.02254786528646946, -0.017022978514432907, -0.005297429859638214, 0.014346243813633919, 0.013581463135778904, -0.0009889415232464671, -0.024209287017583847, -0.0007462387438863516, 0.0018921747105196118, -0.02705743908882141, -0.022244589403271675, 0.0153747433796525, 0.004483201541006565, 0.025910265743732452, -0.0155989034101367, -0.008373037911951542, -0.008168656378984451, 0.021123789250850677, 0.016231825575232506, -0.00213446537964046, -0.01660103164613247, -0.017128465697169304, -0.01694386452436447, 0.014952794648706913, 0.0010252026841044426, 0.010997029021382332, 0.008623569272458553, -0.006438008975237608, 0.0176822729408741, 0.004687582608312368, 0.009487245231866837, 0.0012749104062095284, -0.012572742998600006, 0.03132966533303261, 0.006352300755679607, 0.03148789703845978, 0.007957682013511658, 0.0002802000963129103, 0.01022565457969904, -0.00923671294003725, 0.0007136861095204949, 0.02903532236814499, -0.015387929044663906, -0.010680567473173141, 0.010515743866562843, 0.0028943021316081285, 0.0184734258800745, -0.005538072437047958, -0.0017174617387354374, 
0.015229698270559311, -0.00958613958209753, 0.0035634858068078756, 0.01669333130121231, 0.0015963163459673524, -0.0008438967051915824, -0.019225021824240685, -0.004601874388754368, 0.009467466734349728, -0.025105927139520645, -0.010278398171067238, 0.01583625003695488, -0.026701420545578003, -0.020490868017077446, -0.010074017569422722, 0.0018114111153408885, 0.008584012277424335, -0.02572566457092762, 0.00958613958209753, -0.0032272457610815763, 0.025606991723179817, -0.03539091721177101, 0.016271384432911873, 0.018077850341796875, 0.022297333925962448, -0.020016174763441086, 0.004430457949638367, 0.0136869503185153, -0.011768403463065624, 0.015783505514264107, -0.011867297813296318, -0.0135155338793993, -0.0057028960436582565, -0.012566149234771729, 0.010884949006140232, -0.008748835884034634, -0.005126013420522213, -0.02086007222533226, 0.029404526576399803, -0.0021641335915774107, -0.028217796236276627, -0.012170572765171528, 0.0013952315784990788, 0.007667592726647854, 0.014860494062304497, 0.02108423225581646, -0.039610400795936584, 0.016337312757968903, -0.022442378103733063, 0.011959598399698734, 0.003134944476187229, -0.022112730890512466, 0.01984475925564766, -0.014623147435486317, 0.0172339528799057, 0.0005542192957364023, -0.02216547541320324, -0.0029503421392291784, -0.003777756355702877, -0.014161641709506512, 0.006381968967616558, -0.016904305666685104, 0.006606128998100758, 0.0135155338793993, 0.02887709066271782, 0.025606991723179817, 0.009401536546647549, 0.005946834571659565, 0.0014356133760884404, -9.935771231539547e-05, 0.0010936044855043292, -0.012684823013842106, -0.0027574985288083553, 0.020332636311650276, 0.021967686712741852, 0.014398987405002117, -0.020965559408068657, -0.010469594039022923, 0.001205684500746429, -0.018658028915524483, -0.02998470515012741, 0.0014298445312306285, -0.00483262725174427, 0.010245434008538723, 0.009197155945003033, -0.005307319108396769, 0.02536964602768421, -0.0022926959209144115, 0.01035092119127512, 
0.0026800313498824835, -0.009012552909553051, 0.04053341597318649, -0.0031547232065349817, -0.006902811583131552, -0.003113517304882407, -0.031145064160227776, -0.012473848648369312, 0.010522337630391121, 0.002930563176050782, 0.011623358353972435, -0.0023306054063141346, 0.013126550242304802, 0.02718929760158062, -6.428119377233088e-05, -0.015190141275525093, -0.006711616180837154, 0.01685156300663948, 0.008438967168331146, -0.030670370906591415, -0.0014825881225988269, 0.002033923054113984, -0.007278609089553356, 0.0038964294362813234, -0.005923759192228317, 0.019805200397968292, 0.0051227170042693615, 0.004282116424292326, 0.015427486971020699, -0.034230560064315796, 0.017471298575401306, -0.040849875658750534, 0.0021542441099882126, 0.002892653690651059, 0.015321999788284302, -0.0010029515251517296, 0.010080610401928425, -0.00695555517449975, 0.01894811913371086, 0.0034645916894078255, 0.011412384919822216, -0.005719378124922514, -0.004397492855787277, 0.002235007705166936, -0.0046249497681856155, -0.011412384919822216, -0.008656534366309643, -0.00885432306677103, 0.008715870790183544, -0.014029783196747303, 0.012632079422473907, -0.007707150653004646, 0.01650873012840748, 0.023365391418337822, -0.006589646451175213, -0.022982999682426453, -0.0005290837143547833, -0.008709277957677841, -0.015704389661550522, -0.0062962607480585575, 0.0015468692872673273, -0.005379841662943363, 0.00585123710334301, -0.001642466988414526, -0.0037744599394500256, -0.010548708960413933, -0.00768737168982625, 0.008808172307908535, 0.016970235854387283, 0.023826897144317627, 0.22004607319831848, -0.025303715839982033, -0.015018724836409092, 0.03336029127240181, -0.014517661184072495, 0.018895374611020088, 0.02387963980436325, -0.004390900023281574, 0.006002874579280615, 0.014794564805924892, -0.017695458605885506, 0.027110181748867035, 0.004262337926775217, -0.00915759801864624, -0.005735860671848059, 0.0004409030661918223, -0.01917227916419506, -0.013950667344033718, 
0.0026338808238506317, -0.01136623416095972, 0.007792858872562647, -0.0023685148917138577, -0.011972784996032715, -0.013192479498684406, 0.009711405262351036, -0.012730972841382027, -0.0030822008848190308, 0.0018278934294357896, 0.019027233123779297, 0.01020587608218193, 0.011155259795486927, -0.014201199635863304, -0.009902600198984146, -0.000843484653159976, 0.017563600093126297, 0.011623358353972435, 0.021387508139014244, -0.012440883554518223, 0.0011603579623624682, 0.009197155945003033, -0.022745653986930847, 0.006935776211321354, -0.009137819521129131, -0.0063094464130699635, -0.004460126161575317, -0.0011661268072202802, -0.01901404745876789, 0.010667381808161736, 0.011939819902181625, 0.009737776592373848, -0.034520652145147324, -0.014504474587738514, 0.010515743866562843, 0.04034881293773651, -0.029483642429113388, -0.0035371140111237764, 0.018750330433249474, 0.008234585635364056, -0.013752879574894905, 0.007370910607278347, 0.0017322958447039127, 0.016640588641166687, -0.016205454245209694, -0.0019531594589352608, 0.002096555894240737, 0.020266707986593246, -0.018842631950974464, -0.017669087275862694, 0.028613373637199402, 0.01806466467678547, 0.01203871425241232, 0.0010144890984520316, -0.03011656366288662, -0.0017850393196567893, -0.024538934230804443, -0.0027921113651245832, 0.011603579856455326, 0.0057984935119748116, 0.008096134290099144, 0.018262453377246857, -0.005514997057616711, -0.004799662623554468, -0.002014144090935588, -0.023760966956615448, -0.026886021718382835, -0.031303294003009796, 0.011577208526432514, -0.00917737651616335, -0.015664832666516304, 0.007819230668246746, 0.0028349654749035835, 0.006714912597090006, 0.015440672636032104, -0.004921631887555122, 0.008412595838308334, -0.0027228854596614838, 0.00585123710334301, 0.01127393264323473, -0.006378672551363707, -0.015440672636032104, -0.027611246332526207, 0.0023882936220616102, 0.0029948444571346045, -0.0015147286467254162, -0.014240757562220097, 0.0010804185876622796, 
-0.006546792574226856, -0.01507146842777729, 0.0015773616032674909, 0.00542269553989172, -0.005983096081763506, -0.007806044537574053, 0.008972995914518833, 0.00010152102186111733, -0.01977882906794548, 0.02140069380402565, 0.018341567367315292, -0.0009535044082440436, 0.025290530174970627, -0.0007726105395704508, -0.00203557126224041, -0.006622611545026302, 0.006935776211321354, 0.006556681822985411, -0.006658872589468956, -0.026859650388360023, -0.008096134290099144, 0.017273511737585068, -0.004041474312543869, -0.029325410723686218, -0.0063918582163751125, -0.0029025431722402573, 0.019159093499183655, -0.02378733828663826, 0.006316039711236954, 0.01930413767695427, 0.004011806100606918, -0.009355386719107628, -0.010166318155825138, -0.013416639529168606, -0.007753300946205854, 0.018170151859521866, 0.002277861814945936, 0.004948003683239222, 0.0019218429224565625, -0.01971290074288845, 0.017893247306346893, -0.002117983065545559, 0.0033508634660393, -0.02022714912891388, 0.0006386913591995835, -0.022692909464240074, -0.012302432209253311, 0.0026816795580089092, 0.03362400829792023, -0.00992237962782383, -0.01948874071240425, -0.028507886454463005, -0.0013861663173884153, 0.00684347515925765, -0.01653510145843029, 0.007542327046394348, 0.016455985605716705, -0.014372616074979305, -0.010370699688792229, 0.0004413151182234287, -0.17067810893058777, 0.013779250904917717, -0.0009724590927362442, -0.011168445460498333, 0.015308814123272896, -0.012632079422473907, -0.009408130310475826, -0.0020273299887776375, 0.006988519802689552, 0.002660252619534731, 0.006431416142731905, 0.0009312531910836697, -0.05643559247255325, -0.008748835884034634, 0.013581463135778904, 0.013344116508960724, -0.011517872102558613, 0.011768403463065624, 0.01758997142314911, -0.009282863698899746, 0.03307020291686058, -0.029615500941872597, -0.006721505429595709, -0.02079414203763008, 0.007700557820498943, -0.009526803158223629, -0.0151505833491683, 0.0028679303359240294, -0.013297966681420803, 
0.0027294785249978304, -0.018552541732788086, -0.0032585621811449528, 0.04222120717167854, 0.004740326199680567, 0.03230542317032814, 0.009025739505887032, -0.008293922059237957, 0.0026849762070924044, -0.017273511737585068, 0.016034036874771118, 0.0011323379585519433, 0.020490868017077446, 0.014583590440452099, -0.0017075722571462393, -0.019963432103395462, -0.003761274041607976, -0.004628246184438467, -0.011682695709168911, 0.004687582608312368, 0.003373938612639904, 0.006368783302605152, -0.03275374323129654, -0.017220767214894295, -0.0034547022078186274, 0.037922609597444534, 0.014227570965886116, 0.024763094261288643, 0.006058914586901665, 0.007766487076878548, -0.02238963544368744, -0.028059566393494606, -0.018011920154094696, 0.001116679748520255, -0.03926756978034973, -0.0018295417539775372, -0.02423565834760666, -0.01583625003695488, 0.005432585254311562, -0.012599114328622818, 0.024116985499858856, -0.0019993099849671125, 0.0008875749772414565, 0.0007441784837283194, -0.0030162713956087828, 0.011867297813296318, 0.025422388687729836, -0.012374954298138618, 0.020504053682088852, -0.00885432306677103, -0.02500043995678425, -0.03937305510044098, 0.038871992379426956, 0.01020587608218193, -0.0008513137581758201, -0.002080073580145836, 0.0025910267140716314, -0.01583625003695488, 0.007654407061636448, -0.02194131538271904, -0.004206297919154167, 0.017181210219860077, -0.026846464723348618, 0.004921631887555122, -0.013713321648538113, 0.025870708748698235, 0.024763094261288643, 0.012236502021551132, 0.010179503820836544, 0.023576363921165466, -0.007674186024814844, -0.004203001037240028, 0.0139111103489995, 0.006550088990479708, 0.0015740651870146394, 0.031171435490250587, 0.016587844118475914, 0.014702263288199902, 3.2604162697680295e-05, 0.017985548824071884, -0.019923873245716095, 0.008293922059237957, -0.02209954522550106, -0.00284155854023993, 0.024182915687561035, -0.011148666962981224, 0.02397194132208824, -0.014095712453126907, -0.01443854533135891, 
0.017537228763103485, 0.009678440168499947, 0.04754830524325371, -0.019027233123779297, -0.03043302521109581, -0.0134034538641572, -0.017260326072573662, -0.0012691415613517165, -0.09361979365348816, -0.02658274583518505, -0.006484159734100103, 0.0014405581168830395, 0.0032832857687026262, 0.005874312482774258, -0.005188646260648966, -0.00885432306677103, -0.016495544463396072, 0.014319872483611107, 0.006022653542459011, -0.02358955144882202, -0.012783716432750225, -0.020372195169329643, 0.039926864206790924, -0.0139111103489995, 0.004470015410333872, -0.013884738087654114, -0.007779672741889954, 0.018196523189544678, -0.00850489642471075, 0.002144354861229658, 0.011372826993465424, 0.008003832772374153, -0.019027233123779297, -0.006408340763300657, -0.030960461124777794, 0.028217796236276627, 0.020820515230298042, -0.007845601998269558, -0.027294784784317017, -0.002777277259156108, 0.0045425379648804665, -0.015124212019145489, -0.023985126987099648, -0.004918335471302271, 0.0015443969750776887, -0.021347949281334877, 0.028534257784485817, -0.04522759094834328, -0.012249688617885113, 0.01586262136697769, -0.0170097928494215, -0.02047768048942089, -0.005610594525933266, -0.013001283630728722, -0.0027789254672825336, 0.03014293685555458, 0.005267761647701263, -0.02734752744436264, -0.03275374323129654, -0.020082104951143265, -0.037474289536476135, 0.014148456044495106, 0.008748835884034634, 0.010443221777677536, 0.005818272475153208, 0.00779945170506835, 0.0001228965847985819, 1.5928653738228604e-05, -0.006002874579280615, 0.01020587608218193, -0.029773730784654617, 0.027822220697999, -0.007080820854753256, -0.00799723993986845, -0.03539091721177101, -0.0040579563938081264, 0.015163769014179707, -0.003744791727513075, -0.009678440168499947, 0.02302255667746067, -0.012427697889506817, 0.001398528111167252, -0.05298089236021042, -0.007001705467700958, -0.0019564558751881123, -0.006263296119868755, 0.04557042196393013, -0.008577419444918633, -0.02419610135257244, 
-0.00706763518974185, 0.030195679515600204, 0.00038033039891161025, 0.024314774200320244, 0.0385291613638401, 0.0186975859105587, -0.008564232848584652, 0.013647392392158508, 0.012684823013842106, -0.0033146021887660027, -0.0013803974725306034, 0.013647392392158508, -0.04330245032906532, -0.009335607290267944, 0.007839009165763855, -0.020662283524870872, -0.0015452210791409016, 0.026253100484609604, 0.020820515230298042, -0.02635858580470085, -0.0058116791769862175, -0.06608766317367554, 0.02813868224620819, 0.006896218750625849, 0.015229698270559311, 0.010258619673550129, -0.010990436188876629, 0.019673341885209084, -0.0059303524903953075, -0.0018839335534721613, 0.007944496348500252, -0.0061709946021437645, 0.01243429072201252, -0.0002400243392912671, -0.004941410850733519, 0.0034975563175976276, -0.0031481303740292788, 0.015796691179275513, 0.008458745665848255, 0.005561147350817919, 0.0002383761020610109, -0.010627823881804943, 0.021374322474002838, 0.01955466903746128, 0.002106445375829935, -0.0025646549183875322, 0.0060819899663329124, -0.01062123104929924, -0.001493301591835916, -0.01608678139746189, 0.002991548040881753, 0.014728634618222713, -0.021796269342303276, -0.024657607078552246, 0.028481515124440193, -0.028534257784485817, -0.018750330433249474, -0.014623147435486317, 0.009322421625256538, 0.005152385216206312, 0.03275374323129654, -0.007509362418204546, 0.0011076144874095917, 0.02980010211467743, 0.0032157080713659525, -0.015335185453295708, 0.01662740297615528, 0.00958613958209753, 0.022363262251019478, 0.002279510023072362, 0.011234374716877937, -0.010383885353803635, 0.026068497449159622, -0.02327308990061283, -0.004743622615933418, 0.003174502169713378, -0.006975333672016859, 0.018038293346762657, -0.0016432910924777389, -0.0019201947143301368, -0.006556681822985411, 0.02840239927172661, 0.0139111103489995, 0.017036164179444313, 0.00675776693969965, -0.0052117216400802135, -0.01860528625547886, -0.023259904235601425, -0.001598788658156991, 
0.012091457843780518, -0.04541219398379326, 0.0060325427912175655, -0.014583590440452099, 0.009038925170898438, 0.0018427276518195868, -0.017603158950805664, -0.00652701361104846, 0.006922590080648661, 0.0038964294362813234, -0.029114436358213425, 0.01930413767695427, 0.03523268923163414, -0.01031795609742403, -0.036366675049066544, 0.0017339440528303385, 0.022059988230466843, 0.006006170995533466, 0.006197366397827864, 0.028771603479981422, -0.0073313526809215546, 0.010074017569422722, -0.00547873554751277, 0.003995323553681374, -0.0017388887936249375, -0.00240147951990366, -0.008880694396793842, 0.04406723380088806, -0.012790309265255928, 0.002683327766135335, 0.020833700895309448, 0.03188347443938255, -0.004288709722459316, -0.006672058254480362, 0.013304559513926506, -0.003932690713554621, -0.012315617874264717, 0.03494259715080261, -0.03125055134296417, -0.036234814673662186, -0.007239051628857851, 0.0047205472365021706, -2.2856394934933633e-05, 0.014280314557254314, 0.00726542342454195, 0.006513827946037054, -0.014280314557254314, -0.00041597351082600653, 0.005867719184607267, -0.03974226117134094, -0.02165122516453266, 0.02232370525598526, 0.0074632116593420506, 0.04622971639037132, -0.012348582036793232, -0.026028938591480255, 0.03272736817598343, 0.0184734258800745, 0.04172014445066452, 0.0034052550327032804, 0.00010394187120255083, 0.008972995914518833, 0.0014356133760884404, -0.015229698270559311, 0.002439389005303383, 0.0035041493829339743, 0.002167430007830262, 0.012684823013842106, -0.003751384560018778, 0.011755217798054218, -0.031145064160227776, 0.05263805761933327, 0.007628035265952349, -0.011175038293004036, 0.025765221565961838, -0.016165897250175476, -0.00207842537201941, 0.023233531042933464, 0.01222990918904543, -0.02795407921075821, -0.03425693139433861, 0.003929394297301769, 0.006675355136394501, 0.0353645458817482, -0.008788392879068851, -0.006975333672016859, 0.002627287758514285, -0.00023528565361630172, -0.009223527275025845, 
-0.013344116508960724, -0.015031910501420498, 0.027136553078889847, -0.0034876668360084295, 0.016904305666685104, 0.01662740297615528, -0.027558501809835434, -0.006975333672016859, 0.01412208378314972, 0.020359007641673088, 0.00684347515925765, -0.02347087673842907, -0.0034876668360084295, 0.00757529167458415, -0.021796269342303276, -0.026991508901119232, 0.002261379500851035, -0.0137396939098835, 0.0013045786181464791, -0.030775858089327812, 0.008452152833342552, 0.01209805067628622, 0.006411637179553509, 0.0358128659427166, -0.015242884866893291, -0.003301416290923953, -0.004516166169196367, 0.012559556402266026, -0.028323283419013023, -0.0023882936220616102, -0.0357337519526481] - -
- -import NearVectorSimple from './_snippets/academy.queries.nearVector.simple.mdx'; - - - -Before looking at the response, or running the query, think about the following: -- How many objects do you expect the response to contain? -- What properties do you expect to see in each object? -- Can you tell what the vector represents? - -
- See the JSON response - -```json -{ - "data": { - "Get": { - "JeopardyQuestion": [ - { - "answer": "escaping the Earth's gravity (and go off into outer space, on your way to the moon, for instance)", - "question": "It takes approximately 24,840 MPH to achieve this" - }, - { - "answer": "the Milky Way", - "question": "This is the name of our own galaxy" - } - ] - } - } -} -``` - -
- -
- Explain this query

- Because there was a `limit` of 2, the results contained (at most) two objects.
- Each object included `question` and `answer` properties, as specified in the `Get` function.
- And since the vector was provided externally, we have no way of telling what it represents. (Although we can infer from the results that it relates to something space-related.)

- -## `nearObject` - -The `nearObject` operator can be used to look for objects which are the most similar to an existing Weaviate object. - -The object's identity is passed using the `id` argument as shown below. - -### Example - -import NearObjectSimple from './_snippets/academy.queries.nearObject.simple.mdx'; - - - -Before looking at the response, or running the query, think about the following: -- There is no vector provided here. Where does the vector come from? - -
- See the JSON response - -```json -{ - "data": { - "Get": { - "JeopardyQuestion": [ - { - "answer": "escaping the Earth's gravity (and go off into outer space, on your way to the moon, for instance)", - "question": "It takes approximately 24,840 MPH to achieve this" - }, - { - "answer": "66,000", - "question": "Of 6,000, 66,000 or 666,000 MPH, the closest to the speed of the Earth around the sun" - } - ] - } - } -} -``` - -
- -
- Explain this query - -- When it comes to `nearObject` queries, Weaviate uses the vector of the object used as the query vector. - -
- -## `nearText` - -The `nearText` operator can be used to look for objects which are the most similar to a text input. - -The text is passed using the `concept` argument as shown below. - -### Example - -import NearTextSimple from './_snippets/academy.queries.nearText.simple.mdx'; - - - -Before looking at the response, or running the query, think about the following: -- Once again, there is no vector provided here. Where does the vector come from this time? - -
- See the JSON response - -```json -{ - "data": { - "Get": { - "JeopardyQuestion": [ - { - "answer": "escaping the Earth's gravity (and go off into outer space, on your way to the moon, for instance)", - "question": "It takes approximately 24,840 MPH to achieve this" - }, - { - "answer": "the Milky Way", - "question": "This is the name of our own galaxy" - } - ] - } - } -} -``` - -
- -
- Explain this query - -With `nearText` Weaviate converts the input text to a vector. - -Accordingly, `nearText` requires some means of converting an input medium to a vector. This is called a `vectorizer` in Weaviate. - -:::info How do vectorizers work? -In Weaviate, vectorizers are added as optional `modules`. You will learn more about modules and vectorizers in later units. -::: - -
- -### `near` - -You saw `nearText` here as we are dealing with texts. There are additional operators available for other media, such as `nearImage` for images. We refer to them generically as the `near` operator. - -The principle with all of these operators are the same, which is to retrieve outputs closest to the input medium. - -## Adding a threshold - -Vector search is based on similarity, it does not inherently exclude any of the results. By default, vector search returns a system defined maximum number of results. To limit the number of results, set a threshold or limit. - -### Distance thresholds - -A threshold value such as `distance` or `certainty` specifies the maximum distance (or, looked from the other direction, minimum similarity) required to retrieve an object. In this example, `distance` is an `additional` parameter that limits the results. - -import GetWithAdditional from './_snippets/academy.queries.get.with.additional.mdx'; - - - -This is the response: - -
- See the JSON response - -```json -{ - "data": { - "Get": { - "JeopardyQuestion": [ - { - "_additional": { - "distance": 0.1791926, - "id": "b1645a32-0c22-5814-8f35-58f142eadf7e" - }, - "answer": "escaping the Earth's gravity (and go off into outer space, on your way to the moon, for instance)", - "question": "It takes approximately 24,840 MPH to achieve this" - }, - { - "_additional": { - "distance": 0.18123823, - "id": "ef263438-b152-5540-97f7-99f4076bd124" - }, - "answer": "the Milky Way", - "question": "This is the name of our own galaxy" - } - ] - } - } -} -``` - -
- -:::tip Exercise: Threshold distances -Try changing one of the above queries to: -- Use a `certainty` threshold rather than `distance`. -- Return `certainty` as well as `distance`. -Do they behave as you expected? -::: - -In this context, `distance` measures the degree of difference. You might also see `certainty` used in some contexts. This is the opposite of `distance`, where higher values imply larger differences in meaning. - -We will cover later on what these values mean, exactly, and where they come from. But for now, remember that: - -- `distance` is a measure of dissimilarity (lower is more similar), and -- `certainty` is a measure of similarity (higher is more similar) - -### `limit` threshold - -Several of the earlier examples use the `limit` clause. Use `limit` to limit the number of objects returned. - -In this Python example, `with_limit()` sets a limit of two responses. - -``` -response = client.query.get( - "JeopardyQuestion", - ["question", "answer"] -).with_limit(2) -``` - - -## Review - -### Review exercise - - - -### Key takeaways - -:::info `nearVector` / `nearObject` availability -Both `nearVector` and `nearObject` are available in Weaviate at all times, whereas `nearText` is only available when a vectorizer module is enabled and in use. -::: - -- Multiple search operators are available to help perform vector searches. -- `nearVector` can be used to find objects closest to an input vector. -- `nearObject` can be used to find objects closest to an existing Weaviate object. -- `nearText` can be used to find objects closest to an input text. - - Other `near` operators are available for other object types. -- You might use a `nearVector` query when you are using your own vectorizer, or have a library of vectors already available. A `nearObject` is a useful query for finding similar objects to an existing one. 
- -import Quiz from '/src/components/Academy/quiz.js' -export const nearVectorFunction = [{ - questionText: 'On what basis does the nearVector operator perform a search?', - answerOptions: [ - { - answerText: 'Similarity to a given text input', - isCorrect: false, - feedback: 'That would be nearText', - }, - { - answerText: 'Similarity to a provided vector', - isCorrect: true, - feedback: 'So if you have the query vector handy, nearVector is the operator to use.', - }, - { - answerText: 'Similarity to an existing Weaviate object', - isCorrect: false, - feedback: 'That would be nearObject', - }, - ] -}]; - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/40_aggregate.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/40_aggregate.mdx deleted file mode 100644 index 5767a285d..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/40_aggregate.mdx +++ /dev/null @@ -1,357 +0,0 @@ ---- -title: Aggregate{} the result set -description: Explore aggregate queries in Weaviate to summarize data insights. ---- - -## Overview - -import ReactPlayer from 'react-player/lazy' - - -
- -Now that you have seen how to retrieve individual objects with `Get`, let's take a look at how to compile information with `Aggregate`. - -`Aggregate` is a powerful function that allows you to combine information from multiple objects into a single result to get a quick overview of the results. - -## About `Aggregate` queries - -### `Aggregate` function syntax - -While the overall structure of `Aggregate` queries is similar to `Get` queries, there are some important differences as the queries relate to sets of results. - -The basic syntax for `Aggregate` queries is as follows: - -import AggregateSyntax from './_snippets/academy.queries.aggregate.syntax.mdx'; - - - -Unlike a `Get` query, available properties in `Aggregate` differ according to data types of the property being queried. - -These reflect the possible operations that can be performed on different data types. For example, the available properties for a `String` property are different from those for an `Integer` property or a cross-reference. - -Let's try out some `Aggregate` queries. - -As a reminder, our objects include the following schema: - -
- See relevant schema - -import ShortSchema from './_snippets/academy.queries.schema.short.mdx'; - - - -
- -## Standalone `Aggregate` queries - -### Example 1 - -Take a look at this query: - -import AggregateStandalone from './_snippets/academy.queries.aggregate.standalone.mdx'; - - - -What kind of results do you expect to come back? - -Now, try it out yourself. - -Your query should return something like this: - -
- See the JSON response - -```json -{ - "data": { - "Aggregate": { - "JeopardyQuestion": [ - { - "meta": { - "count": 10000 - } - } - ] - } - } -} -``` - -
- -### Response object from Weaviate - -This response includes an object to represent the `meta` information requested from `JeopardyQuestion` class. The `meta` object contains a `count` property, which is the total number of objects in the class. - -
- Explain this query - -This query aggregates the objects in the `JeopardyQuestion` class to obtain the total count. Since there are no restrictions, it returns the total number of objects which is 10,000. - -
- -### `meta` property - -In the above `Aggregate` query we requested a `meta` property, for the count of the objects. Note that this is not an available property of the object class itself. This is a key difference between `Aggregate` and `Get` queries. - -A `Get` query retrieves a set of individual results, so we can select properties (e.g. `id`, or one of the properties unique to the data, such as `answer`) that apply to each of those individual results. - -An `Aggregate` query, on the other hand, returns an aggregation of the results. Accordingly, we must specify a sub-property that applies to the entire set of results. - -The `meta` property is one such property. It is available for all data types, and can be used with the `count` sub-property to return the number of retrieved objects. - - -### Example 2 - -Take a look at this query: - -import AggregateStandalone2 from './_snippets/academy.queries.aggregate.standalone.2.mdx'; - - - -What fields do you expect back in the results? - -Now, try it out yourself. - -Your query should return something like this: - -
- See the JSON response - -```json -{ - "data": { - "Aggregate": { - "JeopardyQuestion": [ - { - "answer": { - "count": 10000, - "topOccurrences": [ - { - "occurs": 19, - "value": "Australia" - }, - { - "occurs": 18, - "value": "Hawaii" - }, - { - "occurs": 16, - "value": "Boston" - }, - { - "occurs": 15, - "value": "French" - }, - { - "occurs": 15, - "value": "India" - } - ] - } - } - ] - } - } -} -``` -
- -
- Explain this query - -This response includes an object to represent aggregations from the `answer` property requested from `JeopardyQuestion` class. Because the property contains textual information, we can aggregate `topOccurrences` information, such as the `value` property, which is the token, as well as the number of times it `occurs`. - -
- -:::tip Available properties -The list of available properties can be found on [this page](/weaviate/api/graphql/aggregate.md) in our documentation. -::: - -## `Aggregate` with a search operator - -As we did with `Get` queries, we can also use search operators such as `nearText` in an `Aggregate` query. Take a look: - -### Example (with `nearText`) - -For example, let's say that now instead of individual questions, we would like to know something more holistic about the answers. Like how many questions might be related to this query: - -import AggregateWithNearText from './_snippets/academy.queries.aggregate.with.neartext.mdx'; - - - -Before looking at the response, or running the query, think about the following: -- How many results do you expect to be returned? -- Can you guess how an increase in the `distance` parameter would change the number of results returned? - -Now, try it out yourself. The query should return something like this: - -
- See the JSON response - -```json -{ - "data": { - "Aggregate": { - "JeopardyQuestion": [ - { - "meta": { - "count": 9 - } - } - ] - } - } -} -``` - -
- -
- Explain this query - -This query aggregates the results that were restricted using the `distance` argument. This argument is a threshold that restricts the returned results to those that are relevant to the input. Without it, the search results would potentially include the entire class. - -This is called "limiting the search space". - -
- -### Limit search space - -In order to produce meaningful aggregations with a vector search, you must limit the search space. - -This is different from aggregations in, say, a relational database. In a relational database, grouping or aggregating data can be done using `groupby` with functions such as SUM, AVG, MIN, MAX, etc. This allows you to find a result set and then aggregate the results. - -However, a vector search does not inherently exclude any results. This is because a vector search retrieves results based on *degrees* of similarity. - -**Accordingly, the search space must be limited** so that only relevant results are included in the aggregation. This can be done by setting an explicit `limit` or a threshold (`distance` or `certainty`) in the query. - -## `Aggregate` with `groupBy` - -So far, we have seen how to use `Aggregate` queries to compile information relating one set of results. This can be extended with the `groupBy` argument to compile information from multiple, subsets of results. - -### Example - -For example, let's say we want to know how many questions there are for each available `value` property. We can do this by adding the `groupBy` argument to the query: - -import AggregateGroupby from './_snippets/academy.queries.aggregate.groupby.mdx'; - - - -What do you expect to see here? How will the results differ, now that we've added the `groupBy` argument? Do you notice what else has changed to the query? - -Now, try it out yourself. The query should return something like this: - -
- See the JSON response - -```json -{ - "data": { - "Aggregate": { - "JeopardyQuestion": [ - { - "groupedBy": { - "path": [ - "round" - ], - "value": "Double Jeopardy!" - }, - "meta": { - "count": 5 - } - }, - { - "groupedBy": { - "path": [ - "round" - ], - "value": "Jeopardy!" - }, - "meta": { - "count": 3 - } - }, - { - "groupedBy": { - "path": [ - "round" - ], - "value": "Final Jeopardy!" - }, - "meta": { - "count": 1 - } - } - ] - } - } -} -``` - -
- -
- Explain this query - -This query supplies an additional `groupedBy` argument, as a result of which the counts are of each `round`. The query also requests `groupedBy` a property so that each count is identifiable by the `value` of each `round` group. - -
- -### `groupBy` + `groupedBy` - -Results identified by an `Aggregate` query can be further grouped by using a `groupBy` argument. This argument takes a list of properties as an argument, and will group the results by the values of those properties. - -This is a particularly useful query pattern for identifying characteristics for subsets of results of a vector search. - -When the `groupBy` argument is used, additional property `groupedBy` is made available. This property and its sub-properties can be used to identify the group that the result belongs to. - -:::note Exercise -Try out the above query again, with these changes. -- Instead of `round` try grouping by the `points` property. -- Instead of `distance`, try adding an `.with_object_limit(9)` in the method chain. Are the results the same? -::: - -## Review - -### Review exercise - -Try out the above `nearText` query again, with these changes. -- Change the distance to another value - say, to 0.1, 0.19, 0.21 or 0.25 - how do the results change? Are they in line with your expectations? - -### Key takeaways - -- The `Aggregate` function is used to compile information from multiple objects, providing an overview. -- Search operators, like `nearText`, can be used in `Aggregate` queries. - - To produce meaningful aggregations, the search space must be limited by setting an explicit limit or a threshold (distance or certainty) in the query. -- The `groupBy` argument can be used to compile information from multiple subsets of results, refining the aggregation. -- When using the groupBy argument, the additional property groupedBy is made available, helping to identify the group that the result belongs to. 
- -import Quiz from '/src/components/Academy/quiz.js' -export const varName = [{ - questionText: 'questionText', - answerOptions: [ - { - answerText: 'answerOne', - isCorrect: false, - feedback: 'feedbackOne', - }, - { - answerText: 'answerTwo', - isCorrect: false, - feedback: 'feedbackTwo', - }, - { - answerText: 'answerThree', - isCorrect: false, - feedback: 'feedbackThree', - }, - ] -}]; - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/50_filters.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/50_filters.mdx deleted file mode 100644 index df293d960..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/50_filters.mdx +++ /dev/null @@ -1,365 +0,0 @@ ---- -title: Filters -description: Apply filters in Weaviate to refine search results with precision. ---- - -## Overview - -import ReactPlayer from 'react-player/lazy' - - -
- -## Available operators - -So far, you've seen different query functions such as `Get`, and `Aggregate`, and search operators such as `nearVector`, `nearObject` and `nearText`. - -Now, let's take a look at filters. - -A filter is a way to specify additional criteria to be applied to the results. There are a number of available filters in Weaviate. - -### Available filters - -There exist many available filters, but we do not need to cover them all at this moment. For now, let's explore a few of the most commonly used filters: - -- `where`: Apply a Boolean condition to filter the available data. -- `limit`: Restrict the maximum objects to be retrieved. -- `offset`: For pagination of search results. - -## Filter data with `where` - -The `where` filter is analogous to the `WHERE` clause in a SQL query. As in the SQL clause, the `where` filter can be used to apply a boolean conditional to the data. - -### Single operand example - -We ran an example query like this earlier: - -import GetWithAdditional from './_snippets/academy.queries.get.with.additional.mdx'; - - - -Which returned these answers: - -```json -{ - "data": { - "Get": { - "JeopardyQuestion": [ - { - "_additional": { - "distance": 0.1791926, - "id": "b1645a32-0c22-5814-8f35-58f142eadf7e" - }, - "answer": "escaping the Earth's gravity (and go off into outer space, on your way to the moon, for instance)", - "question": "It takes approximately 24,840 MPH to achieve this" - }, - { - "_additional": { - "distance": 0.18123823, - "id": "ef263438-b152-5540-97f7-99f4076bd124" - }, - "answer": "the Milky Way", - "question": "This is the name of our own galaxy" - } - ] - } - } -} -``` - -So let's extend our query to now include a `where` argument that uses a `Like` operator. - -import FilterWhereLike from './_snippets/academy.queries.filter.where.like.mdx'; - - - -Can you guess how you would expect the earlier response to change, if at all? - -Here is the actual response: - -
- See the JSON response - -```json -{ - "data": { - "Get": { - "JeopardyQuestion": [ - { - "_additional": { - "distance": 0.18400955, - "id": "ddcc3f06-5410-5944-85c4-3cb56ab27088" - }, - "answer": "space shuttles", - "question": "These transports, first sent up in 1981, lift off like a rocket & land like a plane" - }, - { - "_additional": { - "distance": 0.2267003, - "id": "36ffe6ca-9b73-5a54-80eb-a93f01822956" - }, - "answer": "Robert Goddard", - "question": "He's been called the \"Father of Modern Rocketry\"" - } - ] - } - } -} -``` - -
- -
- Explain this query - -Observe that the results have changed. The previous results have been removed as they do not contain the text `rocket` in the `question` property. - -This approach of combining a vector search with a filter is a powerful way to find objects that are similar to a given input, but also meet additional criteria as you see. And while filtering may remove some objects which might otherwise be "closer" to the query vector than the remaining ones, it provides a powerful strategy to find the most relevant objects by removing false positives. - -
- -We can apply the query to filter the data in any number of ways. For example, consider this query: - -import FilterWhereGreater from './_snippets/academy.queries.filter.where.greater.mdx'; - - - -How do you expect that this query will be different to the earlier queries? - -
- See the JSON response - -```json -{ - "data": { - "Get": { - "JeopardyQuestion": [ - { - "_additional": { - "distance": 0.18251508, - "id": "15f06117-012c-506d-b5c5-24df2e750f35" - }, - "answer": "the Milky Way", - "points": 400, - "question": "Into the 20th Century it was thought the universe was one big galaxy--this one" - }, - { - "_additional": { - "distance": 0.19289112, - "id": "584a6c68-0ebe-561f-b32a-3a735eadf02e" - }, - "answer": "Asteroid", - "points": 400, - "question": "A 1991 photo of Gaspra taken by the Galileo probe was the first close-up of one of these minor planets" - } - ] - } - } -} -``` - -
- -
- Explain this query - -This query has been modified to only return `JeopardyQuestion` objects with a `points` value greater than 200. - -Accordingly, the returned data set is very different. - -
- -:::note Exercise -Try filtering for `JeopardyQuestion` objects with: -- a `points` value equal to 200 -- a `points` value greater than or equal to 600 - -You can find the list of available operators on [this page](/weaviate/api/graphql/filters#filter-structure). -::: - -### Multiple operands example - -The query syntax can extend to beyond a single operand to take advantage of multiple conditions: - -import FilterMultipleOperands from './_snippets/academy.queries.filter.multiple.operands.mdx'; - - - -Take a look at the `where` argument (i.e. `.with_where`). What limitations do you expect in the results? - -
- See the JSON response - -```json -{ - "data": { - "Get": { - "JeopardyQuestion": [ - { - "_additional": { - "distance": 0.2267003, - "id": "a488fbe5-c2c6-50ad-8938-4b9f20dc56d1" - }, - "answer": "Robert Goddard", - "points": 400, - "question": "He's been called the \"Father of Modern Rocketry\"" - }, - { - "_additional": { - "distance": 0.24946856, - "id": "c00decd4-4cf1-5b03-a789-a57077e082fb" - }, - "answer": "Huntsville", - "points": 1000, - "question": "A campus for the University of Alabama is here, nicknamed \"Rocket City, U.S.A.\"" - } - ] - } - } -} -``` - -
- -
- Explain this query - -This query has been modified to only return `JeopardyQuestion` objects with a `points` value greater than 400, AND include the text `rocket` in the `question` field. - -
- -You can apply these filters to an `Aggregate` query also. Try it yourself. - -:::note Exercise -Try these: -- adding a `where` filter to an `Aggregation` query, following the above pattern. -::: - -## Result pagination with `offset` - -When you query for data, you can use the `offset` operator to skip a number of results. This is useful for pagination, where you want to show a certain number of results per page. - -The `offset` operator works in conjunction with the existing `limit` operator to shows results from the `offset+1` to `offset+1+limit`. - -For example, to list the first ten results, set `limit`: 10. Then, to "display the second page of 10", set `offset`: 10, `limit`:10 and so on. - -The syntax, using `offset` is as follows: - -import FilterPaginationNeartext from './_snippets/academy.queries.filter.pagination.nearText.mdx'; - - - -
- See the JSON response - -```json -{ - "data": { - "Get": { - "JeopardyQuestion": [ - { - "answer": "the Milky Way", - "question": "Into the 20th Century it was thought the universe was one big galaxy--this one" - }, - { - "answer": "space shuttles", - "question": "These transports, first sent up in 1981, lift off like a rocket & land like a plane" - } - ] - } - } -} -``` - -
- -
- Explain this query - -This query retrieves the next 2 results (`limit`: 2) after the first 2 results (`offset`: 2). - -We can confirm this by comparing the results of two queries with different result limits. The query below retrieves the top 4 results. The last two results from that query are the same as the result in the query that uses limit with pagination. - -```graphql -{ - Get { - JeopardyQuestion(limit: 4) { - answer - question - } - } -} -``` - -import FilterPaginationRelated from './_snippets/academy.queries.filter.pagination.related.mdx'; - - - -
- -:::tip -So, the `n` th page would have `offset`: `n*m`, `limit`: `m`, where `m` is the number of results per page. -::: - -The `offset` operator is available with all vector search functions including `Get` and `Aggregate`. - -## Review - - - - - -### Key takeaways - -- Filters are used to apply additional criteria to the results. Some commonly used filters include `where`, `limit` and `offset`. -- The `where` filter allows you to apply a boolean condition to the data being queried. It can be used with various operators like `Like`, `Greater`, `Equal`, etc. -- You can use multiple conditions within a `where` filter to further refine your query results. -- The `offset` operator can be used in conjunction with `limit` to skip results and build pagination. - -import Quiz from '/src/components/Academy/quiz.js' -export const whereUsage = [{ - questionText: 'Which filter is used to apply a boolean condition to the data in Weaviate?', - answerOptions: [ - { - answerText: 'limit', - isCorrect: false, - feedback: 'This is used to set the maximum number of objects to retrieve.', - }, - { - answerText: 'offset', - isCorrect: false, - feedback: 'This is used to skip a number of results.', - }, - { - answerText: 'where', - isCorrect: true, - feedback: 'It is similar to the WHERE clause in SQL.', - }, - ] -}]; -export const offsetExample = [{ - questionText: 'How can you combine the offset and limit operators to display the second page of results with 10 results per page?', - answerOptions: [ - { - answerText: 'Set offset: 10 and limit: 10', - isCorrect: true, - feedback: 'This would get results 11-20.', - }, - { - answerText: 'Set offset: 20 and limit: 10', - isCorrect: false, - feedback: 'This would get results 21-30.', - }, - { - answerText: 'Set offset: 10 and limit: 20', - isCorrect: false, - feedback: 'This would get results 11-30', - }, - ] -}]; - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git 
a/docs/academy/py/zero_to_mvp/102_queries_1/80_inside_queries_1.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/80_inside_queries_1.mdx deleted file mode 100644 index 4a5897021..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/80_inside_queries_1.mdx +++ /dev/null @@ -1,221 +0,0 @@ ---- -title: A look inside queries -description: Understand advanced query techniques in Weaviate for better data retrieval. ---- - -## Overview - -import ReactPlayer from 'react-player/lazy' - - -
- -In this unit, we have looked at various ways to construct a vector search. - -We started by learning how to use `Get` and `Aggregate` vector search functions, adding search operators such as `nearVector`, `nearObject` and `nearText`, before wrapping up with various filters such as `where`, `limit` and `offset`. - -Now let's stop to take a look at how these searches are actually performed, starting with the vector search process. -## The vector search process - -### Vector search, behind the scenes - -As the name suggests, vector search relies on vectors to perform its operations. When you perform a vector search in Weaviate, regardless of whether `nearVector`, `nearObject` or `nearText` is used, the input is converted to a vector. - -The input vector is then compared to the stored vectors in Weaviate to return the most relevant objects. - -For queries using the `nearVector` operator, Weaviate simply takes the provided vector and performs the vector search. - -For queries such as `nearObject` and `nearText` where the vector is not directly provided, Weaviate obtains the vector using a suitable method. - -#### `nearObject` - -If the `nearObject` operator is used, Weaviate retrieves the associated vector for the object, which becomes the input vector. - -Let's confirm this by performing a vector search using the `nearObject` operator, and replicating it with an equivalent `nearVector` operator. - -Here is the `nearObject` query: - -import NearobjectExample from './_snippets/academy.queries.nearobject.example.mdx'; - - - -
- See the JSON response - -```json -{ - "data": { - "Get": { - "JeopardyQuestion": [ - { - "_additional": { - "distance": 3.5762787e-07, - "id": "d53fd7ea-35c1-5f8d-a35a-e53511db1a2a" - }, - "answer": "meerkats", - "question": "Group of mammals seen here: [like Timon in The Lion King]" - }, - { - "_additional": { - "distance": 0.12663543, - "id": "9eaf38fe-e7f0-5da3-b703-6b44c49faf7d" - }, - "answer": "hyena", - "question": "It's the amused African mammal heard here" - } - ] - } - } -} -``` - -
- -Now, try running this `nearVector` query and compare the results: - -import NearobjectEquivalentNearvector from './_snippets/academy.queries.nearobject.equivalent.nearvector.mdx'; - - - -
- See the full `meerkat_vector` - -import MeerkatsVector from './_snippets/meerkats.vector.mdx'; - - - -
- -
- Explain this query - -If you run the second (`nearVector`) query, you will see that it returns the same results as the `nearObject` query. The distances are also identical. This is because the vector of object in the `nearObject` query is identical to the vector you specify in the `nearVector` query. - -
- -#### `nearText` - -If the `nearText` operator is used, Weaviate converts the input text to a vector to use as the input. The specific method would depend on the `vectorizer` applicable to the relevant class. Depending on the setting, Weaviate may use the `text2vec-openai` module or the `text2vec-transformers` module to do this. - -:::info Vectorizer setting -We will cover how to set the vectorizer in the next section. -::: - -import NeartextExample from './_snippets/academy.queries.neartext.example.mdx'; - - - 
- See the JSON response - -```json -{ - "data": { - "Get": { - "JeopardyQuestion": [ - { - "_additional": { - "distance": 0.1800943, - "id": "92710bd6-de3c-5220-a60a-d386b2748e28" - }, - "answer": "Two Guys and a Girl", - "question": "In 1999 an ABC sitcom dropped \"a Pizza Place\" from its name, which changed to this" - }, - { - "_additional": { - "distance": 0.18657643, - "id": "7e7a6572-02bd-529f-8943-38ccd4a2a90b" - }, - "answer": "Beavis & Butthead of the Class", - "question": "2 MTV cartoon teens join Howard Heh-Heh-Hesseman's honors program on '80's TV" - } - ] - } - } -} -``` - -
- -Now, try running this `nearVector` query and compare the results: - -import NeartextEquivalentNearvector from './_snippets/academy.queries.neartext.equivalent.nearvector.mdx'; - - - -
- See the full `popular_90s_comedy_vector` - -import Popular90sComedyVector from './_snippets/popular.90s.comedy.vector.mdx'; - - - -
- -
- Explain this query - -When you run the second (`nearVector`) query, once again the query returns the same objects as the `nearText` query as well as distances. - -In this case, Weaviate vectorizes the input text using the `text2vec-openai` module, which relies on the OpenAI inference API to convert the input to a vector. The vectorization process is deterministic, meaning that the same input text will always result in the same vector. For the `nearVector` query, we generated the vector directly using the OpenAI API and used it as the input vector, leading to the same results. - -
- -### Filtering - -Filters can be applied during the vector search process to further refine the results. - -Under the hood, Weaviate implements "pre-filtering". This means that Weaviate applies the filter on the entire database to end up with an "allow list", from which vector search results are returned. - -This, combined with efficient filtering methods, allows Weaviate to easily return the right number of results even under challenging conditions, such as when restrictive filters are applied to a large database. - -### Conceptual diagram (filtering + vector search) - -The conceptual diagram below shows how the vector search process works. When an input is provided containing filters and search operators, Weaviate first performs a pre-filtering step to obtain an "allow list" of objects. - -The input is used to determine an input vector, which may involve an additional step, such as retrieving the object's vector with `nearObject` or vectorizing the input text with `nearText`. The input vector is then compared to perform a vector search, returning the most relevant objects from the allow list. - -import SearchConceptualImg from './images/search-conceptual-dark.png'; - -Conceptual diagram of how search works - -## Review - -### Review exercise - -- Replicate a `nearObject` query with a `nearVector` query. -- Replicate a `nearText` query with a `nearVector` query. - -### Key takeaways - -- nearObject operator retrieves the associated vector for an object, while nearText converts input text to a vector based on a specified vectorizer. -- You have seen how to create equivalent `nearVector` queries to `nearObject` or `nearText` queries. -- Weaviate uses "pre-filtering", meaning that filters are applied on the entire database prior to performing vector search. 
- -import Quiz from '/src/components/Academy/quiz.js' -export const varName = [{ - questionText: 'questionText', - answerOptions: [ - { - answerText: 'answerOne', - isCorrect: false, - feedback: 'feedbackOne', - }, - { - answerText: 'answerTwo', - isCorrect: false, - feedback: 'feedbackTwo', - }, - { - answerText: 'answerThree', - isCorrect: false, - feedback: 'feedbackThree', - }, - ] -}]; - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/90_wrap_up.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/90_wrap_up.mdx deleted file mode 100644 index 1666acd36..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/90_wrap_up.mdx +++ /dev/null @@ -1,27 +0,0 @@ ---- -title: Wrap-up -description: Queries Wrap-up - Part 1 ---- - -## Unit review - -In this unit, you have acquired valuable knowledge on querying Weaviate to retrieve the right objects or aggregate information effectively. We dived deep into various search operators available in Weaviate, such as `nearVector`, `nearObject`, and `nearText`, and filters that can be applied to refine your search results by focusing on specific criteria, enabling you to extract more accurate and relevant information. - -You have also learned some of the key principles around how Weaviate applies these operators to perform searches. You have learned how certain search operators are used, and how filtering works. - -Now that you are armed with knowledge about how to query Weaviate, in the next module we will learn how to build a Weaviate instance, from schema creation to data import. - -### Learning outcomes - -Now, you should be able to: -- Construct 'Get' queries to retrieve relevant objects and desired properties. -- Construct 'Aggregate' queries to retrieve aggregated properties about relevant objects. 
-- Differentiate and apply appropriate search operators with filters such as `nearVector`, `nearObject` and `nearText` with distance and limit thresholds. -- Add filters to queries. -- Describe how Weaviate applies search operators and filters to perform searches. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/10_get.py b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/10_get.py deleted file mode 100644 index 5e68f0000..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/10_get.py +++ /dev/null @@ -1,40 +0,0 @@ -# ===== Instantiate to edu-demo with OpenAI ===== -import weaviate - -client = weaviate.Client( - url="https://edu-demo.weaviate.network", - auth_client_secret=weaviate.auth.AuthApiKey(api_key="learn-weaviate"), # A read-only API Key for the Weaviate instance - additional_headers={ - "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY", # Replace this with YOUR OpenAI API key - } -) -# ===== END Instantiate to edu-demo with OpenAI ===== - -# ===== GET STANDALONE PYTHON ===== -import json -response = client.query.get( - "JeopardyQuestion", - ["question", "answer"] -).with_limit(2).do() - -print(json.dumps(response, indent=2)) -# ===== END GET STANDALONE PYTHON ===== - -graphql_query = """ -# ===== GET STANDALONE GRAPHQL ===== -{ - Get { - JeopardyQuestion ( - limit: 2 - ) { - question - answer - } - } -} -# ===== END GET STANDALONE GRAPHQL ===== -""" - -response = client.query.raw(graphql_query) - -print(json.dumps(response, indent=2)) diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.aggregate.groupby.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.aggregate.groupby.mdx deleted file mode 100644 index 2ba6a110b..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.aggregate.groupby.mdx +++ /dev/null @@ -1,47 +0,0 @@ -import Tabs from '@theme/Tabs'; 
-import TabItem from '@theme/TabItem'; - - - - -```python -response = client.query.aggregate( - "JeopardyQuestion", -).with_group_by_filter( - "round" -).with_fields( - "groupedBy {path value}" -).with_near_text( - {"concepts": ["Intergalactic travel"], "distance": 0.2} -).with_meta_count().do() - -print(json.dumps(response, indent=2)) -``` - - - - -```graphql -{ - Aggregate { - JeopardyQuestion ( - nearText: { - concepts: ["Intergalactic travel"], - distance: 0.2 - } - groupBy: ["round"] - ) { - groupedBy { - path - value - } - meta { - count - } - } - } -} -``` - - - \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.aggregate.standalone.2.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.aggregate.standalone.2.mdx deleted file mode 100644 index be11ff0e9..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.aggregate.standalone.2.mdx +++ /dev/null @@ -1,36 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -response = client.query.aggregate( - "JeopardyQuestion" -).with_fields("answer {count topOccurrences {value occurs}}").do() - -print(json.dumps(response, indent=2)) -``` - - - - -```graphql -{ - Aggregate { - JeopardyQuestion { - answer { - count - topOccurrences - { - value - occurs - } - } - } - } -} -``` - - - diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.aggregate.standalone.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.aggregate.standalone.mdx deleted file mode 100644 index adf933585..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.aggregate.standalone.mdx +++ /dev/null @@ -1,31 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -response = client.query.aggregate( - "JeopardyQuestion", -).with_meta_count().do() - -print(json.dumps(response, indent=2)) 
-``` - - - - -```graphql -{ - Aggregate { - JeopardyQuestion { - meta { - count - } - } - } -} -``` - - - diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.aggregate.syntax.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.aggregate.syntax.mdx deleted file mode 100644 index c686ce821..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.aggregate.syntax.mdx +++ /dev/null @@ -1,31 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -response = client.query.aggregate( - , -).with_fields( - -)..do() -``` - - - - -```graphql -{ - Aggregate { - ( - - ) { - - } - } -} -``` - - - \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.aggregate.with.neartext.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.aggregate.with.neartext.mdx deleted file mode 100644 index 850a2e0b6..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.aggregate.with.neartext.mdx +++ /dev/null @@ -1,38 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -response = client.query.aggregate( - "JeopardyQuestion", -).with_near_text( - {"concepts": ["Intergalactic travel"], "distance": 0.2} -).with_meta_count().do() - -print(json.dumps(response, indent=2)) -``` - - - - -```graphql -{ - Aggregate { - JeopardyQuestion ( - nearText: { - concepts: ["Intergalactic travel"], - distance: 0.2 - } - ) { - meta { - count - } - } - } -} -``` - - - \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.filter.multiple.operands.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.filter.multiple.operands.mdx deleted file mode 100644 index 44ceba284..000000000 --- 
a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.filter.multiple.operands.mdx +++ /dev/null @@ -1,75 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -response = client.query.get( - "JeopardyQuestion", - ["question", "answer", "points"] -).with_limit(2).with_near_text( - {"concepts": "Intergalactic travel"} -).with_additional( - ["distance", "id"] -).with_where({ - "operator": "And", - "operands": [ - { - "path": ["question"], - "operator": "Like", - "valueText": "*rocket*" - }, - { - "path": ["points"], - "operator": "GreaterThan", - "valueInt": 200 - } - ] -}).do() - -print(json.dumps(response, indent=2)) -``` - - - - -```graphql -{ - Get { - JeopardyQuestion ( - limit: 2 - nearText: { - concepts: ["Intergalactic travel"], - } - where: { - operator: And, - operands: [ - { - path: ["question"], - operator: Like, - valueText: "*rocket*" - } - { - path: ["points"], - operator: GreaterThan, - valueInt: 400 - }, - ] - - } - ) { - question - answer - points - _additional { - distance - id - } - } - } -} -``` - - - \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.filter.pagination.nearText.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.filter.pagination.nearText.mdx deleted file mode 100644 index ae519939e..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.filter.pagination.nearText.mdx +++ /dev/null @@ -1,39 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -response = client.query.get( - "JeopardyQuestion", - ["question", "answer"] -).with_limit(2).with_offset(2).with_near_text( - {"concepts": "Intergalactic travel"} -).do() - -print(json.dumps(response, indent=2)) -``` - - - - -```graphql -{ - Get { - JeopardyQuestion ( - limit: 2, - offset: 2, - nearText: { - concepts: ["Intergalactic travel"], - } - ) { - question - answer - } - 
} -} -``` - - - \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.filter.pagination.related.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.filter.pagination.related.mdx deleted file mode 100644 index f01fc4dfb..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.filter.pagination.related.mdx +++ /dev/null @@ -1,38 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -response = client.query.get( - "JeopardyQuestion", - ["question", "answer"] -).with_limit(4).with_near_text( - {"concepts": "Intergalactic travel"} -).do() - -print(json.dumps(response, indent=2)) -``` - - - - -```graphql -{ - Get { - JeopardyQuestion ( - limit: 4 - nearText: { - concepts: ["Intergalactic travel"], - } - ) { - question - answer - } - } -} -``` - - - \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.filter.where.greater.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.filter.where.greater.mdx deleted file mode 100644 index 26e5c3bbc..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.filter.where.greater.mdx +++ /dev/null @@ -1,54 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -response = client.query.get( - "JeopardyQuestion", - ["question", "answer", "points"] -).with_limit(2).with_near_text( - {"concepts": "Intergalactic travel"} -).with_additional( - ["distance", "id"] -).with_where({ - "path": ["points"], - "operator": "GreaterThan", - "valueInt": 200 -}).do() - -print(json.dumps(response, indent=2)) -``` - - - - -```graphql -{ - Get { - JeopardyQuestion ( - limit: 2 - nearText: { - concepts: ["Intergalactic travel"], - } - where: { - path: ["points"], - operator: GreaterThan, - valueInt: 200 - } - ) { - question - answer - points - _additional { - 
distance - id - } - } - } -} -``` - - - \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.filter.where.like.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.filter.where.like.mdx deleted file mode 100644 index 86f1a562a..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.filter.where.like.mdx +++ /dev/null @@ -1,53 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -response = client.query.get( - "JeopardyQuestion", - ["question", "answer"] -).with_limit(2).with_near_text( - {"concepts": "Intergalactic travel"} -).with_additional( - ["distance", "id"] -).with_where({ - "path": ["question"], - "operator": "Like", - "valueText": "*rocket*" -}).do() - -print(json.dumps(response, indent=2)) -``` - - - - -```graphql -{ - Get { - JeopardyQuestion ( - limit: 2 - nearText: { - concepts: ["Intergalactic travel"], - } - where: { - path: ["question"], - operator: Like, - valueText: "*rocket*" - } - ) { - question - answer - _additional { - distance - id - } - } - } -} -``` - - - \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.get.standalone.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.get.standalone.mdx deleted file mode 100644 index e69de29bb..000000000 diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.get.syntax.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.get.syntax.mdx deleted file mode 100644 index 8b87ae81f..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.get.syntax.mdx +++ /dev/null @@ -1,30 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -response = client.query.get( - , - [] -)..do() -``` - - - - -```graphql -{ - Get { - ( - - ) { - - } - } -} -``` - - - \ No newline at 
end of file diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.get.with.additional.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.get.with.additional.mdx deleted file mode 100644 index 8e72efbe6..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.get.with.additional.mdx +++ /dev/null @@ -1,44 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -response = client.query.get( - "JeopardyQuestion", - ["question", "answer"] -).with_limit(2).with_near_text( - {"concepts": "Intergalactic travel"} -).with_additional( - ["distance", "id"] -).do() - -print(json.dumps(response, indent=2)) -``` - - - - -```graphql -{ - Get { - JeopardyQuestion ( - limit: 2 - nearText: { - concepts: ["Intergalactic travel"], - } - ) { - question - answer - _additional { - distance - id - } - } - } -} -``` - - - \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.nearObject.simple.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.nearObject.simple.mdx deleted file mode 100644 index 57baecea7..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.nearObject.simple.mdx +++ /dev/null @@ -1,38 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -response = client.query.get( - "JeopardyQuestion", - ["question", "answer"] -).with_limit(2).with_near_object( - {"id": "c8f8176c-6f9b-5461-8ab3-f3c7ce8c2f5c"} -).do() - -print(json.dumps(response, indent=2)) -``` - - - - -```graphql -{ - Get { - JeopardyQuestion ( - limit: 2 - nearObject: { - id: "c8f8176c-6f9b-5461-8ab3-f3c7ce8c2f5c" - } - ) { - question - answer - } - } -} -``` - - - \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.nearText.simple.mdx 
b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.nearText.simple.mdx deleted file mode 100644 index 56b16cc08..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.nearText.simple.mdx +++ /dev/null @@ -1,38 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -response = client.query.get( - "JeopardyQuestion", - ["question", "answer"] -).with_limit(2).with_near_text( - {"concepts": "Intergalactic travel"} -).do() - -print(json.dumps(response, indent=2)) -``` - - - - -```graphql -{ - Get { - JeopardyQuestion ( - limit: 2 - nearText: { - concepts: ["Intergalactic travel"], - } - ) { - question - answer - } - } -} -``` - - - \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.nearVector.simple.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.nearVector.simple.mdx deleted file mode 100644 index bea5b33ab..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.nearVector.simple.mdx +++ /dev/null @@ -1,40 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -vector_input = [0.023932384327054024, -0.014095712453126907, ..., -0.0357337519526481] - -response = client.query.get( - "JeopardyQuestion", - ["question", "answer"] -).with_limit(2).with_near_vector( - {"vector": vector_input} -).do() - -print(json.dumps(response, indent=2)) -``` - - - - -```graphql -{ - Get { - JeopardyQuestion ( - limit: 2 - nearVector: { - vector: [0.023932384327054024, -0.014095712453126907, ..., -0.0357337519526481] - } - ) { - question - answer - } - } -} -``` - - - \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.nearVector.simple.vector.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.nearVector.simple.vector.mdx deleted file mode 100644 index 
0d536cc8b..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.nearVector.simple.vector.mdx +++ /dev/null @@ -1 +0,0 @@ -[0.023932384327054024, -0.014095712453126907, 0.013304559513926506, -0.01155742909759283, -0.01147831417620182, 0.015321999788284302, -0.025013625621795654, -0.04198386147618294, 0.0006061387248337269, -0.008940030820667744, 0.013475975953042507, 0.0021558923181146383, 0.008148877881467342, -0.0022696207743138075, 0.014623147435486317, 0.0010969009017571807, 0.032199934124946594, -0.016746075823903084, 0.007700557820498943, 0.014293501153588295, -0.004793069791048765, 0.009223527275025845, 0.01840749755501747, -0.007628035265952349, -0.0017108687898144126, -0.001233704504556954, 0.01867121458053589, -0.030696744099259377, 0.012150794267654419, -0.003728309413418174, 0.030512141063809395, -0.004667803645133972, -0.005402917042374611, -0.013779250904917717, -0.013344116508960724, -0.026727791875600815, 0.020596355199813843, -0.008043390698730946, -0.0035865609534084797, -0.007120378781110048, 0.0073379455134272575, 0.031145064160227776, -0.009137819521129131, -0.00028988346457481384, -0.009823485277593136, 0.007482990622520447, 0.0011644785990938544, -0.012856239452958107, 0.001305402722209692, 0.007911532185971737, 0.011432163417339325, 0.033914100378751755, -0.0353645458817482, -0.013845180161297321, -0.009896007366478443, -0.009717998094856739, -0.022824769839644432, 0.008280736394226551, 0.029299039393663406, -0.02130839228630066, 0.008485117927193642, 0.004756808280944824, -0.021071046590805054, 0.03462613746523857, -0.021756712347269058, 0.0076082563027739525, 0.0020471089519560337, -0.013205665163695812, 0.00041803380008786917, 0.0013144679833203554, 0.019475553184747696, 0.0022762136068195105, -0.001709220465272665, -0.0005315560265444219, 0.0039063189178705215, -0.027110181748867035, 0.008676312863826752, -0.009843263775110245, -0.02209954522550106, 0.012394732795655727, 0.016772447153925896, 
-0.016179082915186882, -0.012078272178769112, 0.018684400245547295, 0.02439389005303383, 0.021321577951312065, -0.012282652780413628, 0.013726507313549519, -0.030169308185577393, -0.007898345589637756, 0.0166669599711895, 0.0005167219205759466, -0.0011076144874095917, 0.028085937723517418, -0.004760105162858963, 0.0016292810905724764, -0.0074698044918477535, 0.02565973438322544, 0.0024525749031454325, -0.036393046379089355, -0.01155742909759283, 0.013937481679022312, -0.025527875870466232, -0.023549992591142654, -0.014201199635863304, -0.005650152452290058, 0.011267339810729027, 0.022152289748191833, 0.025686105713248253, -0.01213101577013731, 0.011418977752327919, 0.021743526682257652, -6.031512748450041e-05, -0.010172910988330841, -0.010120167396962643, -0.026450887322425842, 0.020042547956109047, 0.006981926970183849, -0.012295839376747608, -0.04338156431913376, 0.023958755657076836, 0.014675891026854515, 0.022442378103733063, -0.00022127565171103925, 0.012427697889506817, 0.03164612874388695, -0.020464494824409485, -0.01867121458053589, 0.0167856328189373, -0.02187538519501686, 0.009632290340960026, 0.022297333925962448, 0.007318167015910149, -0.0076148491352796555, -0.01660103164613247, 0.018499799072742462, -0.004489794373512268, 0.007779672741889954, -0.0037349022459238768, -0.022112730890512466, 0.0036524904426187277, 0.012691415846347809, -0.015638461336493492, 0.0141352703794837, -0.017220767214894295, 0.035944726318120956, 0.01574394851922989, 0.005607298109680414, 0.009625696577131748, -0.021611668169498444, 0.013581463135778904, -0.004196408204734325, 0.013541905209422112, -0.007113785482943058, 0.0005550433997996151, 0.012889203615486622, -0.010060830973088741, 0.02295662835240364, -0.0015551104443147779, -0.007786266040056944, -0.003810721216723323, 0.003098683198913932, -0.0011685991194099188, -0.01275075227022171, 0.02200724370777607, 0.03061762824654579, 0.0014512715861201286, -0.004245855379849672, -0.01964697055518627, -0.01104977261275053, 
-0.018104221671819687, 0.006741284392774105, -0.038871992379426956, 0.011122294701635838, -0.017181210219860077, 0.0328855998814106, 7.489171548513696e-05, 0.004971079062670469, -0.031303294003009796, -0.023906011134386063, -0.005834754556417465, 0.003945876378566027, 0.0012196945026516914, 0.009137819521129131, -0.01106955111026764, -0.028666116297245026, 0.008768614381551743, -0.0029849549755454063, 0.010153132490813732, -0.015308814123272896, -0.0037085304502397776, -0.002592674922198057, -0.010252026841044426, 0.0013853422133252025, -0.6903074979782104, -0.03264825418591499, 0.00757529167458415, -0.021769898012280464, 0.005010636989027262, 0.013528719544410706, 0.006810510065406561, 0.008940030820667744, -0.009856450371444225, -0.00023652183881495148, -0.02009529061615467, 0.023906011134386063, -0.023958755657076836, -0.005762232467532158, -0.011689288541674614, -0.014504474587738514, 0.007080820854753256, -0.018051479011774063, 0.014385801739990711, 0.012856239452958107, -0.02067546918988228, 0.02763761766254902, 0.005946834571659565, 0.020345821976661682, 0.015335185453295708, 0.00432497076690197, 0.03797535225749016, -0.013284780085086823, -0.012803495861589909, -0.00042730511631816626, -0.007094006985425949, 0.015651647001504898, -0.012645265087485313, -0.001946566510014236, 0.05511700361967087, -0.005521589890122414, -0.003125054994598031, 0.019225021824240685, 0.009071889333426952, 0.023009371012449265, -0.008300515823066235, -0.0010746497428044677, 0.021954501047730446, 0.010100388899445534, -0.016970235854387283, 0.008485117927193642, 0.04230032488703728, -0.019541483372449875, -0.009038925170898438, -0.030643999576568604, 0.0005055963410995901, 0.03618207201361656, -0.004559020046144724, 0.014807750470936298, 0.009329014457762241, 0.019000861793756485, 0.020279893651604652, 0.019501926377415657, 0.0006320984102785587, 0.0028465031646192074, -0.000699264055583626, 0.028797974810004234, -0.02056998200714588, 0.001127393334172666, -0.014794564805924892, 
0.020464494824409485, -0.0181305930018425, 0.020016174763441086, -0.016653774306178093, -0.01939643919467926, 0.029773730784654617, 0.021071046590805054, -0.013845180161297321, 0.030643999576568604, 0.014952794648706913, 0.026543188840150833, 0.025198228657245636, -0.007535734213888645, -0.017827318981289864, 0.022086359560489655, 0.0005212545511312783, -0.0073313526809215546, -0.04918335750699043, -0.024090614169836044, 0.03494259715080261, -0.004084328189492226, -0.028507886454463005, 0.009137819521129131, 0.022152289748191833, 0.009203748777508736, 0.0019432699773460627, 0.027558501809835434, -0.023760966956615448, -0.02416973002254963, 0.00514249550178647, -0.03014293685555458, -0.018486613407731056, -0.007304980885237455, 0.005656745284795761, -0.008827950805425644, 0.008623569272458553, -0.016587844118475914, 0.028929835185408592, 0.01545385830104351, -0.004028288181871176, -0.012849646620452404, 0.004885370843112469, 0.017576785758137703, 0.012988097965717316, -0.021638039499521255, -0.009553174488246441, -0.008227992802858353, -0.00912463292479515, 0.020583167672157288, -0.0167856328189373, -0.02674097754061222, 0.007858788594603539, -0.014174827374517918, 0.017352625727653503, -0.017154838889837265, 0.04003234952688217, -0.007852194830775261, 0.01170247420668602, 0.004951300099492073, -0.005396323744207621, -0.005828161723911762, 0.007970868609845638, -0.03336029127240181, -0.016930678859353065, -0.0029025431722402573, -0.0031563714146614075, 0.019739272072911263, 0.009797113947570324, -0.0014908292796462774, 0.004908446222543716, 0.008274143561720848, 0.014504474587738514, -0.019409624859690666, -0.022982999682426453, -0.021980872377753258, -0.03172524273395538, -0.0033986622001975775, -0.0009329014574177563, -0.0034052550327032804, -0.008544454351067543, -0.006929183378815651, 0.0042656343430280685, 0.0009007608750835061, -0.006569867953658104, -0.029562756419181824, -0.0016894417349249125, 0.006151216104626656, -0.0074698044918477535, 
0.013528719544410706, 0.011966192163527012, -0.018776701763272285, -0.003073959844186902, -0.027743104845285416, -0.02500043995678425, -0.017339440062642097, 0.023892825469374657, 0.0060753971338272095, -0.008775207214057446, -0.0004223604337312281, -0.007219272665679455, -0.010252026841044426, -0.016640588641166687, 0.015124212019145489, -0.007713743485510349, -0.010014680214226246, 0.0026569559704512358, -0.007041263394057751, -0.014913237653672695, 0.018565727397799492, -0.017405370250344276, -0.0010021273046731949, 0.0036030435003340244, 0.00573915708810091, -0.0007532437448389828, -3.229512003599666e-05, -0.010898134671151638, -0.007839009165763855, 0.001443030429072678, 0.004476608242839575, 0.015137397684156895, 0.01714165136218071, 0.03380861133337021, 0.016113152727484703, -0.009447687305510044, 0.022020429372787476, -0.005152385216206312, 0.03586561232805252, 0.006688540801405907, -0.002876171376556158, -0.012816681526601315, -0.0023124748840928078, -0.001210629241541028, -0.020978745073080063, 0.0031217585783451796, 0.018750330433249474, 0.009487245231866837, -0.022152289748191833, 0.007806044537574053, -0.02025352045893669, 0.004793069791048765, -0.016139524057507515, -0.019950246438384056, -0.029773730784654617, 0.01933050900697708, -0.012955132871866226, 0.008115912787616253, 0.0029651762451976538, 0.002904191380366683, 0.012928761541843414, 0.034837111830711365, 0.019198650494217873, -0.017919618636369705, 0.010977250523865223, -0.02038538083434105, -0.005564444232732058, -0.023892825469374657, 0.01181455422192812, 0.01020587608218193, -0.007812637835741043, -0.008181842043995857, 0.02079414203763008, -0.01039707101881504, -0.011564021930098534, 0.002449278486892581, -0.038212697952985764, 0.0007363493205048144, 0.003395365783944726, -0.00129963387735188, 0.027083810418844223, 0.029615500941872597, 0.024631235748529434, -0.008794985711574554, -0.006915997248142958, 0.029167180880904198, 0.007529140915721655, -0.0042854128405451775, 
0.005640262737870216, 0.029905589297413826, -0.006101768929511309, -0.004753511864691973, 0.011267339810729027, 0.01545385830104351, -0.004868888296186924, -0.012388139963150024, 0.0011809609131887555, -0.04034881293773651, -0.010627823881804943, -0.026503631845116615, 0.00977733451873064, 0.012394732795655727, -0.0054820324294269085, -0.01822289451956749, 0.006319336127489805, 0.02391919679939747, 0.024723537266254425, 0.005158978048712015, -0.0021575407590717077, 0.007456618826836348, 0.0139111103489995, 0.007502769120037556, -0.008834543637931347, -0.0024707054253667593, 0.01596810854971409, 0.002129520522430539, 0.015823064371943474, 0.00874224305152893, -0.032991088926792145, 0.011155259795486927, -0.012183759361505508, 0.020279893651604652, 0.004845813382416964, -0.0030442913994193077, -0.0077335224486887455, 0.01327818725258112, 0.006016060709953308, 0.0021047971677035093, -0.03892473503947258, 0.007212679833173752, 0.014517661184072495, -0.014082526788115501, -0.027690360322594643, 0.0006259175133891404, 0.02604212611913681, -0.0019498629262670875, 0.0015748892910778522, -0.02111060358583927, 0.001626808661967516, 0.0029388044495135546, -0.020055733621120453, -0.008794985711574554, -0.017102094367146492, 0.008485117927193642, -0.012295839376747608, 0.017985548824071884, 0.011735438369214535, -0.0017636122647672892, 0.020055733621120453, -0.0020108476746827364, -0.0040579563938081264, 0.03449427708983421, 0.020055733621120453, -0.013416639529168606, -0.016930678859353065, 0.006180884316563606, -0.009427908807992935, 0.0046381354331970215, 0.0022531382273882627, -0.013977039605379105, -0.013113363645970821, 0.0014776433818042278, -0.004338156431913376, -0.02455211989581585, -0.02500043995678425, 0.021928129717707634, -0.005890794564038515, -0.00726542342454195, -0.014174827374517918, -0.02203361690044403, -0.018842631950974464, 0.06730076670646667, -0.010416850447654724, -0.008636755868792534, 0.021044675260782242, 0.008320294320583344, -0.004657914396375418, 
-0.012869425117969513, -0.013871552422642708, -0.008590605109930038, -0.004275523591786623, 0.006902811583131552, 0.003566782223060727, -0.0010655844816938043, 0.005353469867259264, 0.0153747433796525, 0.019185464829206467, 0.010172910988330841, -0.011155259795486927, 0.018869003280997276, -0.014069341123104095, 0.005894090980291367, -0.027584875002503395, 0.01314632873982191, 0.018644843250513077, 0.016047224402427673, -0.029272668063640594, 0.01675926148891449, -0.000627565779723227, -0.005386434495449066, -0.03278011456131935, -0.0004099986399523914, 0.008715870790183544, -0.008214807137846947, -0.007759894244372845, 0.002576192608103156, -0.007581884507089853, 0.001857561757788062, 0.008300515823066235, 0.005468846298754215, -0.027004694566130638, 0.014412174001336098, 0.014623147435486317, -0.0011933227069675922, -0.02038538083434105, 0.0038634645752608776, -0.006042432505637407, -0.019792014732956886, 0.021980872377753258, -0.010489372536540031, -0.0018839335534721613, 0.026991508901119232, 0.009783927351236343, 0.001620215829461813, -0.03183072805404663, 0.006981926970183849, 0.0184734258800745, 0.016099967062473297, -0.0034085516817867756, -0.01962059922516346, -0.03262188285589218, -0.01043003611266613, -0.024130171164870262, 0.02410379983484745, 0.016179082915186882, -0.02067546918988228, -0.03557552024722099, -0.0009081779280677438, -0.008373037911951542, -0.0137396939098835, 0.008986181579530239, -0.008669720031321049, -0.030327538028359413, -0.004924928303807974, -0.019449181854724884, 0.007792858872562647, 0.005449067335575819, 0.02101830206811428, -0.02079414203763008, 0.019805200397968292, 0.0044172718189656734, -0.0031992257572710514, -0.0012485386105254292, 0.004555723629891872, -0.009164190851151943, -0.0008335952297784388, 0.021638039499521255, 0.0077401152811944485, 0.0009032331872731447, -0.0077401152811944485, 0.0019877722952514887, 0.014280314557254314, -0.005017229821532965, 0.007515955250710249, -0.0070148915983736515, 
0.015361557714641094, 0.02041175216436386, 0.01213101577013731, -0.0013721563154831529, 0.0015946681378409266, 0.02387963980436325, 0.0044172718189656734, -0.015664832666516304, -0.002815186744555831, -0.0028184831608086824, 0.003744791727513075, 0.01179477572441101, -0.01685156300663948, 0.008946623653173447, -0.010799241252243519, -0.017497671768069267, 0.009216934442520142, -0.02919355221092701, 0.003922800999134779, -0.01774820312857628, 0.0009922379394993186, 0.008234585635364056, 0.0011100867995992303, 0.01084539107978344, -0.010528930462896824, -0.03185710310935974, -0.007311573717743158, -0.02969461679458618, 0.0046315426006913185, 0.03750066086649895, 0.006006170995533466, -0.008267550729215145, -0.009058703668415546, -0.0047699944116175175, 0.0002550644858274609, 0.0018460240680724382, -0.002813538536429405, 0.012335396371781826, -0.009520210325717926, 0.0054523637518286705, -0.03122417815029621, -0.007107192650437355, -0.007080820854753256, -0.0042788200080394745, -0.0025069667026400566, -0.007186308037489653, -0.02092600241303444, 0.0075950706377625465, 0.01189366914331913, -0.015585717745125294, -0.006797324400395155, -0.019515112042427063, -0.0021839123219251633, 0.0021377617958933115, -0.025580618530511856, 0.03014293685555458, 0.00684347515925765, 0.0008265902288258076, -0.019515112042427063, -0.028481515124440193, -0.03541729226708412, -0.0385291613638401, -0.027400271967053413, -0.026622304692864418, 0.033281177282333374, 0.02610805444419384, 0.022152289748191833, -0.017959177494049072, 0.03349215164780617, -0.0019729381892830133, 0.005725970957428217, 0.003622822230681777, 0.022666538134217262, -0.008794985711574554, -0.024288402870297432, 0.010819019749760628, 0.005656745284795761, 0.001946566510014236, 0.010759683325886726, 0.003340973984450102, 0.0007503593224100769, 0.029905589297413826, -0.016482356935739517, 0.002592674922198057, 0.0033063609153032303, -0.027769476175308228, -0.011517872102558613, 0.0021328171715140343, 
-0.007542327046394348, -0.0033162503968924284, -0.012249688617885113, -0.01758997142314911, 0.03354489430785179, 0.025738850235939026, -0.005623780656605959, -0.0001497834309702739, 0.0182492658495903, -0.011735438369214535, 0.041772887110710144, -0.012506812810897827, 0.02153255231678486, -0.008834543637931347, -0.0038239071145653725, -0.002765739569440484, 0.011616765521466732, -0.0022383041214197874, 0.015823064371943474, 0.002080073580145836, 0.0016836728900671005, 0.02219184674322605, 0.0013350709341466427, 0.0074698044918477535, 0.0013152922037988901, 0.011854112148284912, 0.004334860015660524, -0.022125916555523872, -0.006457787938416004, -0.021888570860028267, -0.00907848309725523, -0.03889836370944977, -0.01592855155467987, -0.011715659871697426, 0.009731183759868145, 0.038397300988435745, -0.03934668377041817, -0.011339861899614334, 0.011821147054433823, -0.011102516204118729, 0.021123789250850677, 0.0019317322876304388, 0.02385326847434044, 0.032068073749542236, -0.011801368556916714, -0.014267128892242908, -0.02254786528646946, -0.017022978514432907, -0.005297429859638214, 0.014346243813633919, 0.013581463135778904, -0.0009889415232464671, -0.024209287017583847, -0.0007462387438863516, 0.0018921747105196118, -0.02705743908882141, -0.022244589403271675, 0.0153747433796525, 0.004483201541006565, 0.025910265743732452, -0.0155989034101367, -0.008373037911951542, -0.008168656378984451, 0.021123789250850677, 0.016231825575232506, -0.00213446537964046, -0.01660103164613247, -0.017128465697169304, -0.01694386452436447, 0.014952794648706913, 0.0010252026841044426, 0.010997029021382332, 0.008623569272458553, -0.006438008975237608, 0.0176822729408741, 0.004687582608312368, 0.009487245231866837, 0.0012749104062095284, -0.012572742998600006, 0.03132966533303261, 0.006352300755679607, 0.03148789703845978, 0.007957682013511658, 0.0002802000963129103, 0.01022565457969904, -0.00923671294003725, 0.0007136861095204949, 0.02903532236814499, -0.015387929044663906, 
-0.010680567473173141, 0.010515743866562843, 0.0028943021316081285, 0.0184734258800745, -0.005538072437047958, -0.0017174617387354374, 0.015229698270559311, -0.00958613958209753, 0.0035634858068078756, 0.01669333130121231, 0.0015963163459673524, -0.0008438967051915824, -0.019225021824240685, -0.004601874388754368, 0.009467466734349728, -0.025105927139520645, -0.010278398171067238, 0.01583625003695488, -0.026701420545578003, -0.020490868017077446, -0.010074017569422722, 0.0018114111153408885, 0.008584012277424335, -0.02572566457092762, 0.00958613958209753, -0.0032272457610815763, 0.025606991723179817, -0.03539091721177101, 0.016271384432911873, 0.018077850341796875, 0.022297333925962448, -0.020016174763441086, 0.004430457949638367, 0.0136869503185153, -0.011768403463065624, 0.015783505514264107, -0.011867297813296318, -0.0135155338793993, -0.0057028960436582565, -0.012566149234771729, 0.010884949006140232, -0.008748835884034634, -0.005126013420522213, -0.02086007222533226, 0.029404526576399803, -0.0021641335915774107, -0.028217796236276627, -0.012170572765171528, 0.0013952315784990788, 0.007667592726647854, 0.014860494062304497, 0.02108423225581646, -0.039610400795936584, 0.016337312757968903, -0.022442378103733063, 0.011959598399698734, 0.003134944476187229, -0.022112730890512466, 0.01984475925564766, -0.014623147435486317, 0.0172339528799057, 0.0005542192957364023, -0.02216547541320324, -0.0029503421392291784, -0.003777756355702877, -0.014161641709506512, 0.006381968967616558, -0.016904305666685104, 0.006606128998100758, 0.0135155338793993, 0.02887709066271782, 0.025606991723179817, 0.009401536546647549, 0.005946834571659565, 0.0014356133760884404, -9.935771231539547e-05, 0.0010936044855043292, -0.012684823013842106, -0.0027574985288083553, 0.020332636311650276, 0.021967686712741852, 0.014398987405002117, -0.020965559408068657, -0.010469594039022923, 0.001205684500746429, -0.018658028915524483, -0.02998470515012741, 0.0014298445312306285, -0.00483262725174427, 
0.010245434008538723, 0.009197155945003033, -0.005307319108396769, 0.02536964602768421, -0.0022926959209144115, 0.01035092119127512, 0.0026800313498824835, -0.009012552909553051, 0.04053341597318649, -0.0031547232065349817, -0.006902811583131552, -0.003113517304882407, -0.031145064160227776, -0.012473848648369312, 0.010522337630391121, 0.002930563176050782, 0.011623358353972435, -0.0023306054063141346, 0.013126550242304802, 0.02718929760158062, -6.428119377233088e-05, -0.015190141275525093, -0.006711616180837154, 0.01685156300663948, 0.008438967168331146, -0.030670370906591415, -0.0014825881225988269, 0.002033923054113984, -0.007278609089553356, 0.0038964294362813234, -0.005923759192228317, 0.019805200397968292, 0.0051227170042693615, 0.004282116424292326, 0.015427486971020699, -0.034230560064315796, 0.017471298575401306, -0.040849875658750534, 0.0021542441099882126, 0.002892653690651059, 0.015321999788284302, -0.0010029515251517296, 0.010080610401928425, -0.00695555517449975, 0.01894811913371086, 0.0034645916894078255, 0.011412384919822216, -0.005719378124922514, -0.004397492855787277, 0.002235007705166936, -0.0046249497681856155, -0.011412384919822216, -0.008656534366309643, -0.00885432306677103, 0.008715870790183544, -0.014029783196747303, 0.012632079422473907, -0.007707150653004646, 0.01650873012840748, 0.023365391418337822, -0.006589646451175213, -0.022982999682426453, -0.0005290837143547833, -0.008709277957677841, -0.015704389661550522, -0.0062962607480585575, 0.0015468692872673273, -0.005379841662943363, 0.00585123710334301, -0.001642466988414526, -0.0037744599394500256, -0.010548708960413933, -0.00768737168982625, 0.008808172307908535, 0.016970235854387283, 0.023826897144317627, 0.22004607319831848, -0.025303715839982033, -0.015018724836409092, 0.03336029127240181, -0.014517661184072495, 0.018895374611020088, 0.02387963980436325, -0.004390900023281574, 0.006002874579280615, 0.014794564805924892, -0.017695458605885506, 0.027110181748867035, 
0.004262337926775217, -0.00915759801864624, -0.005735860671848059, 0.0004409030661918223, -0.01917227916419506, -0.013950667344033718, 0.0026338808238506317, -0.01136623416095972, 0.007792858872562647, -0.0023685148917138577, -0.011972784996032715, -0.013192479498684406, 0.009711405262351036, -0.012730972841382027, -0.0030822008848190308, 0.0018278934294357896, 0.019027233123779297, 0.01020587608218193, 0.011155259795486927, -0.014201199635863304, -0.009902600198984146, -0.000843484653159976, 0.017563600093126297, 0.011623358353972435, 0.021387508139014244, -0.012440883554518223, 0.0011603579623624682, 0.009197155945003033, -0.022745653986930847, 0.006935776211321354, -0.009137819521129131, -0.0063094464130699635, -0.004460126161575317, -0.0011661268072202802, -0.01901404745876789, 0.010667381808161736, 0.011939819902181625, 0.009737776592373848, -0.034520652145147324, -0.014504474587738514, 0.010515743866562843, 0.04034881293773651, -0.029483642429113388, -0.0035371140111237764, 0.018750330433249474, 0.008234585635364056, -0.013752879574894905, 0.007370910607278347, 0.0017322958447039127, 0.016640588641166687, -0.016205454245209694, -0.0019531594589352608, 0.002096555894240737, 0.020266707986593246, -0.018842631950974464, -0.017669087275862694, 0.028613373637199402, 0.01806466467678547, 0.01203871425241232, 0.0010144890984520316, -0.03011656366288662, -0.0017850393196567893, -0.024538934230804443, -0.0027921113651245832, 0.011603579856455326, 0.0057984935119748116, 0.008096134290099144, 0.018262453377246857, -0.005514997057616711, -0.004799662623554468, -0.002014144090935588, -0.023760966956615448, -0.026886021718382835, -0.031303294003009796, 0.011577208526432514, -0.00917737651616335, -0.015664832666516304, 0.007819230668246746, 0.0028349654749035835, 0.006714912597090006, 0.015440672636032104, -0.004921631887555122, 0.008412595838308334, -0.0027228854596614838, 0.00585123710334301, 0.01127393264323473, -0.006378672551363707, -0.015440672636032104, 
-0.027611246332526207, 0.0023882936220616102, 0.0029948444571346045, -0.0015147286467254162, -0.014240757562220097, 0.0010804185876622796, -0.006546792574226856, -0.01507146842777729, 0.0015773616032674909, 0.00542269553989172, -0.005983096081763506, -0.007806044537574053, 0.008972995914518833, 0.00010152102186111733, -0.01977882906794548, 0.02140069380402565, 0.018341567367315292, -0.0009535044082440436, 0.025290530174970627, -0.0007726105395704508, -0.00203557126224041, -0.006622611545026302, 0.006935776211321354, 0.006556681822985411, -0.006658872589468956, -0.026859650388360023, -0.008096134290099144, 0.017273511737585068, -0.004041474312543869, -0.029325410723686218, -0.0063918582163751125, -0.0029025431722402573, 0.019159093499183655, -0.02378733828663826, 0.006316039711236954, 0.01930413767695427, 0.004011806100606918, -0.009355386719107628, -0.010166318155825138, -0.013416639529168606, -0.007753300946205854, 0.018170151859521866, 0.002277861814945936, 0.004948003683239222, 0.0019218429224565625, -0.01971290074288845, 0.017893247306346893, -0.002117983065545559, 0.0033508634660393, -0.02022714912891388, 0.0006386913591995835, -0.022692909464240074, -0.012302432209253311, 0.0026816795580089092, 0.03362400829792023, -0.00992237962782383, -0.01948874071240425, -0.028507886454463005, -0.0013861663173884153, 0.00684347515925765, -0.01653510145843029, 0.007542327046394348, 0.016455985605716705, -0.014372616074979305, -0.010370699688792229, 0.0004413151182234287, -0.17067810893058777, 0.013779250904917717, -0.0009724590927362442, -0.011168445460498333, 0.015308814123272896, -0.012632079422473907, -0.009408130310475826, -0.0020273299887776375, 0.006988519802689552, 0.002660252619534731, 0.006431416142731905, 0.0009312531910836697, -0.05643559247255325, -0.008748835884034634, 0.013581463135778904, 0.013344116508960724, -0.011517872102558613, 0.011768403463065624, 0.01758997142314911, -0.009282863698899746, 0.03307020291686058, -0.029615500941872597, 
-0.006721505429595709, -0.02079414203763008, 0.007700557820498943, -0.009526803158223629, -0.0151505833491683, 0.0028679303359240294, -0.013297966681420803, 0.0027294785249978304, -0.018552541732788086, -0.0032585621811449528, 0.04222120717167854, 0.004740326199680567, 0.03230542317032814, 0.009025739505887032, -0.008293922059237957, 0.0026849762070924044, -0.017273511737585068, 0.016034036874771118, 0.0011323379585519433, 0.020490868017077446, 0.014583590440452099, -0.0017075722571462393, -0.019963432103395462, -0.003761274041607976, -0.004628246184438467, -0.011682695709168911, 0.004687582608312368, 0.003373938612639904, 0.006368783302605152, -0.03275374323129654, -0.017220767214894295, -0.0034547022078186274, 0.037922609597444534, 0.014227570965886116, 0.024763094261288643, 0.006058914586901665, 0.007766487076878548, -0.02238963544368744, -0.028059566393494606, -0.018011920154094696, 0.001116679748520255, -0.03926756978034973, -0.0018295417539775372, -0.02423565834760666, -0.01583625003695488, 0.005432585254311562, -0.012599114328622818, 0.024116985499858856, -0.0019993099849671125, 0.0008875749772414565, 0.0007441784837283194, -0.0030162713956087828, 0.011867297813296318, 0.025422388687729836, -0.012374954298138618, 0.020504053682088852, -0.00885432306677103, -0.02500043995678425, -0.03937305510044098, 0.038871992379426956, 0.01020587608218193, -0.0008513137581758201, -0.002080073580145836, 0.0025910267140716314, -0.01583625003695488, 0.007654407061636448, -0.02194131538271904, -0.004206297919154167, 0.017181210219860077, -0.026846464723348618, 0.004921631887555122, -0.013713321648538113, 0.025870708748698235, 0.024763094261288643, 0.012236502021551132, 0.010179503820836544, 0.023576363921165466, -0.007674186024814844, -0.004203001037240028, 0.0139111103489995, 0.006550088990479708, 0.0015740651870146394, 0.031171435490250587, 0.016587844118475914, 0.014702263288199902, 3.2604162697680295e-05, 0.017985548824071884, -0.019923873245716095, 0.008293922059237957, 
-0.02209954522550106, -0.00284155854023993, 0.024182915687561035, -0.011148666962981224, 0.02397194132208824, -0.014095712453126907, -0.01443854533135891, 0.017537228763103485, 0.009678440168499947, 0.04754830524325371, -0.019027233123779297, -0.03043302521109581, -0.0134034538641572, -0.017260326072573662, -0.0012691415613517165, -0.09361979365348816, -0.02658274583518505, -0.006484159734100103, 0.0014405581168830395, 0.0032832857687026262, 0.005874312482774258, -0.005188646260648966, -0.00885432306677103, -0.016495544463396072, 0.014319872483611107, 0.006022653542459011, -0.02358955144882202, -0.012783716432750225, -0.020372195169329643, 0.039926864206790924, -0.0139111103489995, 0.004470015410333872, -0.013884738087654114, -0.007779672741889954, 0.018196523189544678, -0.00850489642471075, 0.002144354861229658, 0.011372826993465424, 0.008003832772374153, -0.019027233123779297, -0.006408340763300657, -0.030960461124777794, 0.028217796236276627, 0.020820515230298042, -0.007845601998269558, -0.027294784784317017, -0.002777277259156108, 0.0045425379648804665, -0.015124212019145489, -0.023985126987099648, -0.004918335471302271, 0.0015443969750776887, -0.021347949281334877, 0.028534257784485817, -0.04522759094834328, -0.012249688617885113, 0.01586262136697769, -0.0170097928494215, -0.02047768048942089, -0.005610594525933266, -0.013001283630728722, -0.0027789254672825336, 0.03014293685555458, 0.005267761647701263, -0.02734752744436264, -0.03275374323129654, -0.020082104951143265, -0.037474289536476135, 0.014148456044495106, 0.008748835884034634, 0.010443221777677536, 0.005818272475153208, 0.00779945170506835, 0.0001228965847985819, 1.5928653738228604e-05, -0.006002874579280615, 0.01020587608218193, -0.029773730784654617, 0.027822220697999, -0.007080820854753256, -0.00799723993986845, -0.03539091721177101, -0.0040579563938081264, 0.015163769014179707, -0.003744791727513075, -0.009678440168499947, 0.02302255667746067, -0.012427697889506817, 0.001398528111167252, 
-0.05298089236021042, -0.007001705467700958, -0.0019564558751881123, -0.006263296119868755, 0.04557042196393013, -0.008577419444918633, -0.02419610135257244, -0.00706763518974185, 0.030195679515600204, 0.00038033039891161025, 0.024314774200320244, 0.0385291613638401, 0.0186975859105587, -0.008564232848584652, 0.013647392392158508, 0.012684823013842106, -0.0033146021887660027, -0.0013803974725306034, 0.013647392392158508, -0.04330245032906532, -0.009335607290267944, 0.007839009165763855, -0.020662283524870872, -0.0015452210791409016, 0.026253100484609604, 0.020820515230298042, -0.02635858580470085, -0.0058116791769862175, -0.06608766317367554, 0.02813868224620819, 0.006896218750625849, 0.015229698270559311, 0.010258619673550129, -0.010990436188876629, 0.019673341885209084, -0.0059303524903953075, -0.0018839335534721613, 0.007944496348500252, -0.0061709946021437645, 0.01243429072201252, -0.0002400243392912671, -0.004941410850733519, 0.0034975563175976276, -0.0031481303740292788, 0.015796691179275513, 0.008458745665848255, 0.005561147350817919, 0.0002383761020610109, -0.010627823881804943, 0.021374322474002838, 0.01955466903746128, 0.002106445375829935, -0.0025646549183875322, 0.0060819899663329124, -0.01062123104929924, -0.001493301591835916, -0.01608678139746189, 0.002991548040881753, 0.014728634618222713, -0.021796269342303276, -0.024657607078552246, 0.028481515124440193, -0.028534257784485817, -0.018750330433249474, -0.014623147435486317, 0.009322421625256538, 0.005152385216206312, 0.03275374323129654, -0.007509362418204546, 0.0011076144874095917, 0.02980010211467743, 0.0032157080713659525, -0.015335185453295708, 0.01662740297615528, 0.00958613958209753, 0.022363262251019478, 0.002279510023072362, 0.011234374716877937, -0.010383885353803635, 0.026068497449159622, -0.02327308990061283, -0.004743622615933418, 0.003174502169713378, -0.006975333672016859, 0.018038293346762657, -0.0016432910924777389, -0.0019201947143301368, -0.006556681822985411, 0.02840239927172661, 
0.0139111103489995, 0.017036164179444313, 0.00675776693969965, -0.0052117216400802135, -0.01860528625547886, -0.023259904235601425, -0.001598788658156991, 0.012091457843780518, -0.04541219398379326, 0.0060325427912175655, -0.014583590440452099, 0.009038925170898438, 0.0018427276518195868, -0.017603158950805664, -0.00652701361104846, 0.006922590080648661, 0.0038964294362813234, -0.029114436358213425, 0.01930413767695427, 0.03523268923163414, -0.01031795609742403, -0.036366675049066544, 0.0017339440528303385, 0.022059988230466843, 0.006006170995533466, 0.006197366397827864, 0.028771603479981422, -0.0073313526809215546, 0.010074017569422722, -0.00547873554751277, 0.003995323553681374, -0.0017388887936249375, -0.00240147951990366, -0.008880694396793842, 0.04406723380088806, -0.012790309265255928, 0.002683327766135335, 0.020833700895309448, 0.03188347443938255, -0.004288709722459316, -0.006672058254480362, 0.013304559513926506, -0.003932690713554621, -0.012315617874264717, 0.03494259715080261, -0.03125055134296417, -0.036234814673662186, -0.007239051628857851, 0.0047205472365021706, -2.2856394934933633e-05, 0.014280314557254314, 0.00726542342454195, 0.006513827946037054, -0.014280314557254314, -0.00041597351082600653, 0.005867719184607267, -0.03974226117134094, -0.02165122516453266, 0.02232370525598526, 0.0074632116593420506, 0.04622971639037132, -0.012348582036793232, -0.026028938591480255, 0.03272736817598343, 0.0184734258800745, 0.04172014445066452, 0.0034052550327032804, 0.00010394187120255083, 0.008972995914518833, 0.0014356133760884404, -0.015229698270559311, 0.002439389005303383, 0.0035041493829339743, 0.002167430007830262, 0.012684823013842106, -0.003751384560018778, 0.011755217798054218, -0.031145064160227776, 0.05263805761933327, 0.007628035265952349, -0.011175038293004036, 0.025765221565961838, -0.016165897250175476, -0.00207842537201941, 0.023233531042933464, 0.01222990918904543, -0.02795407921075821, -0.03425693139433861, 0.003929394297301769, 
0.006675355136394501, 0.0353645458817482, -0.008788392879068851, -0.006975333672016859, 0.002627287758514285, -0.00023528565361630172, -0.009223527275025845, -0.013344116508960724, -0.015031910501420498, 0.027136553078889847, -0.0034876668360084295, 0.016904305666685104, 0.01662740297615528, -0.027558501809835434, -0.006975333672016859, 0.01412208378314972, 0.020359007641673088, 0.00684347515925765, -0.02347087673842907, -0.0034876668360084295, 0.00757529167458415, -0.021796269342303276, -0.026991508901119232, 0.002261379500851035, -0.0137396939098835, 0.0013045786181464791, -0.030775858089327812, 0.008452152833342552, 0.01209805067628622, 0.006411637179553509, 0.0358128659427166, -0.015242884866893291, -0.003301416290923953, -0.004516166169196367, 0.012559556402266026, -0.028323283419013023, -0.0023882936220616102, -0.0357337519526481] \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.nearobject.equivalent.nearvector.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.nearobject.equivalent.nearvector.mdx deleted file mode 100644 index c22898f46..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.nearobject.equivalent.nearvector.mdx +++ /dev/null @@ -1,58 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -To run the `nearVector` query, you need to provide the full vector. You can view and copy the full vector below. - -```python - -# Shortened for clarity - -meerkat_vector = [-0.024790961, -0.01860295, ..., -0.008665809,-0.02698336] - -``` - -This is the query. - -```python -response = client.query.get( - "JeopardyQuestion", - ["question", "answer"] -).with_limit(2).with_near_vector( - {"vector": meerkat_vector} -).with_additional( - ["distance", "id"] -).do() - -print(json.dumps(response, indent=2)) -``` - - - - -To run the `nearVector` query, first replace `` with the full vector. 
You can view and copy the full vector below. - -```graphql -{ - Get { - JeopardyQuestion ( - limit: 2 - nearVector: { - vector: - } - ) { - question - answer - _additional { - distance - id - } - } - } -} -``` - - - \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.nearobject.example.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.nearobject.example.mdx deleted file mode 100644 index 28dc435a2..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.nearobject.example.mdx +++ /dev/null @@ -1,44 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -response = client.query.get( - "JeopardyQuestion", - ["question", "answer"] -).with_limit(2).with_near_object( - {"id": "d53fd7ea-35c1-5f8d-a35a-e53511db1a2a"} -).with_additional( - ["distance", "id"] -).do() - -print(json.dumps(response, indent=2)) -``` - - - - -```graphql -{ - Get { - JeopardyQuestion ( - limit: 2 - nearObject: { - id: "d53fd7ea-35c1-5f8d-a35a-e53511db1a2a" - } - ) { - question - answer - _additional { - distance - id - } - } - } -} -``` - - - \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.neartext.equivalent.nearvector.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.neartext.equivalent.nearvector.mdx deleted file mode 100644 index 33bd4968c..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.neartext.equivalent.nearvector.mdx +++ /dev/null @@ -1,44 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -response = client.query.get( - "JeopardyQuestion", - ["question", "answer"] -).with_limit(2).with_near_vector( - {"vector": popular_90s_comedy_vector} -).with_additional( - ["distance", "id"] -).do() - -print(json.dumps(response, indent=2)) -``` - - - - -```graphql -{ - Get { - 
JeopardyQuestion ( - limit: 2 - nearVector: { - vector: - } - ) { - question - answer - _additional { - distance - id - } - } - } -} -``` - - - \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.neartext.example.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.neartext.example.mdx deleted file mode 100644 index 3794bb274..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.neartext.example.mdx +++ /dev/null @@ -1,44 +0,0 @@ -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -response = client.query.get( - "JeopardyQuestion", - ["question", "answer"] -).with_limit(2).with_near_text( - {"concepts": ["popular 90s comedy"]} -).with_additional( - ["distance", "id"] -).do() - -print(json.dumps(response, indent=2)) -``` - - - - -```graphql -{ - Get { - JeopardyQuestion ( - limit: 2 - nearText: { - concepts: ["popular 90s comedy"] - } - ) { - question - answer - _additional { - distance - id - } - } - } -} -``` - - - \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.schema.short.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.schema.short.mdx deleted file mode 100644 index 51b61cb3e..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/academy.queries.schema.short.mdx +++ /dev/null @@ -1,28 +0,0 @@ -```json -{ - "classes": [ - { - "class": "JeopardyQuestion", - "properties": [ - { - "dataType": ["text"], - "name": "question", - ... // Truncated - }, - { - "dataType": ["text"], - "name": "answer", - ... // Truncated - }, - { - "dataType": ["int"], - "name": "points" - ... // Truncated - }, - ... // Truncated - ], - ... 
// Truncated - } - ] -} -``` diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/meerkats.vector.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/meerkats.vector.mdx deleted file mode 100644 index 2425e69e6..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/meerkats.vector.mdx +++ /dev/null @@ -1 +0,0 @@ -[-0.024790961,-0.01860295,-0.006920972,-0.042420954,-0.002649688,0.037932377,0.011636574,-0.018356469,-0.014788956,-0.017591074,0.017629992,-0.0016345689,0.015424622,-0.017435402,-0.0010686318,0.0056236954,0.019536989,0.013880863,0.030953025,-0.006188011,-0.0024226645,0.003998856,-0.008944724,-0.02750227,0.021820197,-0.011383604,0.014347882,-0.023805032,-0.015165167,0.015619214,-0.0012648449,-0.02000401,0.006333954,-0.020925077,-0.015735969,-0.009204179,0.011150095,-0.03832156,0.024959607,-0.007861498,-0.017201891,0.007783661,0.0070312405,-0.015281921,-0.009327421,0.0050723525,0.018992133,-0.01574894,-0.02089913,0.026438503,0.018577006,0.0120517025,-0.02482988,-0.000100589634,0.0076733925,0.0010516051,-0.003969667,-0.006074499,-0.006901513,-0.025737973,0.009736063,0.0026091482,-0.025919592,-0.0035869705,-0.013828971,-0.010443079,-0.019381316,0.007790148,0.0070247543,-0.009405257,0.03442973,0.006175038,0.006525303,0.00085620274,0.020276438,-0.01447761,-0.009710117,0.013478707,0.005724234,-0.007135023,0.025050417,-0.012434399,-0.014088427,0.027139032,0.013647352,-0.0011489008,0.012447371,0.027294705,0.011273336,0.017357565,-0.0002450637,0.027190924,0.004777222,0.011026854,0.0056690997,0.018615924,0.0022880721,0.01377708,-0.018071067,-0.01869376,-0.0029334673,0.002320504,0.011448468,-0.0044820914,-0.005030191,0.015204085,-0.019355372,-0.0061912537,0.026620122,-0.024142323,-0.02169047,0.043303102,0.030096823,-0.049348414,0.0056139654,-0.02750227,0.033340015,-0.008575,0.020938048,-0.016125152,0.03069357,0.023415847,0.018551059,-0.012395481,0.016708925,0.031549774,0.0062334156,-0.008918779,0.0009559309,-0.016890544,0.035208095,0.01
1253877,0.020938048,0.015359758,-0.007679879,0.029240621,-0.013141415,0.023454767,0.015268949,-0.030953025,0.007323128,0.012946824,0.009995518,0.009152289,-0.012148998,0.0377767,0.021443987,0.0073036687,-0.00943769,-0.0075177196,0.018447278,0.008464731,0.0034572429,-0.013128442,0.028021181,-0.002339963,-0.012693854,-0.001610245,-0.030200606,-0.020561839,0.018045122,0.028617928,0.021651551,0.0064831413,0.024596369,0.03092708,0.024713125,-0.0052572144,0.009781468,-0.002750227,-0.0019815904,0.00036506183,-0.00055904523,0.03655726,-0.013764108,0.03134221,-0.01555435,0.004705872,-0.0048680315,-0.036064297,-0.0319649,0.019018078,-0.0016815951,0.03058979,-0.020756429,-0.020016981,0.029655749,-0.0069923224,0.024661234,0.0015267327,0.02262451,0.015852723,-0.0046474943,-0.010300378,-0.661715,-0.00803663,-0.003030763,0.008626891,0.009359852,0.020886159,0.0054064016,0.0024032055,-0.013212765,0.02323423,-0.0142570725,-0.0065544914,0.01447761,-0.001958888,-0.031238427,-0.0124603445,0.0054031583,-0.0024502317,-0.016670007,-0.005789098,-0.0403972,0.005380456,0.008056089,0.0050139753,-0.010546861,0.010773884,0.011182527,-0.029551968,0.0024518534,0.05925961,-0.035727005,0.022832073,0.011558737,0.00023715843,0.043640394,-0.0015997045,-0.015593268,0.039151818,0.0052896463,0.025686081,-0.013686271,-0.007926362,0.016501362,-0.0064701685,5.5438315e-05,-0.00974255,0.020821294,0.015761914,-0.008497164,-0.007725284,-0.038555067,5.787071e-05,-0.012752231,0.0041934475,0.00742691,-0.0003295894,0.012090621,-0.02351963,0.00742691,0.036323752,-0.009314448,0.010280919,-0.009969573,0.014399773,-0.034455676,0.011759815,0.0018534843,0.005633425,0.022040734,-0.026464447,0.024142323,0.018615924,-0.023869894,0.0038107508,0.0135824885,0.010371729,0.0042875,-0.027606051,-0.013764108,-0.011883056,0.011831165,-0.0056269383,-0.0075501516,-0.013893835,0.026127156,-0.010293892,-0.01663109,0.004015072,0.019277534,0.017318645,0.0023659086,0.029162785,-0.0016556496,-0.010605238,-0.009431203,-0.010670102,-0.0084906
77,-0.0030015744,-0.0010978206,-0.028825492,-0.02445367,-0.021612633,0.01897916,-0.020159682,0.0044626324,-0.006599896,-0.017111082,0.014866793,0.03186112,-0.0117663015,-0.013880863,-0.012992227,0.0074852877,-0.016514335,0.009392285,-0.024661234,0.022481808,0.016319742,0.0085814865,-0.012752231,0.0036323753,-0.0033048128,0.023117473,-0.005620452,0.022416944,0.019316453,-0.012804123,-0.022715319,-0.03040817,0.009891736,0.0105209155,0.015567322,0.008412841,-0.0077706883,0.012265753,0.013147901,0.0092560705,-0.0096647125,0.0311087,-0.009502553,-0.0073620463,-0.0074852877,-0.0027923884,-0.021560742,-0.0005683694,-0.026697958,-0.0026399584,-0.0016751088,-0.020639675,-0.008133926,-0.023999622,-0.00628855,-0.0064961137,0.012382507,0.025621219,-0.004404255,-0.025128253,-0.0170073,0.010151192,0.004381553,0.014283018,-0.00048647882,-0.043510668,-0.008750132,-0.0050853253,-0.023441793,0.013984645,0.004381553,-0.014075454,-0.045560364,-0.007439883,-0.0070052953,-0.00092998537,-0.0016013262,-0.009937141,0.007375019,-0.003163734,-0.017642966,-0.03564917,-0.016838653,0.021586686,0.010391188,-0.0015940289,-0.013530598,0.035026476,-0.008918779,0.04685764,0.024518533,0.00077309593,0.03912587,-0.013245197,0.018356469,-0.014321936,0.0055750473,-0.0053934287,-0.013083037,0.023363957,0.0066744895,0.01363438,0.0421615,-0.0033275152,-0.025712028,0.015294895,-0.048933282,0.019718608,-0.028799547,0.00028073884,-0.026529312,-0.003950208,0.014179236,0.01897916,0.0069793495,-0.004388039,-0.027580107,0.0037588598,0.015139221,-0.0112149585,-0.0046183057,-0.017733775,-0.01377708,0.010572807,0.00035654844,0.009813899,0.01096199,-0.016384607,0.014192209,0.008017171,0.048466265,-0.020354275,-0.025621219,-0.01262899,-0.0072323186,-0.002091859,0.010183623,0.012966283,0.0001900308,0.012583585,-0.014218154,0.024907716,-0.008996615,-0.007829066,0.01790242,0.021236422,-0.026697958,0.006311252,-0.010812803,0.004692899,0.006901513,0.005698289,0.008166358,0.005448563,0.001866457,-0.013854917,0.0044820914,-0.0
020983454,-0.024375832,0.016734872,0.001694568,0.025932565,0.017124055,0.0130311465,-0.010209569,-0.0008780943,-0.015891641,0.01117604,0.01616407,0.010936044,0.0023432064,0.017733775,-0.012706827,-0.005824773,-0.009152289,-0.0070442134,-0.015684078,0.034144327,0.0012121431,-0.01860295,0.0089706695,0.017772693,0.013854917,-0.01616407,-0.046546295,0.002521582,-0.006823676,-0.018161876,-0.02866982,-0.017227838,-0.0035675114,-0.023675302,0.034896746,0.012564126,0.037854537,-0.0073490734,0.00012050486,-1.2972769e-05,-0.017513238,0.03144599,-0.009729576,0.03500053,-0.0031248156,-0.017072164,0.0028880627,0.0024859067,0.00061134173,0.029240621,-0.0029334673,-0.0032091388,-0.033755146,-0.011707923,0.0030502223,-0.02455745,-0.01302466,-0.012635477,-0.0036161593,-0.04091611,0.0040864223,-0.005455049,-0.012583585,0.020587783,0.01340087,-0.014399773,-0.024518533,-0.011630087,0.015294895,0.06782164,-0.0027356327,0.007167455,0.007913389,-0.011111177,0.011467927,0.01799323,-0.041902043,0.012687367,-0.010916585,-0.015294895,-0.003428054,0.035363767,-0.0061847675,0.022611536,-0.002244289,-0.033132453,0.02244289,0.01785053,0.00051728915,0.018823488,-0.003183193,0.0170073,0.022403972,-0.012376022,-0.022559645,0.014308964,0.035052422,0.004436687,-0.044289034,-0.0069988086,-0.02140507,-0.014698147,0.011513332,-0.001324844,-0.025569327,-0.016138125,0.020354275,0.024726097,-0.012382507,-0.012045216,0.03645348,-0.0012632234,-0.013958699,0.0017918637,-0.004488578,-0.0057080183,0.025789864,-0.003312921,0.0010864694,0.036583208,-0.006531789,0.0066939485,0.0021129397,0.004235609,0.0038366963,-0.024077458,-0.01140955,-0.009917682,-0.012142511,-0.008613919,0.00042120958,0.005665857,-0.005231269,-0.009171748,-0.024401778,-0.012661423,-0.02778767,-0.04366634,-0.0053577535,-0.0012883581,-0.03551944,0.00735556,-0.012583585,0.011059285,0.010540375,0.017876476,-0.016060287,-8.467772e-05,0.018680787,0.012654936,0.007491774,-0.0121944025,-0.040604766,0.0016264609,0.006155579,0.012168457,0.010903612,-0.01
3193306,-0.0052864035,0.008821483,-0.0006980971,0.018356469,-0.002862117,0.013083037,-0.0116236005,-0.010845235,0.006327468,-0.0032010307,-0.01508733,0.027372543,-0.024207186,-0.015152194,-0.0013629515,0.008866888,-0.014568419,0.017266756,0.012025757,-0.03603835,-0.01040416,0.0028750899,-0.01817485,0.004018315,0.0037556165,0.011273336,-0.011163067,0.0040345313,0.022520727,0.0032188683,-0.026204992,0.012161971,-0.009729576,0.0037750758,0.02389584,-0.02089913,0.041824207,-0.003087519,-0.025517436,-0.010663616,-0.022339107,0.008043117,0.018447278,-0.00060769316,-0.035960514,-0.021015886,0.0069988086,-0.002111318,0.02455745,-0.023856921,0.0025475274,-0.013841944,-0.0030842759,0.011026854,0.0049523544,-0.0072517777,-0.023727193,-0.01255764,0.015632186,-0.013336007,0.0014456529,-0.021197503,0.008756619,0.004018315,-0.014192209,-0.002083751,-0.036998335,-0.023999622,0.015567322,0.027294705,0.038918305,0.00628855,-0.0065771937,0.023312066,-0.010462538,0.009755522,0.011111177,-0.005328565,-0.0030858973,-0.015359758,0.0141014,-0.0032496785,-0.014127345,0.017111082,0.04127935,0.023701249,0.018317549,0.013530598,-0.0058799074,-0.012868986,-0.01593056,0.0061912537,0.0064182775,-0.005536129,-0.022040734,-0.014231128,-0.032821104,0.040111803,0.0051923506,-0.0099501135,-0.010254974,0.0251542,-0.011221445,0.03180923,-0.025465544,0.011493873,-0.03250976,0.00398264,-0.021431014,0.0059285555,0.0007698527,0.002746984,0.0047674924,-0.00060850393,0.001469166,0.013841944,0.011513332,-0.010793343,-0.00773177,0.017175946,-0.0047285743,-0.028929275,-0.03725779,-0.021768305,-0.025413655,-0.0013799783,0.026308775,-0.022806127,0.0054615354,-0.010014977,-0.013906808,0.008724187,-0.011902516,0.03967073,0.0022394243,0.024661234,0.013141415,0.010767398,-0.013374925,0.02497258,0.007900416,0.010131733,0.030719517,0.031835176,-0.009969573,-0.02052292,0.015126248,0.0064766547,-0.032405976,-0.04021558,0.02866982,0.0008780943,0.003106978,-0.017422428,0.0033242719,-0.0021891547,0.0036842662,-0.0014245722,-
0.02361044,-0.0007864741,-0.01625488,-0.0287736,0.025335817,-0.011111177,0.0013556543,-0.008866888,-0.006625842,-0.0007179617,-0.007945821,-0.021988843,0.012168457,0.005176135,-0.0025605003,-0.016384607,0.0057631526,0.011474414,-0.01756513,0.001830782,-0.011331714,-0.014075454,0.03302867,-0.01785053,0.004164259,-0.011934947,0.011675492,-0.018940242,-0.0031264373,-0.0046377648,-0.014283018,-0.0120711615,0.00017452428,0.041383132,0.016916491,-0.000810798,-0.008140412,-0.015061384,0.017720802,-0.00628855,-0.015139221,-3.8639595e-05,-0.029733585,-0.008685268,0.0064507094,-0.003376163,0.008237708,0.0046961424,-0.027476324,0.019329425,0.026075264,0.0041610156,0.019446181,-0.036894556,0.007465828,-0.03134221,-0.010540375,0.018421331,-0.023805032,0.017435402,-0.014763011,-0.045223072,-0.0014553824,0.0047869515,0.023999622,-0.027476324,0.0026594177,0.030849244,0.0066744895,0.021625606,-0.0071804277,0.003541566,0.018148903,-0.008023658,-0.007407451,0.0064571956,-0.025530409,0.007809607,-0.034170274,-0.0058669345,0.010300378,-0.032924887,-0.028099017,-0.008192304,0.030849244,-0.015100303,-0.012525208,-0.0340146,-0.020717511,-0.018239712,0.029785477,0.0015672726,-0.0050204615,0.0021210476,0.03476702,-0.004300473,0.012233321,-0.015735969,-0.023363957,0.016786763,-0.01087118,-0.038581014,-0.007173941,-0.019524017,0.034222163,-0.01004741,0.006544762,-0.030018987,0.02009482,-0.04156475,-0.018473223,0.021988843,0.019277534,0.026438503,0.0012899797,0.009191207,0.02323423,0.013556544,0.0026610391,-0.028228745,-0.02581581,-0.0023869895,0.006823676,0.021249395,0.0039437218,-0.023636384,0.0061717946,0.030252496,-0.012940337,0.0046150624,0.012914391,-0.03157572,-0.025050417,-0.029500077,0.02497258,0.0072906963,0.0075501516,0.0027032008,0.0043296614,-0.024868798,0.015165167,0.0022653698,-0.0113576595,0.0073490734,-0.00089349947,-0.013699244,0.043873902,-0.015735969,0.008652837,0.0043588504,-0.012233321,0.016293798,0.0015494351,-0.011390091,0.026775794,-0.006314495,-0.030356279,-0.014153291
,0.01836944,-0.0060680127,0.009119856,0.012551154,0.001730243,0.033755146,-0.008685268,0.015904615,-0.036323752,0.009139315,0.001830782,0.005720991,-0.021638578,0.020730484,-0.0069988086,0.010060382,0.016825682,-0.005169648,0.019939145,0.003424811,-0.012388994,-0.0020415895,0.0031621123,0.013037632,0.016786763,-0.016215961,-0.0049199224,-0.016008396,-0.008120953,-0.032354087,0.008977156,0.18224145,-0.00071431306,0.026905522,0.018953215,-0.008133926,-0.005922069,0.039151818,-0.0013799783,-0.007835552,0.02351963,-0.010430106,0.018628895,-0.004190204,0.013225738,0.03227625,-0.020250492,0.0053253216,0.0058669345,-0.009294989,-0.040163692,-0.009035533,0.0026513096,-0.0034021086,-0.0013572759,0.0311087,0.00528316,-0.010339296,-0.009450662,0.0103976745,0.018551059,0.0069404314,-0.018213768,-0.022494782,0.011487387,-0.010696048,0.004268041,-0.0282028,0.02333801,0.025076361,0.01705919,0.0014870036,0.014788956,0.0012616018,0.009282016,-0.004057233,0.040163692,-0.006466925,-0.012635477,-0.020860212,0.011740356,-0.021820197,0.021716414,0.034351893,0.011221445,-0.0030080609,-0.017305674,-0.009171748,-0.007109077,-0.010689561,0.014763011,-0.0052053235,0.025750946,-0.0046442514,0.022092625,-0.026127156,0.031783283,-0.029370349,0.01302466,-0.002674012,-0.006256118,-0.016436499,-0.03948911,0.005247485,-0.006463682,0.011163067,-0.008172845,0.01227224,0.007530692,0.010371729,0.01164306,-0.00047472227,-0.012875473,-0.024323942,0.030460062,-0.012804123,-0.014775984,0.040319365,0.0055101835,0.01564516,-0.012966283,-0.019511044,-0.015541377,-3.729671e-05,-0.02272829,0.01164306,0.010488483,0.0025961753,0.022066679,-0.022313163,-0.031705447,-0.010696048,-0.006625842,0.02872171,0.015450568,-0.012012783,-0.012453858,-0.011111177,0.024726097,-0.004242095,-0.025283927,-0.0066420576,-0.020665621,0.015204085,-0.024090432,0.014218154,0.017642966,-0.0036907527,-0.0131543875,0.018330522,-0.004336148,-0.015684078,-0.007394478,0.017604047,0.004452903,-0.00887986,-0.032250304,-0.011993324,0.008056089,0
.0033340016,-0.029759532,-0.00086187833,-0.02361044,0.012161971,-0.015619214,0.009567417,0.0033340016,0.010631184,-0.031238427,0.0103976745,0.0062009837,0.013880863,0.007446369,0.012116566,0.013374925,0.013141415,-0.027528215,0.015528404,0.011675492,-0.015671104,-0.019316453,-0.027216868,-0.005633425,0.016241906,-0.031316265,0.029526021,-0.00374913,-0.016916491,-0.0343,-0.002584824,-0.0050561368,-0.0064766547,-0.011772787,0.0194851,-0.016877573,-0.009197693,0.020393193,-0.16439092,-0.00895121,0.027891453,0.004501551,0.012129539,0.0046669533,0.02202776,-0.017824585,-0.0006263415,-0.00032614352,0.015230031,-0.002617256,-0.03323623,-0.021898033,0.0049004634,-0.0016994327,0.009431203,-0.0028799546,0.015074357,0.0016337581,0.039437216,-0.018330522,0.016890544,-0.00046053328,0.00644098,0.024803935,-0.002581581,-0.0016670008,0.012596559,-0.031601664,-0.014801929,0.0025005012,-0.0019248346,-0.00040175044,-0.011701438,0.003661564,0.0016734871,-0.010812803,-0.016708925,0.0046377648,0.018265659,0.0076020425,0.027917398,-0.017098108,0.008458246,0.004096152,0.005380456,-0.00077471754,0.0055815335,-0.032431923,0.0069339448,-0.038217776,0.024609342,0.0053123487,0.018654842,0.015917588,0.007945821,-0.009593363,0.0029934663,-0.022131544,-0.015528404,-0.031264372,0.008146899,-0.019627798,0.008322031,-0.005335051,-0.027424432,0.030018987,-0.01939429,-0.007783661,-0.016319742,-0.024297995,0.03411838,-0.027580107,0.0039891265,0.010287406,-0.0068625947,0.0045177666,-0.017487293,-0.007141509,-0.014775984,0.050048944,-0.017785667,0.012920878,0.027424432,0.001790242,0.004540469,-0.016644062,0.023571521,0.0013102497,0.015022466,-0.011227931,-0.022390999,0.0058993665,0.026620122,0.009262557,-0.025543382,-0.00010895099,-0.004705872,-0.01288196,0.012453858,-0.031056808,-0.012453858,0.010371729,0.017591074,0.012395481,0.006411791,0.0058604483,0.02263748,-0.0028815763,0.015048412,-0.014438692,0.017124055,0.0003391163,-0.020665621,0.0040215584,0.018927269,-0.036271863,0.032042738,-0.0008772835,0.0
2394773,-0.007764202,-0.01056632,-0.012304671,-0.0015243003,-0.015074357,-0.10315946,-0.0019248346,0.0076539335,0.0130311465,-0.021962898,0.019575909,0.0073815053,0.013608434,0.010683075,0.015476513,0.012226835,-0.00674584,0.0006271523,-0.010812803,0.008328518,-0.0023610438,-0.009158774,-0.010689561,-0.0073815053,-0.0065707075,0.005604236,-0.016670007,0.0055491016,-0.00848419,0.0020367247,0.034896746,-0.015761914,0.012473317,0.02708714,0.015995424,0.0029172513,-0.018538086,-0.0019248346,-0.02656823,-0.0055977497,-0.015917588,-0.009625794,-0.012350076,0.026620122,-0.0251542,-0.015372731,0.0036810231,-0.005909096,-0.042420954,-0.006732867,-0.009723091,-0.014049509,0.010741453,0.0040604766,-0.02534879,-0.0020578054,-0.010825776,-0.042524736,-0.020328328,0.040085856,-0.0074852877,0.0010394431,0.02361044,-0.008030144,0.004044261,0.0052215396,-0.001156198,0.025621219,0.030096823,0.016371634,0.005941528,-0.005584777,0.008944724,0.03256165,-0.012058188,0.007978253,0.03186112,-0.01569705,-0.008665809,-0.031056808,0.020159682,-0.008283113,-0.016734872,0.019238615,-0.009288503,-0.015761914,-0.023013692,-0.007744743,-0.0028394149,-0.0027356327,0.013997617,0.0145424735,-0.0043296614,0.006058283,-0.003693996,0.009067966,0.012187916,-0.0040345313,-0.0026529313,-0.004469119,0.0061166603,0.009087425,0.0011902516,0.015943533,0.020068873,-0.023869894,-0.021703443,-0.053344026,0.017681884,0.00019134834,-0.015126248,0.028617928,-0.013530598,0.009917682,-0.018823488,0.011720897,0.0228191,-0.041590698,0.013044119,-0.0228191,0.0145813925,-0.02052292,0.011448468,0.008944724,0.00051607296,0.033340015,0.027761726,-0.001100253,0.008451759,0.03795832,0.013128442,-0.006447466,0.02220938,-0.021469932,0.01930348,-0.020419138,0.0061134174,0.021962898,-0.028150909,-0.013284115,0.021119667,-0.008892833,-0.006243145,0.0051210006,0.024168268,-0.023805032,0.01836944,-0.011681979,-0.00013753162,3.896898e-05,-0.02131426,0.006616112,0.0013264656,-0.0050139753,0.018667813,0.012239807,-0.011240904,0.03655726
,0.007530692,-0.026101211,-0.010125246,-0.016721899,-0.017500265,0.03500053,0.008380408,0.013245197,-0.022092625,0.034844857,0.0033242719,0.017500265,-0.019614827,-0.007842039,0.0067393533,-0.022313163,0.016592171,0.017837558,-0.025102308,-0.029214675,0.003914533,0.0057015317,-0.0041804747,0.014983548,-0.019991037,-0.0140105905,-0.00911337,-0.028617928,0.01433491,0.0030129256,-0.0143868,-0.016423525,-0.0007483666,0.04057882,-0.019926174,-0.014179236,0.0152559765,-0.0050918115,0.0025669867,-0.019640772,0.015735969,-0.012213862,0.008568514,-0.008458246,0.021301286,-0.0028718468,0.008030144,-0.00080714945,0.023701249,-0.00035289986,0.007018268,-0.019614827,-0.018551059,-0.023805032,-0.012434399,-0.010592266,-0.041175567,-0.019666718,0.013323033,0.010897126,-0.004057233,-0.0012916013,-0.0014059239,-0.01248629,0.032120574,-0.003411838,-0.0026691472,-0.0225337,0.005633425,0.002497258,0.0033631902,0.03969667,-0.01377708,0.0011886299,0.0037394005,0.03308056,-0.02201479,0.009995518,-0.0010426863,-0.0022572617,0.0030988702,-0.006256118,-0.018304577,-0.019044025,-0.007679879,0.019848336,0.009690658,-0.02010779,0.034689184,0.0029075218,0.0011829543,0.034559455,-0.023779085,0.019173753,0.02412935,0.03250976,-0.020159682,0.012200889,-0.00233672,-0.020691566,0.01949807,-0.014347882,-0.01827863,0.020081846,-0.0015307867,-0.01738351,0.004540469,-0.024713125,0.019199697,-0.0015591646,0.021521823,0.0013353843,-0.017941339,-0.009528499,0.0199132,-0.014062481,-0.009217152,-0.017811611,0.01227224,0.011039826,-0.034741074,-0.013880863,0.005753423,-0.023727193,0.0048485724,0.005030191,0.024207186,-0.008490677,-0.023986649,-0.020406166,-0.018914297,-0.002481042,0.00064580067,0.012466831,-0.026931468,-0.008665809,-0.02698336] \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/popular.90s.comedy.vector.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/popular.90s.comedy.vector.mdx deleted file mode 100644 index 67ad48d4c..000000000 --- 
a/docs/academy/py/zero_to_mvp/102_queries_1/_snippets/popular.90s.comedy.vector.mdx +++ /dev/null @@ -1 +0,0 @@ -[0.011615951545536518,-0.041561901569366455,-0.007776559796184301,-0.017821554094552994,-0.019294733181595802,0.02404019609093666,-0.020011767745018005,-0.029985059052705765,-0.009738625958561897,-0.019777102395892143,0.011420396156609058,0.011381285265088081,0.02126331813633442,0.0006441083969548345,-0.00040923902997747064,0.003252727212384343,0.03358326479792595,-0.010207957588136196,0.012528539635241032,-0.028029512614011765,-0.002819247543811798,0.0020549544133245945,-0.01788673922419548,-0.022514868527650833,0.01934688165783882,0.009816847741603851,0.022697385400533676,-0.018251774832606316,-0.010566473938524723,-0.010149290785193443,0.01877325400710106,-0.004126205109059811,-0.00828500185161829,-0.005051830783486366,-0.019829250872135162,-0.010410030372440815,-0.016257116571068764,0.0006098863086663187,0.026621518656611443,0.007385450415313244,-0.0030881352722644806,-0.022228054702281952,-0.009340997785329819,-0.014562309719622135,-0.030480464920401573,0.006016567349433899,0.013806164264678955,-0.016739485785365105,-0.009862476959824562,0.009706033393740654,0.018121404573321342,-0.012671946547925472,-0.018434293568134308,-0.014979492872953415,0.01627015322446823,0.007848263718187809,-0.0029773209244012833,-0.004012131132185459,0.01617889478802681,-0.0035590962506830692,-0.01808229461312294,-0.010951065458357334,-0.022175906226038933,0.019868360832333565,-0.014275495894253254,0.026308629661798477,0.012763205915689468,-0.017169706523418427,0.01797799952328205,-0.012104838155210018,0.02184998244047165,0.001570956315845251,0.003663392039015889,0.01199402380734682,-0.00033468377660028636,-0.017182743176817894,-0.002327101305127144,0.012163504958152771,-0.01229387428611517,0.00793952215462923,0.033557191491127014,-0.02727336622774601,0.001487845554947853,-0.0004795165150426328,0.010455659590661526,-0.0013110315194353461,-0.020024804398417473,0.02434004656
970501,-0.016987187787890434,-0.008154632523655891,0.00921062845736742,0.019073104485869408,0.04265700653195381,0.018786290660500526,-0.01969888061285019,0.02829025126993656,-0.011915802024304867,0.012052690610289574,0.006078492850065231,-0.008910777047276497,0.01067728828638792,-0.007672264240682125,-0.02985468879342079,-0.02032465487718582,0.02174568735063076,-0.007561449892818928,-0.006088270805776119,0.010553437285125256,0.014523197896778584,0.011902764439582825,-0.0010103661334142089,0.024300934746861458,0.016635188832879066,-0.034965187311172485,-0.0123916519805789,-0.01603548787534237,0.018681995570659637,-0.00523108895868063,-0.02810773439705372,-0.02727336622774601,0.011818024329841137,-0.003754650941118598,0.035851702094078064,-0.005879679229110479,0.02834239974617958,0.006049159914255142,-0.01842125691473484,-0.024613821879029274,0.013362906873226166,0.011055361479520798,0.042500562965869904,-0.012228689156472683,0.024744192138314247,-0.00845448300242424,-0.03801583871245384,0.01804318279027939,-0.0005760716740041971,0.018238738179206848,-0.011661580763757229,-0.01119224913418293,0.019490288570523262,0.04466470330953598,-0.015696527436375618,-0.004611832555383444,-0.011166175827383995,0.017560815438628197,0.019516361877322197,0.005459236446768045,0.00791996717453003,-0.004644425120204687,0.027351589873433113,-0.0021250280551612377,0.00931492354720831,0.02199338935315609,-0.01035788282752037,0.01852555200457573,-0.00765922712162137,0.008949887938797474,-0.01954243704676628,-0.020311618223786354,0.002983839251101017,0.011022768914699554,0.027586255222558975,-0.022319313138723373,-0.028811730444431305,0.02364908531308174,0.044299665838479996,0.007985151372849941,0.006094789132475853,-0.010755510069429874,0.012502466328442097,0.020063916221261024,-0.043569594621658325,0.017808517441153526,0.0009932550601661205,0.016922002658247948,-0.018316959962248802,0.015096825547516346,-0.0012108096852898598,-0.004862794186919928,0.019033994525671005,0.02149798348546028,0
.01464053150266409,0.04393463209271431,-0.014431939460337162,0.011446470394730568,0.013291203416883945,-0.023636048659682274,-0.007913447916507721,-0.007809152361005545,0.020168211311101913,0.025917520746588707,0.008637000806629658,-0.025813225656747818,-0.6378734707832336,-0.014353717677295208,-0.006000271067023277,-0.0254221148788929,0.025630706921219826,-0.000653071328997612,-0.013949571177363396,-0.006593453697860241,-0.01813444308936596,-0.009490923024713993,0.006208862643688917,0.011270470917224884,0.030324021354317665,-0.0015571045223623514,-0.022997237741947174,-0.011094472371041775,0.02169353887438774,-0.011244397610425949,0.0050779045559465885,-0.007326784078031778,-0.03029794618487358,0.016061563044786453,-0.0065804170444607735,-0.017678147181868553,0.021758724004030228,0.02439219318330288,0.015852971002459526,-0.018121404573321342,-0.008734778501093388,0.0032282827887684107,-0.025200486183166504,0.04409107565879822,-0.01295224204659462,-0.0017583629814907908,0.0620560348033905,-0.004879090469330549,-0.009080258198082447,0.031888458877801895,0.004005612805485725,0.03402652218937874,-0.010866324417293072,-0.009093294851481915,-0.0010959213832393289,0.0012735502095893025,0.0009679959621280432,0.0068183415569365025,0.0057493094354867935,-0.005625457968562841,-0.009471368044614792,-0.014992529526352882,0.028811730444431305,0.005648272577673197,0.019998731091618538,0.002716581104323268,0.011974467895925045,0.0035982071422040462,0.012593724764883518,0.0030946535989642143,0.01316735241562128,0.0011545877205207944,-0.003885020734742284,0.015096825547516346,-0.015996377915143967,0.0022065092343837023,-0.005462495610117912,0.022462720051407814,0.015474897809326649,0.006968267261981964,0.02018124796450138,-0.030584760010242462,0.01046217791736126,0.006384862121194601,-0.0280555859208107,0.0009606626117601991,-0.007835226133465767,0.014927344396710396,0.013310759328305721,-0.0005911456537432969,-0.00889774039387703,0.018264811486005783,0.007535375654697418,0.00614693
7143057585,-0.015227195806801319,-0.006309899501502514,0.02888995222747326,-0.01817355304956436,-0.01475786417722702,0.014405865222215652,0.01808229461312294,0.0027882845606654882,0.02082006074488163,0.0020859171636402607,-0.017274001613259315,-0.027820920571684837,-0.0026008780114352703,-0.018590737134218216,-0.026751887053251266,0.010123216547071934,0.033948302268981934,-0.04443003609776497,0.028081659227609634,-0.02302331104874611,0.02263220213353634,-0.001355031388811767,-0.018004072830080986,0.012372096069157124,0.0030995425768196583,0.027638401836156845,0.02145887352526188,-0.009803811088204384,0.012847946025431156,-0.012763205915689468,-0.008337150327861309,-0.044690776616334915,-0.005928567610681057,-0.019281696528196335,0.010305734351277351,0.02003784105181694,0.010996694676578045,-0.000906070286873728,0.00887166615575552,-0.036868587136268616,-0.014875196851789951,0.007548412773758173,-0.006401158403605223,0.014471050351858139,0.019047031179070473,0.008187225088477135,-0.009797291830182076,-0.0036959846038371325,-0.006544565316289663,0.004536869935691357,0.022905977442860603,-0.005478791892528534,-0.010031958110630512,0.030271872878074646,0.010488252155482769,-0.02823810465633869,0.002866506576538086,-0.008610926568508148,-0.0014226606581360102,-0.005309311207383871,-0.021758724004030228,-0.007053007371723652,-0.0036731697618961334,-0.03358326479792595,0.008304557763040066,0.006225158926099539,-0.018356071785092354,-0.0018724366091191769,-0.01014277245849371,-0.012639353983104229,-0.006665157154202461,-0.00048807202256284654,0.01018840167671442,-0.009999365545809269,-0.006837897468358278,-0.022449683398008347,-0.004536869935691357,-0.015566157177090645,0.02144583687186241,0.01573563739657402,-0.021145984530448914,-0.0022423609625548124,0.01744348183274269,-0.008056854829192162,-0.009275812655687332,0.016922002658247948,-0.013219499960541725,-0.018395181745290756,0.007346339523792267,0.011818024329841137,-0.00828500185161829,0.0021625093650072813,-0.0079525
58808028698,0.0424223393201828,0.008213299326598644,-0.013428092002868652,0.023922862485051155,-0.0058210124261677265,0.0014259199379011989,-0.015018603764474392,-0.020311618223786354,-0.026021817699074745,0.03629495948553085,-0.025682855397462845,0.033348601311445236,0.02169353887438774,-0.020780950784683228,0.008102484978735447,-0.009295368567109108,0.014810011722147465,-0.01799103617668152,-0.03402652218937874,0.01730007492005825,-0.01651785708963871,0.02149798348546028,0.016061563044786453,0.026360778138041496,0.004680276848375797,0.018095331266522408,-0.003523244522511959,0.01681770756840706,-0.020455025136470795,0.014275495894253254,-0.032566383481025696,-0.006759675219655037,-0.03778117522597313,0.0037742063868790865,0.030324021354317665,0.007652708794921637,0.014705715700984001,0.0021543612238019705,-0.02072880230844021,-0.004611832555383444,0.028550991788506508,-0.03230564296245575,0.027038700878620148,-0.01434068102389574,-0.0051561263389885426,-0.02014213800430298,-0.015722600743174553,0.022215018048882484,0.017365260049700737,0.0008555519743822515,0.013519350439310074,-0.005403829272836447,-0.016882892698049545,-0.003640577429905534,-0.023244939744472504,-0.004748720675706863,0.027090849354863167,0.0045401290990412235,0.039997462183237076,-0.0019180660601705313,-0.03812013566493988,0.021771760657429695,-0.012750168330967426,0.03001113422214985,-0.0055374582298099995,-0.023766418918967247,0.024548638612031937,0.012274319306015968,-0.011498618870973587,0.008474038913846016,-0.0012678465573117137,0.012658909894526005,0.002385767875239253,-0.009047665633261204,0.022905977442860603,-0.01735222339630127,-0.017730295658111572,-0.002615544479340315,0.005866642110049725,0.018551625311374664,-0.0225279051810503,0.028863878920674324,-0.002630211180076003,0.010423067025840282,0.03251423314213753,-0.007522339001297951,-0.03274890035390854,0.020833097398281097,-0.010194920003414154,0.0028599880170077085,1.3049713743384928e-05,-0.0012458466226235032,-0.0223584249615669
25,-0.03113231435418129,0.019868360832333565,-0.006681453436613083,-0.022658275440335274,0.004276130348443985,-0.014119052328169346,-0.004898645915091038,-0.0029463579412549734,-0.04059716314077377,0.03522592782974243,-0.01473178993910551,-0.017091484740376472,-0.01563134230673313,-0.044795069843530655,0.02512226440012455,0.0069030821323394775,-0.007854782044887543,-0.00035607258905656636,-0.001487845554947853,-0.020246433094143867,-0.00619582599028945,0.01832999661564827,-0.020207323133945465,0.019333845004439354,-0.02101561613380909,0.0261000394821167,-0.0031712460331618786,0.003614503424614668,0.01563134230673313,0.0031924310605973005,0.007215969730168581,0.005446199327707291,-1.573604458826594e-05,0.008363224565982819,-0.04135330766439438,-0.03420904278755188,0.013375943526625633,-0.015970302745699883,-0.005279977805912495,-0.028577065095305443,0.0039045761805027723,-0.00017212890088558197,0.00024444342125207186,-0.005211533512920141,-0.02800343744456768,0.007933003827929497,-0.018004072830080986,-0.005586347077041864,-0.006671675946563482,-0.016504820436239243,0.018238738179206848,-0.020194286480545998,0.0011171064106747508,-0.03199275583028793,-0.023049384355545044,-0.0031223571859300137,0.11921016871929169,0.008304557763040066,0.02507011778652668,0.0156443789601326,-0.015566157177090645,-0.01754777878522873,-0.015540082938969135,0.0002884432324208319,0.014158163219690323,-0.008337150327861309,0.03175808861851692,-0.005899234674870968,0.01975102908909321,-0.006909600459039211,0.011720247566699982,0.005625457968562841,-0.009829884395003319,0.002861617598682642,0.03108016587793827,-0.013714905828237534,-0.028264177963137627,-0.0026790997944772243,0.0072355251759290695,0.014418902806937695,0.0024395452346652746,-0.03577347844839096,0.019033994525671005,0.031106239184737206,0.018538588657975197,-0.01935991831123829,-0.03337467461824417,0.008434928022325039,-0.021628353744745255,0.002825765870511532,0.03689466044306755,0.00887166615575552,0.016152821481227875,-0.00
15913266688585281,0.018199628219008446,-0.008617444895207882,0.015109862200915813,0.0030213205609470606,0.007985151372849941,-0.019138289615511894,0.025278707966208458,-0.027742698788642883,-0.012007060460746288,0.027142997831106186,0.004618350882083178,-0.021380651742219925,0.03342682123184204,-0.010058032348752022,-0.013962607830762863,0.02111991122364998,0.01774333231151104,0.004918201360851526,-0.008226335979998112,-0.013245574198663235,-0.026230407878756523,0.007887374609708786,-0.008441446349024773,-0.020415915176272392,0.025474263355135918,-0.004357611294835806,-0.01084676943719387,-0.02394893579185009,-0.003927391022443771,-0.0007162191905081272,-0.0016817707801237702,-0.012932686135172844,-0.014796975068747997,-0.01131610106676817,-0.021145984530448914,0.021667465567588806,0.011648544110357761,-0.004087093751877546,0.013649720698595047,-0.005543976556509733,-0.00789389293640852,0.012756687588989735,-0.002947987522929907,-0.01969888061285019,-0.003251097397878766,-0.04575980827212334,0.011466026306152344,0.010396993719041348,0.00845448300242424,0.00018628625548444688,-0.029098544269800186,0.0032722826581448317,0.03240993991494179,-0.005710198078304529,-0.007887374609708786,0.015657415613532066,0.005983974784612656,-0.004315241239964962,0.0359559990465641,0.03327037766575813,0.027351589873433113,0.003950205631554127,-0.0009557737503200769,-0.03895450383424759,-0.007691819686442614,0.0019962878432124853,-0.03214919939637184,-0.006844415795058012,-0.00896944385021925,0.0248224139213562,-0.019633695483207703,-0.0016271783970296383,0.028994249179959297,0.008832555264234543,0.005035534501075745,-0.01170720998197794,0.021132947877049446,0.030167577788233757,-0.001693178084678948,0.034287262707948685,-0.023049384355545044,0.008813000284135342,0.016974151134490967,-0.04080575332045555,0.016530893743038177,0.017143631353974342,-0.0006881082081235945,-0.0005108867189846933,0.014679642394185066,-0.03274890035390854,-0.0017029558075591922,-0.00023690641683060676,-0.00340
59116151183844,0.04091005027294159,-0.0018919920548796654,-0.026856184005737305,-0.03475659340620041,-0.010931509546935558,0.01375401671975851,0.02238449826836586,0.00921062845736742,-0.034287262707948685,-0.04466470330953598,0.012372096069157124,0.015657415613532066,-0.009471368044614792,-0.0029105062130838633,-0.018264811486005783,-0.004830202087759972,-0.02359693869948387,0.003614503424614668,0.00557330995798111,-0.013832238502800465,0.004210945218801498,0.005658050533384085,0.0009492552489973605,-0.010527363047003746,-0.03598207235336304,-0.0010478474432602525,-0.002965913387015462,0.005658050533384085,0.02003784105181694,0.01817355304956436,-0.008767371065914631,0.01053388137370348,-0.005335384979844093,0.013871349394321442,-0.000724367331713438,0.001041329000145197,-0.008760851807892323,-0.013089130632579327,0.012919649481773376,0.012737131677567959,-0.008219817653298378,-0.006290344055742025,0.03924131765961647,-0.01329772174358368,0.010390475392341614,0.011172694154083729,-0.008839073590934277,-0.00872825924307108,-0.022371461614966393,0.004184871446341276,-0.006877008359879255,-0.015266306698322296,0.00654782447963953,-0.0010665880981832743,0.0034776150714606047,0.01084676943719387,0.022201981395483017,-0.012671946547925472,0.008910777047276497,0.03287927061319351,0.007222488056868315,0.023244939744472504,0.008161150850355625,-0.012730613350868225,-0.01114661991596222,-0.018786290660500526,-0.0038524281699210405,-0.001970214070752263,0.006202344316989183,0.017912814393639565,0.02091131918132305,0.0018414737423881888,0.007998188957571983,0.007092118263244629,0.038094062358140945,-0.014575346373021603,-0.05141785740852356,-0.006798786111176014,-0.0205723587423563,-0.01959458366036415,-0.03645140305161476,-0.013597572222352028,-0.03822443261742592,0.008780407719314098,0.00033325786353088915,0.016087636351585388,0.0109445471316576,0.0036568734794855118,0.005977456457912922,-0.0024411750491708517,-0.010696844197809696,0.01911221630871296,0.007372413296252489,0.0
34182969480752945,-0.011127064004540443,-0.025057079270482063,-0.005051830783486366,0.016348375007510185,-0.004611832555383444,0.011309581808745861,0.0041229454800486565,0.022736497223377228,0.028577065095305443,-0.005850345827639103,0.012632835656404495,-0.017873702570796013,-0.03436548635363579,-0.04208337888121605,0.029385358095169067,0.003907835576683283,-0.0011228100629523396,-0.027977364137768745,-0.01587904430925846,-0.008298039436340332,0.013454166240990162,0.010735955089330673,-0.0018561403267085552,0.008584853261709213,-0.005563532002270222,-0.0027442846912890673,0.01920347474515438,-0.003119098022580147,-0.017078446224331856,-0.032383862882852554,-0.008082929067313671,0.0054559772834181786,-0.025787150487303734,0.003233171533793211,0.018994882702827454,-0.006632564589381218,0.028655286878347397,-0.004510795697569847,-0.002770358696579933,0.011459507048130035,-0.0005630346713587642,-0.010690325871109962,0.014744826592504978,0.008461001329123974,0.02267131209373474,0.007346339523792267,0.0010649585165083408,0.008447964675724506,-0.016504820436239243,0.032227419316768646,-0.007854782044887543,0.001358290552161634,-0.03371363505721092,-0.004699832294136286,-0.002092435723170638,0.022788645699620247,0.012593724764883518,-0.002015843288972974,0.018734144046902657,-0.004514055326581001,0.03063690848648548,-0.021915167570114136,-0.034678373485803604,-0.004145760554820299,-0.018981846049427986,-0.0012523650657385588,-0.005638495087623596,0.021471910178661346,-0.0011716987937688828,0.0038263543974608183,-0.002449322957545519,0.016100673004984856,-0.006567379925400019,-0.010247068479657173,0.005847086664289236,-0.01612674631178379,0.018316959962248802,-0.006877008359879255,0.008148114196956158,0.007496264763176441,-0.03152342513203621,0.0384330227971077,0.003979539033025503,-0.008519668132066727,-0.014705715700984001,-0.0016703633591532707,-0.0019816213753074408,-0.0012784390710294247,-0.010566473938524723,0.012958760373294353,0.03152342513203621,-0.0094778863713145
26,-0.04456040635704994,-0.02092435583472252,0.032331716269254684,-0.01133565604686737,0.013532388024032116,0.013336832635104656,0.016452671959996223,0.02570892870426178,-0.011537729762494564,-0.0022276942618191242,0.000284980284050107,-0.02458774857223034,-0.010696844197809696,-0.00967344082891941,0.01993354596197605,0.023779455572366714,-0.021823909133672714,-0.023766418918967247,-0.0031451720278710127,-0.003624281147494912,0.043230634182691574,-0.0015155491419136524,0.04171834513545036,0.001864288467913866,0.05376451462507248,-0.004386944696307182,0.03650354966521263,-0.0003338689566589892,-0.03040224313735962,-0.0006962563493289053,-0.015435786917805672,0.002428137930110097,0.011511655524373055,0.004253315273672342,0.019020957872271538,0.0035427999682724476,-0.01647874526679516,-0.0312887579202652,-0.014458013698458672,-0.024887599050998688,-0.013082611374557018,-0.018747180700302124,0.015096825547516346,0.032175272703170776,0.006189307197928429,-0.004149019718170166,-0.012215652503073215,0.007906929589807987,0.016283191740512848,-0.03676429018378258,0.008043818175792694,0.010025439783930779,0.0159833412617445,0.012828391045331955,-0.001454438315704465,0.012117874808609486,-0.011863653548061848,0.03147127479314804,0.015396676026284695,0.00791996717453003,0.004699832294136286,-0.009803811088204384,-0.0016752522205933928,-0.01464053150266409,0.0011032546171918511,-0.005423384718596935,-0.006759675219655037,0.0030865054577589035,-0.02355782687664032,-0.0004734054091386497,-0.020207323133945465,0.010207957588136196,-0.00012823093857150525,-0.0010486622340977192,0.01194839458912611,0.02423574961721897,0.00786781869828701,0.011889727786183357,-0.01503164041787386,-0.025721965357661247,-0.003627540310844779,-0.0006730342283844948,-0.011531210504472256,0.004126205109059811,0.010742473416030407,-0.0052506448701024055,-0.012482910417020321,-0.0024916932452470064,-0.012313430197536945,-0.0008058484527282417,-0.02649114839732647,0.014823049306869507,-0.014327643439173698,0.
0020631025545299053,-0.010924991220235825,0.02888995222747326,-0.034965187311172485,0.0060817524790763855,0.005407088436186314,0.0011912542395293713,-0.013727942481637001,0.006463083904236555,0.017912814393639565,-0.005804716609418392,0.013375943526625633,0.0037122806534171104,0.003225023625418544,-0.014523197896778584,-0.010807658545672894,-0.0026236926205456257,-0.037598658353090286,-0.010866324417293072,-0.004592277109622955,-0.010827213525772095,-0.011074916459619999,-0.003937168512493372,0.0018414737423881888,-0.00284043257124722,0.018160516396164894,0.2244446873664856,-0.004338055849075317,-0.009217146784067154,0.03201882913708687,0.014927344396710396,0.017026299610733986,0.008858629502356052,0.008702185936272144,0.014875196851789951,0.022984199225902557,-0.030715130269527435,0.011413877829909325,-0.011231360025703907,0.0019245845032855868,-0.008369742892682552,-0.013558461330831051,-0.019829250872135162,-0.03293141722679138,-0.04372603818774223,-0.027012627571821213,0.003549318527802825,-0.0012042912421748042,-0.018290886655449867,-0.004279389511793852,0.015292380005121231,0.014875196851789951,-0.007039970252662897,-0.020989540964365005,0.027690550312399864,0.02761232852935791,0.0012898464919999242,-0.024509526789188385,0.007600560784339905,-0.004100130870938301,-0.014653568156063557,-0.005589606240391731,-0.01578778587281704,-0.025187449529767036,-0.0036438365932554007,0.009008554741740227,-0.006270788609981537,-0.003588429419323802,0.016348375007510185,-0.01278276089578867,-0.004386944696307182,0.008819518610835075,0.008975962176918983,0.019073104485869408,-0.008878185413777828,0.018264811486005783,-0.012033134698867798,-0.00619582599028945,0.016087636351585388,0.045081883668899536,-0.0114334337413311,-0.0028078400064259768,0.016152821481227875,-0.0070790816098451614,-0.02967217192053795,0.014940381981432438,0.019959619268774986,0.03908487409353256,-0.021576205268502235,0.003591688582673669,0.0017991035711020231,0.016687337309122086,-0.031497348099946976,-0
.0014055497013032436,0.010507808066904545,-0.011589877307415009,0.007365894969552755,-0.0036568734794855118,-0.02766447700560093,0.002648137044161558,-0.0053712367080152035,-0.02727336622774601,0.02844669483602047,0.0280555859208107,-0.002059843158349395,0.022462720051407814,-0.015396676026284695,-0.00175021484028548,-0.012143949046730995,0.017274001613259315,-0.025787150487303734,-0.0248224139213562,0.014092978090047836,-0.0024444342125207186,-0.003114209044724703,-0.008402335457503796,0.0060752336867153645,-0.02946357987821102,0.005873160436749458,0.012332985177636147,0.016152821481227875,0.023440495133399963,-0.02677796222269535,0.019764065742492676,-0.010553437285125256,0.005397310480475426,-0.030141502618789673,-0.03433941304683685,0.021250281482934952,-0.02810773439705372,-0.009692996740341187,-0.02316671796143055,0.009354034438729286,0.015748674049973488,0.014092978090047836,-0.003692725207656622,-0.006071974523365498,-0.017678147181868553,0.011648544110357761,0.012548095546662807,0.01715666800737381,0.01920347474515438,0.008975962176918983,-0.010273141786456108,-0.002178805647417903,-0.027846993878483772,-0.03134090453386307,-0.030949795618653297,0.0030571722891181707,0.009719070047140121,-0.016948077827692032,0.005006201099604368,-0.007724412251263857,0.015005567111074924,-0.007131229154765606,-0.016113709658384323,0.018264811486005783,0.0017404371174052358,-0.0005227015353739262,-0.012678464874625206,0.0020076953805983067,0.014692679047584534,0.01063165906816721,-0.013988682068884373,-0.012235207483172417,-0.006837897468358278,0.005234348587691784,0.0029219137504696846,-0.016674300655722618,-0.008187225088477135,0.0024786563590168953,-0.04521225392818451,0.005628717131912708,0.00019402695761527866,0.005899234674870968,7.363857730524614e-05,-0.008930332958698273,-0.016987187787890434,-0.004527091979980469,0.010625140741467476,0.005162645131349564,-0.047871798276901245,-0.018434293568134308,-0.032383862882852554,-0.014275495894253254,0.04315241053700447,-0.0
1969888061285019,0.007750486023724079,0.016113709658384323,6.951359682716429e-05,-0.02184998244047165,-0.005527680739760399,-0.1651003509759903,0.02165442705154419,0.005504865664988756,-0.019333845004439354,0.008369742892682552,-0.00133466103579849,0.014810011722147465,-0.0008274409919977188,0.0036470957566052675,-0.007111673709005117,0.0032902085222303867,0.011250915937125683,-0.031210536137223244,-0.021589241921901703,0.007509301882237196,-0.00026888775755651295,-0.014079940505325794,-2.4332941848115297e-06,-0.01119224913418293,0.007574486546218395,-0.0026188038755208254,0.015292380005121231,0.022371461614966393,0.012593724764883518,0.013636683113873005,0.016987187787890434,-0.003940428141504526,0.027742698788642883,-0.011576840654015541,0.00325598637573421,-0.011322619393467903,-0.03314000740647316,0.017026299610733986,0.009764700196683407,0.026034854352474213,0.004162056837230921,0.0009565885993652046,0.0018170294351875782,-0.0005198496510274708,-0.003933909349143505,0.019620658829808235,0.027846993878483772,0.01548793539404869,-0.007300710305571556,-0.0042467969469726086,0.033296454697847366,0.01877325400710106,-0.01226780004799366,0.0039045761805027723,-0.00477805407717824,-0.005599383730441332,-0.027403736487030983,0.04184871166944504,-0.0074506355449557304,0.0038263543974608183,0.008741296827793121,-0.012737131677567959,0.0248224139213562,0.008350186981260777,-0.01657000370323658,-0.005553754512220621,-0.023675160482525826,0.009158479981124401,-0.019568510353565216,-0.023101532831788063,-0.030897649005055428,-0.039945315569639206,0.0041685751639306545,-0.00957566313445568,0.0028730249032378197,-0.01187669113278389,-0.020793987438082695,0.016869856044650078,-0.030376167967915535,0.02888995222747326,0.0017534741200506687,-0.03632103279232979,0.028212029486894608,0.013402017764747143,0.012522021308541298,0.015970302745699883,0.04784572497010231,0.006049159914255142,0.016309265047311783,0.007209451403468847,0.009327961131930351,-0.008936851285398006,-0.002477026
544511318,-0.014158163219690323,-0.015579193830490112,0.0021478428971022367,-0.019920509308576584,-0.0039013170171529055,-0.009745144285261631,-0.014405865222215652,0.024639897048473358,0.006498935632407665,0.030819425359368324,0.016309265047311783,-0.01444497611373663,-0.01299787126481533,-0.010755510069429874,-0.015096825547516346,0.02263220213353634,0.020050879567861557,0.014119052328169346,0.015722600743174553,0.007144266273826361,0.0029512469191104174,-0.011074916459619999,0.009966772980988026,-0.0064467876218259335,0.010540400631725788,0.011407359503209591,-0.00011865690612467006,0.002752432832494378,0.020259469747543335,-0.012287355959415436,0.0009500700980424881,-0.0012637724867090583,0.05222615227103233,0.0004167760198470205,-0.030871573835611343,-0.02268434874713421,0.006759675219655037,-0.005814494099467993,-0.07816974818706512,-0.0018268071580678225,0.005237607751041651,0.030506538227200508,-0.012743650004267693,0.041327234357595444,0.004406500142067671,0.023349234834313393,-0.027377663180232048,0.04500366374850273,-0.00960173737257719,-0.025630706921219826,-0.005511384457349777,-0.02473115548491478,0.020311618223786354,-0.01647874526679516,-0.02053324691951275,-0.026308629661798477,-0.013988682068884373,0.01671341061592102,0.0067857494577765465,-0.003286949126049876,-0.009745144285261631,-0.006368565838783979,-0.04422144591808319,-0.02126331813633442,-0.019959619268774986,0.027064776048064232,0.009764700196683407,0.002232583239674568,0.0004811461258213967,-0.019086143001914024,0.0014601419679820538,-0.02585233561694622,-0.002486804500222206,0.01876021735370159,-0.017221853137016296,-0.010370919480919838,0.01112054567784071,-0.044247519224882126,0.007255080621689558,0.018551625311374664,-0.0006151826237328351,-0.023636048659682274,0.0008987369947135448,-0.03832872956991196,-0.01065773330628872,0.013193425722420216,0.007326784078031778,-0.004266352392733097,-0.020611468702554703,-0.0018789550522342324,-0.021863019093871117,-0.00502249738201499,0.013141278
177499771,-0.012378614395856857,0.03822443261742592,-0.007385450415313244,-0.0016736226389184594,-0.007202932611107826,-0.006606490816920996,-0.006599972490221262,-0.013949571177363396,0.02668670378625393,0.0052506448701024055,-0.03191453218460083,-0.009855958633124828,-0.02111991122364998,0.005227829795330763,-0.02933320961892605,-0.015813859179615974,0.010129734873771667,-0.033113934099674225,0.003198949620127678,0.012130912393331528,0.006626046262681484,0.0005447014118544757,-0.000984292128123343,0.02370123378932476,-0.0261391494423151,-0.012319948524236679,-0.021054726094007492,-0.01847340352833271,-0.01478393841534853,0.026073964312672615,0.018825402483344078,0.01866895891726017,-0.010983658023178577,-0.0016377709107473493,-0.023779455572366714,0.01975102908909321,0.02815988101065159,0.011798469349741936,0.014601420611143112,-0.009288850240409374,-0.0011179212015122175,-0.012482910417020321,-0.007737448904663324,0.022371461614966393,0.02282775565981865,-0.011909283697605133,-0.005478791892528534,-0.0751451626420021,0.004823683295398951,0.02131546661257744,-0.010025439783930779,0.010031958110630512,-0.034182969480752945,-0.0036959846038371325,-0.012724095024168491,0.025057079270482063,-0.007300710305571556,-0.031549498438835144,0.0192556232213974,-0.010344845242798328,0.0028730249032378197,-0.019242586567997932,-0.01559223048388958,0.00758752366527915,-0.00280458084307611,0.018681995570659637,0.004210945218801498,0.02096346765756607,0.002835543593391776,0.002139694755896926,0.0280555859208107,0.006890045013278723,-0.0032266532070934772,-0.00779611524194479,0.02590448409318924,-0.009979809634387493,-0.016596078872680664,0.025721965357661247,-0.029489653185009956,-0.009543071500957012,0.014914307743310928,0.019724953919649124,-0.023127606138586998,-0.023349234834313393,0.018838439136743546,0.010566473938524723,-0.006714046001434326,-0.02012910135090351,-0.017730295658111572,0.03592992201447487,-0.030324021354317665,-0.006156715098768473,-0.011961431242525578,-0.02
4861525744199753,0.009471368044614792,0.021589241921901703,-0.02076791226863861,0.01881236582994461,-0.0036470957566052675,-0.028029512614011765,-0.014810011722147465,-0.01302394550293684,-0.012991352938115597,0.003627540310844779,0.021954277530312538,-0.0036862066481262445,-0.012717576697468758,0.03074120357632637,0.01671341061592102,0.013102167285978794,-0.0014055497013032436,0.00045996104017831385,-0.01715666800737381,-0.018851475790143013,-0.00034690595930442214,0.008441446349024773,-0.025278707966208458,-0.010781584307551384,0.0015880673890933394,0.01514897309243679,0.03645140305161476,0.003461318789049983,-0.007672264240682125,-0.014510161243379116,-0.002375989919528365,-0.016439635306596756,0.005055089946836233,0.025539448484778404,0.0074506355449557304,-0.021915167570114136,0.03175808861851692,0.0170002244412899,0.03793761879205704,-0.027012627571821213,-0.017873702570796013,0.0034939113538712263,0.008734778501093388,-0.0069291559047997,-0.005087682511657476,0.018486440181732178,-0.0026367297396063805,0.01285446435213089,0.02800343744456768,-0.02101561613380909,0.009158479981124401,-0.0007418858003802598,0.004344574175775051,-0.0028713953215628862,-0.013845275156199932,-0.028707435354590416,-0.00852618645876646,0.0076070791110396385,0.005159385967999697,-0.008741296827793121,-0.0029545060824602842,-0.019659768790006638,0.012261281721293926,0.009367072023451328,-0.0056156800128519535,-0.0074897464364767075,0.0257610771805048,-0.004765016958117485,0.010853287763893604,-0.012078763917088509,-0.02238449826836586,-0.0047845724038779736,-0.010807658545672894,0.02085917256772518,-0.00658367620781064,0.04197908192873001,-8.484223508276045e-05,0.01808229461312294,0.027925215661525726,0.002375989919528365,-0.016869856044650078,0.0019278437830507755,-0.006127381697297096,-0.003298356430605054,-0.014353717677295208,-0.022006426006555557,0.0035427999682724476,0.029307136312127113,-0.016087636351585388,-0.004279389511793852,0.05616331845521927,-0.02400108426809311,0.06210
818514227867,0.0059937527403235435,-0.01173328422009945,0.0053223478607833385,-0.005935086403042078,0.019320808351039886,-0.002861617598682642,-0.006961748469620943,-0.01063165906816721,-0.008180706761777401,-0.002651396207511425,-0.011818024329841137,0.009908106178045273,-0.0004110723384656012,0.019920509308576584,-0.005002941936254501,-0.003303245408460498,0.001389253418892622,0.014392828568816185,-0.014862160198390484,0.012652391567826271,-0.009217146784067154,0.019633695483207703,-0.008115521632134914,-0.0043836855329573154,-0.01520112156867981,0.011831061914563179,-0.012352541089057922,-0.021641390398144722,-0.033348601311445236,0.006065456196665764,-0.03259245678782463,-0.002398804761469364,-0.014418902806937695,0.01939903013408184,-0.0234665684401989,-0.009679959155619144,-0.006714046001434326,0.001711103948764503,0.016348375007510185,-0.005882938392460346,0.008669593371450901,-0.014314606785774231,-0.02380552887916565,0.0397888720035553,0.00478783156722784,0.003950205631554127,-0.008298039436340332,-0.020063916221261024] \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/images/search-conceptual-dark.png b/docs/academy/py/zero_to_mvp/102_queries_1/images/search-conceptual-dark.png deleted file mode 100644 index 0145b4ea7..000000000 Binary files a/docs/academy/py/zero_to_mvp/102_queries_1/images/search-conceptual-dark.png and /dev/null differ diff --git a/docs/academy/py/zero_to_mvp/102_queries_1/index.mdx b/docs/academy/py/zero_to_mvp/102_queries_1/index.mdx deleted file mode 100644 index aa55801e7..000000000 --- a/docs/academy/py/zero_to_mvp/102_queries_1/index.mdx +++ /dev/null @@ -1,48 +0,0 @@ ---- -title: P3_102 Queries 1 (Python) -description: Begin querying Weaviate with basic search and filter techniques. -sidebar_position: 102 ---- - -import ZeroToMvpDeprecationNote from '/docs/academy/py/zero_to_mvp/_snippets/deprecation.md'; - - - -## Unit overview - -import ReactPlayer from 'react-player/lazy' - - -
- -In this unit, you will learn how to efficiently retrieve relevant objects or aggregated information from Weaviate. - -You have already encountered some examples of vector searches. In this section, we will delve deeper by reviewing the various vector search methods available in Weaviate, such as `nearVector`, `nearObject`, and `nearText`. - -Along with vector search methods, you will also discover filters that can be employed to accompany search operators. For instance, you will learn how to search for data objects that exclude specific criteria. - -As we examine these capabilities, we will simultaneously use them as a means to gain insight into the inner workings of Weaviate. - -Upon completing this unit, you will possess a thorough understanding of how to effectively query Weaviate to obtain desired results, as well as the underlying mechanisms that make it all possible. - -### Prerequisites - -- (**Required**) A Python (3) environment with `weaviate-client` installed. -- (**Required**) Complete [101A Weaviate Academy Preparation](../setup.mdx) -- (*Recommended*) Complete [Hello, Weaviate](../101_hello_weaviate/index.mdx) - -## Learning objectives - -import LearningGoalsExp from '/src/components/Academy/learningGoalsExp.mdx'; - - - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/zero_to_mvp/103_schema_and_imports/05_preparation.mdx b/docs/academy/py/zero_to_mvp/103_schema_and_imports/05_preparation.mdx deleted file mode 100644 index 416321d50..000000000 --- a/docs/academy/py/zero_to_mvp/103_schema_and_imports/05_preparation.mdx +++ /dev/null @@ -1,108 +0,0 @@ ---- -title: Preparation ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PythonCodeExample from 
'!!raw-loader!./_snippets/05_create_instance.py'; - -## Create Weaviate Instance - -For this unit, you will require your own instance of Weaviate. - -If you do not have one, we recommend setting one up on the [Weaviate Cloud (WCD)](https://console.weaviate.cloud/). **A free sandbox instance should be just fine.** - -Follow the "Get Started with WCD" [section on this page](../101_hello_weaviate/40_set_up.mdx#--get-started-with-wcd) and come back. - -## Access your Weaviate Instance - -Now, let's make sure that you can access your Weaviate instance by following the example below. - -:::tip What is your Weaviate instance address? -Make sure you know what your Weaviate instance address is - it should look like: -``` -https://your-instance-name.weaviate.network -``` -You can find it in your WCD dashboard. -::: - -The code to access depends on whether you have authentication turned on, or off. If you are not sure, go to your WCD dashboard and check what it says about "Authentication". - -### With API key authentication (Recommended) - -If authentication is on, you can instantiate your client as shown below. - - - - - - - -### Without Authentication - -If authentication is switched off in your instance, you do not need the authentication parameter. - - - - - - - -## Confirm access - -Once your client is instantiated, you can confirm that you can write to your Weaviate instance by running the following code: - - - - - - - -You should see a long alphanumeric string printed on your screen, like `"59340403-4bcd-479f-ae9c-de5da039ac0e"`. - -If you do, you can access your Weaviate instance and have write access! Which means you are ready to move on. - -Let's do one more thing before we move on, though, which is to clean up the test data that we've just created. - - - - - - - -We'll see you in the next section. - -## Review - -### Key takeaways - -- You need to create a Weaviate instance (e.g. on WCD) for this unit. 
-- Access your Weaviate instance with API key authentication or without authentication as appropriate. -- Confirm access to your Weaviate instance by running the provided code snippet. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/zero_to_mvp/103_schema_and_imports/10_data_structure.mdx b/docs/academy/py/zero_to_mvp/103_schema_and_imports/10_data_structure.mdx deleted file mode 100644 index 7e6def43f..000000000 --- a/docs/academy/py/zero_to_mvp/103_schema_and_imports/10_data_structure.mdx +++ /dev/null @@ -1,311 +0,0 @@ ---- -title: Data structure in Weaviate -description: Establish data structure in Weaviate for efficient organization and search. ---- - -## Overview - -import ReactPlayer from 'react-player/lazy' - - -
- -You've seen some of the powerful queries that Weaviate enables. But how does Weaviate actually store data such that it can supports these queries? - -In this section, we'll take a look at some of the key components that allow Weaviate to perform these queries at speed. In particular, we'll take a look at indexes, which are the backbone of Weaviate's data structure, and the schema, which acts as a blueprint for your data. - -## Indexes - -An index is a data structure that allows for efficient retrieval of data. In Weaviate, there are two main indexes: the inverted index and the vector index. - -The **inverted index** is the kind of index that you may be familiar with. You can think of it as a reference table that for example allows you to quickly look up a term and find objects that contain that term. - -The **vector index** allows for efficient retrieval of vectors based on similarity. This is the index that allows Weaviate to perform vector searches fast. Let's dig in a little more. - -### Inverted index - -An inverted index deconstructs text into a set of constituent terms and stores them in a data structure, such as a hash table. Take, for example, a data object containing the text "Conference (Chicago, IL)". - -The user might want to search for this object based on any of the contained terms such as "conference", "Chicago", or "IL". The inverted index allows Weaviate to quickly retrieve the ID of the object containing the term. - -This is done by mapping the object ID to "tokens", where a token is a term that has been extracted from the text. By default, Weaviate uses a `word` tokenization, where only alpha-numeric characters are kept, converted into lowercase, and then split into tokens based on whitespace. - -So an input text `Conference (Chicago, IL)` is indexed by three tokens: `conference`, `chicago`, `il`. - -We will cover more about different available tokenization methods later on. 
- -### Vector index - -Each object in Weaviate can be associated with a vector. These vectors are what enables similarity searches that you have seen before. As we mentioned, however, brute-force similarity searches are computationally expensive, as well as growing linearly with the size of the dataset. - -To tackle this problem Weaviate uses vector indexes that utilize an Approximate Nearest Neighbor (ANN) algorithm. The ANN algorithm enables each vector index to organize a set of vectors, so that similar ones to a query can be retrieved at lightning-fast speeds. Weaviate currently uses an HNSW-based ANN index. - -import HNSWLayersImg from '/docs/weaviate/concepts/img/hnsw-layers.svg'; - - - -Each set of vectors are said to reside in a "vector space", indicating that it is a multi-dimensional "space" in which vectors are placed. - -## Classes - -### What is a `class`? - -A class in Weaviate is a collection of objects of the same type. Each object in Weaviate must belong to a class, and one class only. - -Imagine that you are storing a set of quiz items from the game show *Jeopardy!* in Weaviate. A good way to structure it would be to have an object represent a question including all associated attributes, such as the answer, what round it was from, how many points it was worth, when it aired on TV, and so on. - -So, a good way to represent this data would be through a class called `JeopardyQuestion`, which would contain a set of objects, each object representing one such question. - -:::note Class names are singular by convention -This is as they refer to individual objects, e.g. a `JeopardyQuestion` object. -::: - -### What is in a class? - -As we mentioned, each Jeopardy! question would contain multiple related, but distinct, attributes such as the question, answer, round, points, and so on. These are reflected in each `class` object in Weaviate as a set of `properties`, such as a `question` property, an `answer` property, and so on. 
- -### How many vectors per object? - -Each object is represented by one vector, and each class has one vector index. This means that all objects in the class will be associated with the same vector index. - -In other words, all objects in the class will be stored in what is called the same vector space. This is important to keep in mind when designing your data schema in Weaviate. A vector search can only be performed within a single vector space, for reasons that a vector of different lengths, or even those of the same length but with different meanings, cannot be compared. - -Going back to our [color analogy](../101_hello_weaviate/15_overview_vectors.mdx#-how-do-numbers-represent-meaning) that you saw earlier - you wouldn't be able to compare an RGB value to an CMYK value, right? The same applies to vector embeddings that represent text. - -So in Weaviate, a vector search can only search one class at a time. As a result, it is important to design your schema such that objects that you want to search together are in the same class. - -## Schema - -A `schema` in Weaviate is the blueprint that defines its data structure. It does so for each `class` of objects, which are collections of objects of the same type. - -Here is an example schema structure: - -
- Example schema - -```json -{ - "classes": [ - { - "class": "Article", - "invertedIndexConfig": { - "bm25": { - "b": 0.75, - "k1": 1.2 - }, - "cleanupIntervalSeconds": 60, - "stopwords": { - "additions": null, - "preset": "en", - "removals": null - } - }, - "moduleConfig": { - "text2vec-openai": { - "model": "ada", - "modelVersion": "002", - "type": "text", - "vectorizeClassName": true - } - }, - "properties": [ - { - "dataType": [ - "text" - ], - "moduleConfig": { - "text2vec-openai": { - "skip": false, - "vectorizePropertyName": false - } - }, - "name": "title", - "tokenization": "word" - }, - { - "dataType": [ - "text" - ], - "moduleConfig": { - "text2vec-openai": { - "skip": false, - "vectorizePropertyName": false - } - }, - "name": "body", - "tokenization": "word" - } - ], - "replicationConfig": { - "factor": 1 - }, - "shardingConfig": { - "virtualPerPhysical": 128, - "desiredCount": 1, - "actualCount": 1, - "desiredVirtualCount": 128, - "actualVirtualCount": 128, - "key": "_id", - "strategy": "hash", - "function": "murmur3" - }, - "vectorIndexConfig": { - "skip": false, - "cleanupIntervalSeconds": 300, - "maxConnections": 32, - "efConstruction": 128, - "ef": -1, - "dynamicEfMin": 100, - "dynamicEfMax": 500, - "dynamicEfFactor": 8, - "vectorCacheMaxObjects": 1000000000000, - "flatSearchCutoff": 40000, - "distance": "cosine", - "pq": { - "enabled": false, - "segments": 0, - "centroids": 256, - "encoder": { - "type": "kmeans", - "distribution": "log-normal" - } - } - }, - "vectorIndexType": "hnsw", - "vectorizer": "text2vec-openai" - } - ] -} -``` - -
- -This is a lot of information, and can be quite intimidating. Let's break it down. - -First of all, you see that the first level key in the object is `classes`, which contains a list of classes. In this case, there is only one class, `Article`. - -The schema specifies for each class: -- The metadata such as its name (`class`), -- Its data `properties`, -- The `vectorizer`, -- Module configurations (`moduleConfig`), -- The index configurations (for inverted `invertedIndexConfig` and vector `vectorIndexConfig` indexes), -- and more. - -:::info Auto-schema -Any missing information required for schema definition will be automatically inferred by Weaviate based on default values and the imported data. -::: - -## Review - -### Review exercise - - - - - - - - - -### Key takeaways - -- Weaviate stores data using two main indexes: the inverted index and the vector index. -- A class in Weaviate represents a collection of objects of the same type, and each object in Weaviate must belong to a single class. -- A schema is the blueprint that defines Weaviate's data structure. -- Vector searches can only be performed within a single vector space. - - So, any objects you want to search together should be in the same class. -- Any missing information required for schema definition will be automatically inferred by Weaviate based on default values and the imported data. 
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - -import Quiz from '/src/components/Academy/quiz.js' -export const invertedIndex = [ -{ -questionText: 'What is the function of an inverted index in Weaviate?', -answerOptions: [ -{ -answerText: 'It allows for efficient retrieval of vectors based on similarity.', -isCorrect: false, -feedback: 'That is the function of the vector index.', -}, -{ -answerText: 'It deconstructs text into a set of constituent terms and stores them for fast retrieval.', -isCorrect: true, -feedback: 'This allows objects containing those terms to be retrieved quickly.', -}, -{ -answerText: 'It acts as a blueprint for your data.', -isCorrect: false, -feedback: 'That is the function of the schema.', -}, -] -}]; -export const vectorIndex = [ -{ -questionText: 'What does the vector index in Weaviate enable?', -answerOptions: [ -{ -answerText: 'It enables efficient retrieval of data based on a reference table.', -isCorrect: false, -feedback: 'That is the function of the inverted index, not the vector index.', -}, -{ -answerText: 'It enables similarity searches by associating each object with a vector.', -isCorrect: true, -feedback: 'Correct! The vector index uses an Approximate Nearest Neighbor (ANN) algorithm to allow fast similarity searches.', -}, -{ -answerText: 'It defines the data structure of Weaviate.', -isCorrect: false, -feedback: 'That is the function of the schema, not the vector index.', -}, -] -}]; -export const classDefinition = [ -{ -questionText: 'What is a class in Weaviate?', -answerOptions: [ -{ -answerText: 'It is a type of index used for efficient data retrieval.', -isCorrect: false, -feedback: 'A class is not a type of index. It is a collection of objects of the same type in Weaviate.', -}, -{ -answerText: 'It is a collection of objects of the same type.', -isCorrect: true, -feedback: 'Correct! 
A class in Weaviate is a collection of objects of the same type.', -}, -{ -answerText: 'It is a specific object within a collection.', -isCorrect: false, -feedback: 'A class is not a specific object, but rather a collection of objects of the same type.', -}, -] -}]; -export const schemaRole = [ -{ -questionText: 'What is the function of the schema in Weaviate?', -answerOptions: [ -{ -answerText: 'It allows for efficient retrieval of vectors based on similarity.', -isCorrect: false, -feedback: 'That is the function of the vector index, not the schema.', -}, -{ -answerText: 'It deconstructs text into a set of constituent terms and stores them in a data structure.', -isCorrect: false, -feedback: 'That is the function of the inverted index, not the schema.', -}, -{ -answerText: 'It acts as a blueprint that defines the data structure of Weaviate.', -isCorrect: true, -feedback: 'The schema defines the data structure for each class of objects in Weaviate.', -}, -] -}, -] diff --git a/docs/academy/py/zero_to_mvp/103_schema_and_imports/20_schema.mdx b/docs/academy/py/zero_to_mvp/103_schema_and_imports/20_schema.mdx deleted file mode 100644 index 1899959fd..000000000 --- a/docs/academy/py/zero_to_mvp/103_schema_and_imports/20_schema.mdx +++ /dev/null @@ -1,249 +0,0 @@ ---- -title: How to define a schema -description: Design your Weaviate schema for structured and optimized search capabilities. ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PythonCodeExample from '!!raw-loader!./_snippets/20_schema.py'; - -## Overview - -import ReactPlayer from 'react-player/lazy' - - -
- -A `schema` in Weaviate is the blueprint that defines its data structure for each `class` of objects. A class is a collection of objects of the same type. - -In this section, you will learn how to define a schema and gain insight into some key considerations while doing so. - -## How to define a schema - -As you learned earlier, a schema definition includes a great deal of information. Let's cover a few of those properties in this section, starting with: -- The metadata such as its name (`class`), -- Its data `properties`, -- The `vectorizer`, and -- Module configurations (`moduleConfig`). - -### Metadata definition - -You can define for each class and property a name and description. - -For classes, these are called: -- `class` (required), and -- `description` (optional). - -For properties, these are called: -- `name` (required), and -- `description` (optional). - -In defining a class, the only required parameter is `class`, as the rest can be inferred by Weaviate. However, it is recommended to include a description for each class and property, as this will help you and others understand the data structure. - -To define a class, you can use this syntax. - - - - - - - -### Properties with data types - -Each `class` definition will include one or more properties, which must have a data type. If you do not specify a data type, Weaviate will automatically assign one based on your data. But for more predictable results, we recommend that you manually specify them in the schema if possible. - -Currently, Weaviate data type support includes the following types: - -import DataTypes from '/_includes/datatypes.mdx'; - -
- Available data types in Weaviate - -
- -Note that most data types can include one such instance, or an array of instances, such as `text` or `text[]`. - - - - - - - -:::tip Did you get an error? -If you ran the first class creation command shown, and this command, Weaviate will throw an error as the class `Article` already exists. For the purposes of this section, delete the class by running the following command.


- -**Deleting a class should not be done lightly, as deleting a class will delete all of its objects.** - - - - - - - -::: - -### Setting the vectorizer - -The `vectorizer` parameter for the class specifies the Weaviate module that will be used to generate vector embeddings for the class. - -For text objects, you would typically select one of the `text2vec` modules - such as `text2vec-cohere`, `text2vec-huggingface`, `text2vec-openai`, or `text2vec-palm`. - -Modules are enabled at the instance level through its configuration. You can see the list of available modules for your particular instance by running the following command. - - - - - - - - - -:::info What is a `module`, exactly? -By now, you've probably seen mentions of Weaviate `modules` here and there. Modules are optional Weaviate components used to enhance and customize its capabilities.


- -Weaviate Academy units will generally assume WCD usage, which is pre-configured with a set of modules. We will cover how to enable modules for local instances in another unit, or you can see our [Docker installation page](/deploy/installation-guides/docker-installation.md). -::: - -WCD instances come pre-configured with a number of modules. For example, the response below shows that the `text2vec-openai` module is available, so we can use it in our schema. - -
- See the JSON response - -```json -{ - "generative-openai": { - "documentationHref": "https://beta.openai.com/docs/api-reference/completions", - "name": "Generative Search - OpenAI" - }, - "qna-openai": { - "documentationHref": "https://beta.openai.com/docs/api-reference/completions", - "name": "OpenAI Question & Answering Module" - }, - "ref2vec-centroid": {}, - "text2vec-cohere": { - "documentationHref": "https://docs.cohere.com/docs/embeddings", - "name": "Cohere Module" - }, - "text2vec-huggingface": { - "documentationHref": "https://huggingface.co/docs/api-inference/detailed_parameters#feature-extraction-task", - "name": "Hugging Face Module" - }, - "text2vec-openai": { - "documentationHref": "https://beta.openai.com/docs/guides/embeddings/what-are-embeddings", - "name": "OpenAI Module" - } -} -``` - -
- - - - - - - - - -:::info Vectorizers and user-provided vectors -Note that you can set the vectorizer to `None`, if you would prefer to only deal with your own vectors by providing them at import time.


- -In some cases, you can use a vectorizer while uploading them at import time. In this case, you will need to ensure that the vectorizer (e.g. `text2vec-cohere`) is using the same model as the one you used to generate the vectors, so that the vectors are compatible. -::: - -### Class-level module configurations - -You can set the `moduleConfig` parameter at the class-level to set class-wide settings for module behavior. For example, the vectorizer could be configured to set the model used (`model`), or whether to vectorize the class name (`vectorizeClassName`). - - - - - - - - - -### Property-level module configurations - -You can also set the `moduleConfig` parameter at the property level to set module behavior for each property. For example, you could set whether to vectorize the property name (`vectorizePropertyName`), or whether to skip the property from vectorization altogether (`skip`). - -In the following example, the `skip` parameter is set to `True` for the `url` property, so that the URL text will be skipped when producing a vector embedding for the object. - - - - - - - - - -:::note But wait, what about the other options? -There are other settings that we haven't covered yet - such as the index settings, or cluster settings such as those relating to replication. We'll cover these in other units later on. -::: - -## Why so many options? - -This might all seem very complex, especially if you are new to Weaviate or databases. But these options will directly impact how your data is stored and how it will react to various queries. - -We'll ingest some data in the next section, and then you'll see how these options impact the results of your queries. - -## Review - -### Review exercise - -:::note Exercise -Do you have a dataset that you are interested in adding to Weaviate? - -Try to construct a schema for that dataset based on what you've learned here. 
-::: - -### Key takeaways - -- A schema in Weaviate serves as a blueprint defining the data structure for each class of objects. -- A class represents a collection of objects of the same type. -- Schema definition includes metadata, data properties, the vectorizer, and module configurations. -- Data properties in a class need to be assigned a specific data type, such as `text` or `number`. -- The vectorizer parameter determines which Weaviate module will be used to generate vector embeddings for a class. -- Module configurations at the class and property levels allow customization of module behavior across the entire class or per property, respectively. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/zero_to_mvp/103_schema_and_imports/30_import.mdx b/docs/academy/py/zero_to_mvp/103_schema_and_imports/30_import.mdx deleted file mode 100644 index d9c6d4524..000000000 --- a/docs/academy/py/zero_to_mvp/103_schema_and_imports/30_import.mdx +++ /dev/null @@ -1,189 +0,0 @@ ---- -title: How to import data -description: Learn data import techniques in Weaviate for seamless data integration. ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PythonCodeExample from '!!raw-loader!./_snippets/30_import.py'; - -## Overview - -import ReactPlayer from 'react-player/lazy' - - -
- -So far, you've learned that data in Weaviate is represented by objects, which belong to a `class`, and have a set of `properties`. In the previous section on the `schema`, you learned how to create a framework for this structure. - -In this section, you will learn how to import data into Weaviate, including our recommended best practices, and some key considerations. Once you're done with this section, you will be ready to import a real dataset into Weaviate by putting together what we've learned about the schema and imports. - -## Imports: A high-level view - -### Object creation - -To create a Weaviate object, you must: - -- Load your source data, -- Build a data object with any desired properties, and -- Add it to your desired class in Weaviate. - -Optionally, you can manually specify: - -- An object ID, and -- A vector - -:::info Optional properties - -- An object ID is required. So, if one is not specified, Weaviate will create one. -- If a vector is not specified, Weaviate will create one if a vectorizer is set for the class. - -We'll cover these in more detail later on. -::: - -### Use batch imports - -Weaviate offers a "batch import" process to maximize the import speed. Take a look at this diagram that shows the object creation process: - -import ImportConceptualImg from './images/academy_103_import_process_conceptual.png'; - -Conceptual diagram of the import process - -In the figure, a request is made to create an object based on the provided data. - -If the vector is not provided and a vectorizer is specified, Weaviate (core) will send a request to the vectorizer module for a vector embedding. If that module is an inference-API based module, such as `text2vec-cohere`, it must then contact the inference API to request the appropriate vector. - -You can imagine that in such a configuration, the network latencies may add a significant amount of time for large datasets, or become a bottleneck. 
- -A batch import significantly reduces the impact of network latencies by processing multiple objects per request, and our clients (e.g. the Python client) can parallelize the process as well. You should use batch imports unless you have a good reason not to, as it will significantly improve the speed of data ingestion. - -### Batch import syntax - -The batch syntax is shown below, with some of the parameters: - - - - - - - -Note that here, each `data_object` would be a Python dictionary whose keys correspond to the class properties. - -:::tip Do *not* specify vectors as a part of the `data_object` -Vectors should be specified separately so that Weaviate knows it to be **the** object vector and it can be indexed. See [below](#-example-with-id--vector) for the right syntax on how to specify the vector. -::: - -### Error handling - -The batch import process is configured to handle errors at a batch level, and at individual object level. - -#### Batch-level errors - -Any batch-level errors are indicated via the HTTP response code for the batch request. -- A 200 response code indicates that batch was successfully sent, connected and processed. -- A 4xx status code indicates that the request was malformed. - -#### Object-level errors - -Since a batch contains multiple objects, individual errors may occur during import even when the request was successful. - -Accordingly, object-level errors are returned as a part of the response the batch creation requests. - -### Error handling syntax - -If there are any object-level errors during import, they will be visible in the returned responses to `batch.create_objects()` or `batch.create_references()`. - -In the response, each object's `result["result"]["errors"]` value can be inspected to see if there were any errors. 
- -A convenient way to do so is to define a callback function and specifying it while initializing the batch: - - - - - - - -This pattern would capture and print any object-level errors found during the import process. - -## Optional parameters - -### ID - -Each object in Weaviate has a unique identifier. This ID must be a [UUID](https://en.wikipedia.org/wiki/Universally_unique_identifier), and can be user-provided, but if not, Weaviate will generate a random UUID. - -Weaviate does not check if a duplicate object is being created. As a result, using a deterministic uuid may prevent accidental creation of duplicate objects. - -### Vector - -Each object in Weaviate can have a vector embedding to represent it, although this is not mandatory. - -- If a vector is specified at creation time, Weaviate will use that vector to represent the object. -- If a vector is not specified, Weaviate will check to see if a vectorizer setting applies to the relevant class. - - If so, Weaviate will send a request to that vectorizer module for a vector embedding. - - If not, the object will not have a vector representation. - -:::info How is a vectorizer determined for the class? -- If a vectorizer was set in the schema for the class, this vectorizer will be used. -- If not, Weaviate will check for a default vectorizer setting for that Weaviate instance. -::: - -:::note Upload vectors and use a vectorizer -It is possible to both upload your own vectors and specify a vectorizer for Weaviate. For example, if you are importing a large dataset and have vectorized the data using a vectorizer that is also available through Weaviate, this may be a useful approach. This will allow you to use Weaviate to vectorize any updates to the dataset, as well as to vectorize queries as necessary. -::: - -We will explore these options in more detail in another unit. - -### Example with id & vector - -To manually specify the object ID and vector, the syntax is as follows. 
- - - - - - - -With these parameters, you have the option of manually specifying the object ID and vector. - -## Review - -### Review exercise - -:::note Exercise -- After studying the batch import process, try to explain it in your own words. - - Include in your explanation what the batch import process is, why it is used, and how it improves the speed of data ingestion. -- Can you recall which parameters were optional in the object creation process? - - Can you imagine scenarios in which you might want to, or not want to, specify these parameters? -::: - -### Key takeaways - -- To create a Weaviate object, you need to build a data object with properties, and add it to your desired class. -- Use batch imports to maximize import speed and minimize network latency. Batch import processes multiple objects per request, and clients can parallelize the process. -- Error handling during import can be done at the batch level or individual object level. -- You can manually specify an object ID and vector for each object in Weaviate. - - If an ID is not specified, Weaviate will create one. - - If a vector is not specified, Weaviate will create one if a vectorizer is specified. - - The vectorizer setting can be set in the schema. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/zero_to_mvp/103_schema_and_imports/40_example.mdx b/docs/academy/py/zero_to_mvp/103_schema_and_imports/40_example.mdx deleted file mode 100644 index eff7023a8..000000000 --- a/docs/academy/py/zero_to_mvp/103_schema_and_imports/40_example.mdx +++ /dev/null @@ -1,289 +0,0 @@ ---- -title: Populate your Weaviate instance! -description: View examples of schema creation in Weaviate for effective data setup. 
---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PythonCodeExample from '!!raw-loader!./_snippets/40_import_example_1.py'; - -## Overview - -import ReactPlayer from 'react-player/lazy' - - -
- -It's time to put what we've learned into action! In this section, we will: - -- Download a small dataset, -- Build a schema corresponding to the dataset, and -- Import it to your WCD instance. - -### Dataset used - -We are going to use data from a popular quiz game show called *Jeopardy!*. - -The original dataset can be found [here on Kaggle](https://www.kaggle.com/datasets/tunguz/200000-jeopardy-questions), but we'll use a [small subset from it, just containing 100 rows](https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/jeopardy_100.json). - -Here's a preview of a few rows of data. - -| | Air Date | Round | Value | Category | Question | Answer | -|---:|:-----------|:-----------------|--------:|:-----------------|:---------------------------------------------------------------------------------------------------------------------|:-------------------------------------------| -| 0 | 2006-11-08 | Double Jeopardy! | 800 | AMERICAN HISTORY | Abraham Lincoln died across the street from this theatre on April 15, 1865 | Ford's Theatre (the Ford Theatre accepted) | -| 1 | 2005-11-18 | Jeopardy! | 200 | RHYME TIME | Any pigment on the wall so faded you can barely see it | faint paint | -| 2 | 1987-06-23 | Double Jeopardy! | 600 | AMERICAN HISTORY | After the original 13, this was the 1st state admitted to the union | Vermont | - -For now, let's keep it simple by populating Weaviate with just the `Round`, `Value`, `Question` and `Answer` columns. - -:::note Exercise -Can you remember what the next steps should be? -::: - -## Build a schema - -The next step is to build a schema, making some decisions about how to represent our data in Weaviate. - -### Add class names & properties - -First of all, we'll need a name. The name refers to each row or item (*note: singular*), so I called it `JeopardyQuestion`. Then, I need to define properties and types. 
- -You saw above that we'll be populating Weaviate with `Round`, `Value`, `Question` and `Answer` columns. We need names for Weaviate `properties` - these names are sensible, but we follow the GraphQL convention of capitalizing classes and leaving properties as lowercases, so the names will be `round`, `value`, `question` and `answer`. - -Then, we should select datatypes. All of `round`, `question` and `answer` are text, so we can simply choose `text` as our datatype. `value` is a number, but I know that values in *Jeopardy!* represent dollar amounts, meaning that they are always integers. So we'll use `int`. - - - - - - - -### Set & configure the vectorizer - -For this example, we will obtain our object vectors using an inference service. So to do that, we must set the `vectorizer` for the class. We'll use `text2vec-openai` in this case, and we can configure the module also at the class-level. - - - - - - - -### Skipping a property from vectorization - -You might have noticed the property-level module configuration here: - - - - - - - -This configuration will exclude the `round` property from the vectorized text. You might be asking - why might we choose to do this? - -Well, the answer is that whether the question belonged to "Jeopardy!", or "Double Jeopardy!" rounds simply do not add much to impact its meaning. You know by now that the vectorizer creates a vector representation of the object. In case of a text object, Weaviate first combines the text data according to an internal set of rules and your configuration. - -It is the combined text that is vectorized. 
So, the difference between vectorizing the `round` property and skipping it would be something like this: - -```json -// If the property is vectorized -answer {answer_text} question {question_text} category {category_text} -``` - -Against: - -```json -// If the property is skipped -answer {answer_text} question {question_text} -``` - -More specifically, something like the difference between: - -```json -// If the property is vectorized -answer faint paint question any pigment on the wall so faded you can barely see it category double jeopardy! -``` - -Against: - -```json -// If the property is skipped -answer faint paint question any pigment on the wall so faded you can barely see it -``` - -The additional information is not particularly significant in capturing the meaning of the quiz item, which is mainly in the question and answer, as well as perhaps the category (not yet used). - -:::tip Skipping vectorization has no impact on filtering -Importantly, excluding the `round` column from vectorization will have no impact on our ability to filter the results based on the `round` value. So if you wanted to only search a set of `Double Jeopardy!` questions, you still can. -::: - -### Create the class - -We can now add the class to the schema. - - - - - - - -Now, you can check that the class has been created successfully by retrieving its schema: - - - - - - - -
- See the full schema response - - - -
- -:::note The retrieved schema is even longer! -Although we've defined a lot of details here, the retrieved schema is still longer. The additional details relate to the vector index, the inverted index, sharding and tokenization. We'll cover many of those as we go. -::: - -If you see a schema that is close to the example response - awesome! You're ready to import the data. - -## Import data - -Here, we'll show you how to import the requisite data, including how to configure and use a batch. - -### Load data - -We've made the data available online - so, fetch and load it like so: - - - -### Configure batch and import data - -And let's set up a batch import process. As mentioned earlier, the batch import process in Weaviate can send data in bulk and in parallel. - -In Python, we recommend that you use a context manager like: - - - -Note the use of parameters `batch_size` and `num_workers`. They specify the number of objects sent per batch, as well as how many processes are used for parallelization. - -Then, the next step is to build data objects & add them to the batch process. We build objects (as Python dictionaries) by passing data from corresponding columns to the right Weaviate property, and the client will take care of when to send them. - - - -Then, let's check that we've got the right number of objects imported: - - - -If this assertion returns `True`, you've successfully populated your Weaviate instance! - -### What happens if this runs again? - -Before we go on, I have a question. What do you think will happen if you run the above import script again? - -
- The answer is... - -That you will end up with duplicate items!


- -Weaviate does not check if you are uploading items with the same properties as ones that exist already. And since the import script did not provide an ID, Weaviate will simply assign a new, random ID, and create new objects. - -
- -### Specify object UUID - -You could specify an object UUID at import time to serve as the object identifier. The Weaviate Python client, for example, provides a function to create a deterministic UUID based on an object. So, it could be added to our import script as shown below: - - - -What this will do is to create objects whose UUID is based on the object properties. Accordingly, if the object properties remain the same, so will the UUID. - -Running the above script multiple times will *not* cause the number of objects to increase. - -:::tip What is your desired behavior? -Because the UUID is based on the object properties, it will still create new objects in case some property has changed. So, when you design your import process, consider what properties might change, and how you would want Weaviate to behave in these scenarios.


- -Then you could, for instance, design your UUID to be created based on a subset of unique properties, to have the objects be overwritten, or alternatively have the UUID be created from the entire set of properties to only prevent duplicates. -::: - -### Full import script - -Putting it all together, we get the following import script: - - - -## Review - -### Key takeaways - -We have: -- Downloaded a small dataset of Jeopardy! questions and answers. -- Built a schema and imported our data. -- Verified the successful import by checking the object count in Weaviate. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/zero_to_mvp/103_schema_and_imports/90_wrap_up.mdx b/docs/academy/py/zero_to_mvp/103_schema_and_imports/90_wrap_up.mdx deleted file mode 100644 index f85432d44..000000000 --- a/docs/academy/py/zero_to_mvp/103_schema_and_imports/90_wrap_up.mdx +++ /dev/null @@ -1,25 +0,0 @@ ---- -title: Wrap-up -description: Schema and Imports Wrap-up ---- - -## Unit review - -In this unit, you saw how data is structured in Weaviate, and got hands-on experience populating your own instance of Weaviate from start to finish. - -You have got hands-on experience in defining and creating a schema including classes, properties, data types and vectorization configurations. You also had a chance to import data according to this schema, and learned how to use batch imports to ensure that imports are as fast as possible. - -Now, you should have a broad understanding of Weaviate's indexes and schema, and how they work to store data to enable efficient, flexible, and powerful retrieval. - -### Learning outcomes - -Having finished this unit, you should be able to: -- Describe how the schema and indexes relate to how data is stored in Weaviate. -- Understand how classes and properties represent your data, and how to define them. -- Populate Weaviate with data, using batch imports. 
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/zero_to_mvp/103_schema_and_imports/_snippets/05_create_instance.py b/docs/academy/py/zero_to_mvp/103_schema_and_imports/_snippets/05_create_instance.py deleted file mode 100644 index 75fc9ae28..000000000 --- a/docs/academy/py/zero_to_mvp/103_schema_and_imports/_snippets/05_create_instance.py +++ /dev/null @@ -1,38 +0,0 @@ -# ===== Instantiate Weaviate client w/ auth config ===== -import weaviate - -client = weaviate.Client( - url="https://WEAVIATE_INSTANCE_URL", # Replace with your Weaviate endpoint - auth_client_secret=weaviate.auth.AuthApiKey(api_key="YOUR-WEAVIATE-API-KEY"), # Replace with your API Key for the Weaviate instance. Delete if authentication is disabled. - additional_headers={ - "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY", - }, -) - -client.is_ready() # This should return `True` -# ===== END Instantiate Weaviate client w/ auth config ===== -assert client.is_ready() # This should return `True` - -# ===== Instantiate Weaviate client w/o auth ===== -import weaviate - -client = weaviate.Client( - url="https://anon-endpoint.weaviate.network", # Replace with your Weaviate endpoint - additional_headers={ - "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY", - }, -) - -client.is_ready() # This should return `True` -# ===== END Instantiate Weaviate client w/o auth ===== -assert client.is_ready() # This should return `True` - -# ===== Confirm that the client can access the instance ===== -response = client.data_object.create({"name": "dummy"}, "TestClass") -print(response) # This should be a UUID, like "59340403-4bcd-479f-ae9c-de5da039ac0e" -# ===== END Confirm that the client can access the instance ===== -assert response.count("-") == 4 # This should be a UUID, like "59340403-4bcd-479f-ae9c-de5da039ac0e" - -# ===== Delete our test object ===== -client.data_object.delete(response, class_name="TestClass") -# ===== END Delete our test object ===== diff 
--git a/docs/academy/py/zero_to_mvp/103_schema_and_imports/_snippets/20_schema.py b/docs/academy/py/zero_to_mvp/103_schema_and_imports/_snippets/20_schema.py deleted file mode 100644 index 845154b51..000000000 --- a/docs/academy/py/zero_to_mvp/103_schema_and_imports/_snippets/20_schema.py +++ /dev/null @@ -1,258 +0,0 @@ -# ===== Instantiate Weaviate client w/ auth config ===== -import weaviate - -client = weaviate.Client( - url="https://WEAVIATE_INSTANCE_URL", # Replace with your Weaviate endpoint - auth_client_secret=weaviate.auth.AuthApiKey(api_key="YOUR-WEAVIATE-API-KEY"), # Replace with your API Key for the Weaviate instance. Delete if authentication is disabled. - additional_headers={ - "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY", - }, -) - -# ===== Create a class with metadata ===== -class_obj = { - "class": "Article", -} - -client.schema.create_class(class_obj) -# ===== END Create a class with metadata ===== - -# Test -classes = [c["class"] for c in client.schema.get()["classes"]] -assert "Article" in classes - -# ===== Delete the class ===== -client.schema.delete_class("Article") -# ===== END Delete the class ===== - -# Test -classes = [c["class"] for c in client.schema.get()["classes"]] -assert "Article" not in classes - -# ===== Create properties with datatypes ===== -class_obj = { - "class": "Article", - "properties": [ - { - "name": "title", - # highlight-start - "dataType": ["text"], - # highlight-end - }, - { - "name": "body", - # highlight-start - "dataType": ["text"], - # highlight-end - }, - { - "name": "url", - # highlight-start - "dataType": ["text"], - # highlight-end - }, - ], -} - -client.schema.create_class(class_obj) -# ===== END Create properties with datatypes ===== - -# Test -classes = [c["class"] for c in client.schema.get()["classes"]] -property_names = [c["name"] for c in client.schema.get("Article")["properties"]] -datatypes = [c["dataType"] for c in client.schema.get("Article")["properties"]] -assert "Article" in classes -for p in 
["title", "body", "url"]: - assert p in property_names -assert datatypes == [["text"], ["text"], ["text"]] - -# ===== NOT SHOWN - Delete the class ===== -client.schema.delete_class("Article") -# ===== NOT SHOWN - Delete the class ===== - -# Test -classes = [c["class"] for c in client.schema.get()["classes"]] -assert "Article" not in classes - -# ===== Get module list ===== -module_metadata = client.get_meta() -module_metadata['modules'] -# ===== END Get module list ===== - -assert 'text2vec-openai' in module_metadata['modules'].keys() - -# ===== Create a class with a vectorizer ===== -class_obj = { - "class": "Article", - "properties": [ - { - "name": "title", - "dataType": ["text"], - }, - { - "name": "body", - "dataType": ["text"], - }, - { - "name": "url", - "dataType": ["text"], - }, - ], - # highlight-start - "vectorizer": "text2vec-openai" - # highlight-end -} - -client.schema.create_class(class_obj) -# ===== END Create a class with a vectorizer ===== - -# Test -classes = [c["class"] for c in client.schema.get()["classes"]] -property_names = [c["name"] for c in client.schema.get("Article")["properties"]] -datatypes = [c["dataType"] for c in client.schema.get("Article")["properties"]] -vectorizer = client.schema.get("Article")["vectorizer"] -assert "Article" in classes -for p in ["title", "body", "url"]: - assert p in property_names -assert datatypes == [["text"], ["text"], ["text"]] -assert vectorizer == "text2vec-openai" - -# ===== NOT SHOWN - Delete the class ===== -client.schema.delete_class("Article") -# ===== NOT SHOWN - Delete the class ===== - -# Test -classes = [c["class"] for c in client.schema.get()["classes"]] -assert "Article" not in classes - -# ===== Class-level moduleConfig ===== -class_obj = { - "class": "Article", - # highlight-start - "moduleConfig": { - "text2vec-openai": { - "vectorizeClassName": False, - "model": "ada", - "modelVersion": "002", - "type": "text" - } - }, - # highlight-end - "properties": [ - { - "name": "title", - 
"dataType": ["text"], - }, - { - "name": "body", - "dataType": ["text"], - }, - { - "name": "url", - "dataType": ["text"], - }, - ], - "vectorizer": "text2vec-openai" -} - -client.schema.create_class(class_obj) -# ===== END Class-level moduleConfig ===== - -# Test -classes = [c["class"] for c in client.schema.get()["classes"]] -property_names = [c["name"] for c in client.schema.get("Article")["properties"]] -datatypes = [c["dataType"] for c in client.schema.get("Article")["properties"]] -vectorizer = client.schema.get("Article")["vectorizer"] -module_config = client.schema.get("Article")["moduleConfig"] -assert "Article" in classes -for p in ["title", "body", "url"]: - assert p in property_names -assert datatypes == [["text"], ["text"], ["text"]] -assert vectorizer == "text2vec-openai" -assert module_config[vectorizer]["vectorizeClassName"] == False -assert module_config[vectorizer]["model"] == "ada" -assert module_config[vectorizer]["modelVersion"] == "002" - -# ===== NOT SHOWN - Delete the class ===== -client.schema.delete_class("Article") -# ===== NOT SHOWN - Delete the class ===== - -# Test -classes = [c["class"] for c in client.schema.get()["classes"]] -assert "Article" not in classes - -# ===== Property-level moduleConfig ===== -class_obj = { - "class": "Article", - "moduleConfig": { - "text2vec-openai": { - "vectorizeClassName": False, - "model": "ada", - "modelVersion": "002", - "type": "text" - } - }, - "properties": [ - { - "name": "title", - "dataType": ["text"], - }, - { - "name": "body", - "dataType": ["text"], - # highlight-start - "moduleConfig": { - "text2vec-openai": { - "skip": False, - "vectorizePropertyName": True - } - } - # highlight-end - }, - { - "name": "url", - "dataType": ["text"], - # highlight-start - "moduleConfig": { - "text2vec-openai": { - "skip": True, - } - } - # highlight-end - }, - ], - "vectorizer": "text2vec-openai" -} - -client.schema.create_class(class_obj) -# ===== END Property-level moduleConfig ===== - -# Test -classes = 
[c["class"] for c in client.schema.get()["classes"]] -property_names = [c["name"] for c in client.schema.get("Article")["properties"]] -datatypes = [c["dataType"] for c in client.schema.get("Article")["properties"]] -vectorizer = client.schema.get("Article")["vectorizer"] -module_config = client.schema.get("Article")["moduleConfig"] -assert "Article" in classes -for p in ["title", "body", "url"]: - assert p in property_names -assert datatypes == [["text"], ["text"], ["text"]] -assert vectorizer == "text2vec-openai" -assert module_config[vectorizer]["vectorizeClassName"] == False -assert module_config[vectorizer]["model"] == "ada" -assert module_config[vectorizer]["modelVersion"] == "002" -properties = client.schema.get("Article")["properties"] -for p in properties: - if p["name"] == "body": - assert p["moduleConfig"]["text2vec-openai"]["skip"] == False - assert p["moduleConfig"]["text2vec-openai"]["vectorizePropertyName"] == True - if p["name"] == "url": - assert p["moduleConfig"]["text2vec-openai"]["skip"] == True - -# ===== NOT SHOWN - Delete the class ===== -client.schema.delete_class("Article") -# ===== NOT SHOWN - Delete the class ===== - -# Test -classes = [c["class"] for c in client.schema.get()["classes"]] -assert "Article" not in classes diff --git a/docs/academy/py/zero_to_mvp/103_schema_and_imports/_snippets/30_import.py b/docs/academy/py/zero_to_mvp/103_schema_and_imports/_snippets/30_import.py deleted file mode 100644 index d1a0dc602..000000000 --- a/docs/academy/py/zero_to_mvp/103_schema_and_imports/_snippets/30_import.py +++ /dev/null @@ -1,112 +0,0 @@ -# ===== Instantiate Weaviate client w/ auth config ===== -import weaviate - -client = weaviate.Client( - url="https://WEAVIATE_INSTANCE_URL", # Replace with your Weaviate endpoint - auth_client_secret=weaviate.auth.AuthApiKey(api_key="YOUR-WEAVIATE-API-KEY"), # Replace with your API Key for the Weaviate instance. Delete if authentication is disabled. 
- additional_headers={ - "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY", - }, -) - -# An example object -data_object = { - "title": "Apollo 8", - "body": "Apollo 8 (December 21–27, 1968) was...", - "url": "https://en.wikipedia.org/wiki/Apollo_8" -} -# END An example object - -batch_size = 10 -num_workers = 1 -n_retries = 3 -data_objects = [data_object] -target_class = "TestClass" - -# Batch import example -# highlight-start -with client.batch( - batch_size=batch_size, # Specify batch size - num_workers=num_workers, # Parallelize the process - dynamic=True, # Enable/Disable dynamic batch size change - timeout_retries=n_retries, # Number of retries if a timeout occurs - connection_error_retries=n_retries, # Number of retries if a connection error occurs -) as batch: -# highlight-end - for data_object in data_objects: - batch.add_data_object( - data_object, - class_name=target_class - ) -# END Batch import example - -results = client.query.get(target_class, list(data_object.keys())).do() -assert len(results["data"]["Get"][target_class]) == 1 -client.schema.delete_class(target_class) - -from weaviate.util import generate_uuid5 -uuid = generate_uuid5(data_object) -object_vector = [1, 2, 3] - -# Example with additional properties -with client.batch( - batch_size=batch_size, # Specify batch size - num_workers=num_workers, # Parallelize the process - dynamic=True, # Enable/Disable dynamic batch size change - timeout_retries=n_retries, # Number of retries if a timeout occurs - connection_error_retries=n_retries, # Number of retries if a connection error occurs -) as batch: - for data_object in data_objects: - batch.add_data_object( - data_object, - class_name=target_class, - # highlight-start - uuid=uuid, - vector=object_vector, - # highlight-end - ) -# END Example with additional properties - -results = client.query.get(target_class, list(data_object.keys())).with_additional(["id", "vector"]).do() -assert len(results["data"]["Get"][target_class]) == 1 -assert 
results["data"]["Get"][target_class][0]["_additional"]["id"] == uuid -assert results["data"]["Get"][target_class][0]["_additional"]["vector"] == object_vector -client.schema.delete_class(target_class) - -# Example with callback -# highlight-start -def check_batch_result(results: dict): - """ - Check batch results for errors. - - Parameters - ---------- - results : dict - The Weaviate batch creation return value. - """ - - if results is not None: - for result in results: - if "result" in result and "errors" in result["result"]: - if "error" in result["result"]["errors"]: - print(result["result"]) -# highlight-end - -with client.batch( - batch_size=batch_size, # Specify batch size - num_workers=num_workers, # Parallelize the process - dynamic=True, # Enable/Disable dynamic batch size change - timeout_retries=n_retries, # Number of retries if a timeout occurs - connection_error_retries=n_retries, # Number of retries if a connection error occurs - # highlight-start - callback=check_batch_result, - # highlight-end -) as batch: - for data_object in data_objects: - batch.add_data_object( - data_object, - class_name=target_class, - uuid=uuid, - vector=object_vector, - ) -# END Example with callback diff --git a/docs/academy/py/zero_to_mvp/103_schema_and_imports/_snippets/40_import_example_1.py b/docs/academy/py/zero_to_mvp/103_schema_and_imports/_snippets/40_import_example_1.py deleted file mode 100644 index dd692bf72..000000000 --- a/docs/academy/py/zero_to_mvp/103_schema_and_imports/_snippets/40_import_example_1.py +++ /dev/null @@ -1,341 +0,0 @@ -# Full code snippet -# ===== Instantiate Weaviate client w/ auth config ===== -import weaviate -from weaviate.util import generate_uuid5 -import requests -import json - -client = weaviate.Client( - url="https://WEAVIATE_INSTANCE_URL", # Replace with your Weaviate endpoint - auth_client_secret=weaviate.auth.AuthApiKey(api_key="YOUR-WEAVIATE-API-KEY"), # Replace with your Weaviate instance API key. 
Delete if authentication is disabled. - additional_headers={ - "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY", - }, -) - -# Define the class -class_obj = { - # Class & property definitions - "class": "JeopardyQuestion", - "properties": [ - { - "name": "round", - "dataType": ["text"], - # Property-level module configuration for `round` - "moduleConfig": { - "text2vec-openai": { - "skip": True, - } - }, - # End of property-level module configuration - }, - { - "name": "value", - "dataType": ["int"], - }, - { - "name": "question", - "dataType": ["text"], - }, - { - "name": "answer", - "dataType": ["text"], - }, - ], - - # Specify a vectorizer - "vectorizer": "text2vec-openai", - - # Module settings - "moduleConfig": { - "text2vec-openai": { - "vectorizeClassName": False, - "model": "ada", - "modelVersion": "002", - "type": "text" - } - }, -} -# End class definition - -client.schema.create_class(class_obj) -# Finished creating the class - -url = 'https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/jeopardy_100.json' -resp = requests.get(url) -data = json.loads(resp.text) - -# Context manager for batch import -with client.batch( - batch_size=200, # Specify batch size - num_workers=2, # Parallelize the process -) as batch: - # Build data objects & add to batch - for i, row in enumerate(data): - question_object = { - "question": row["Question"], - "answer": row["Answer"], - "value": row["Value"], - "round": row["Round"], - } - batch.add_data_object( - question_object, - class_name="JeopardyQuestion", - uuid=generate_uuid5(question_object) - ) - -# END Full code snippet - -# Test data ingestion -def test_class_addition(client_in): - class_schema = client_in.schema.get("JeopardyQuestion") - assert class_schema["class"] == "JeopardyQuestion" - assert class_schema["vectorizer"] == "text2vec-openai" - assert len(class_schema["properties"]) == 4 -test_class_addition(client) -assert 
client.query.aggregate("JeopardyQuestion").with_meta_count().do()["data"]["Aggregate"]["JeopardyQuestion"][0]["meta"]["count"] == 100 -# Cleanup -client.schema.delete_class("JeopardyQuestion") -classes = [c["class"] for c in client.schema.get()["classes"]] -assert "JeopardyQuestion" not in classes -# END Test data ingestion - -# ============================================================ -# ========== SMALLER CODE SNIPPETS =========================== -# ============================================================ - -client.schema.create_class(class_obj) - -# Retrieve "JeopardyQuestion" class schema -client.schema.get("JeopardyQuestion") -# END Retrieve "JeopardyQuestion" class schema - -''' -// RETRIEVED CLASS SCHEMA -{ - "class": "JeopardyQuestion", - "invertedIndexConfig": { - "bm25": { - "b": 0.75, - "k1": 1.2 - }, - "cleanupIntervalSeconds": 60, - "stopwords": { - "additions": null, - "preset": "en", - "removals": null - } - }, - "moduleConfig": { - "text2vec-openai": { - "model": "ada", - "modelVersion": "002", - "type": "text", - "vectorizeClassName": false - } - }, - "properties": [ - { - "dataType": [ - "text" - ], - "indexFilterable": true, - "indexSearchable": true, - "moduleConfig": { - "text2vec-openai": { - "skip": true, - "vectorizePropertyName": false - } - }, - "name": "round", - "tokenization": "word" - }, - { - "dataType": [ - "int" - ], - "indexFilterable": true, - "indexSearchable": false, - "moduleConfig": { - "text2vec-openai": { - "skip": false, - "vectorizePropertyName": false - } - }, - "name": "value" - }, - { - "dataType": [ - "text" - ], - "indexFilterable": true, - "indexSearchable": true, - "moduleConfig": { - "text2vec-openai": { - "skip": false, - "vectorizePropertyName": false - } - }, - "name": "question", - "tokenization": "word" - }, - { - "dataType": [ - "text" - ], - "indexFilterable": true, - "indexSearchable": true, - "moduleConfig": { - "text2vec-openai": { - "skip": false, - "vectorizePropertyName": false - } - }, - 
"name": "answer", - "tokenization": "word" - } - ], - "replicationConfig": { - "factor": 1 - }, - "shardingConfig": { - "virtualPerPhysical": 128, - "desiredCount": 1, - "actualCount": 1, - "desiredVirtualCount": 128, - "actualVirtualCount": 128, - "key": "_id", - "strategy": "hash", - "function": "murmur3" - }, - "vectorIndexConfig": { - "skip": false, - "cleanupIntervalSeconds": 300, - "maxConnections": 32, - "efConstruction": 128, - "ef": -1, - "dynamicEfMin": 100, - "dynamicEfMax": 500, - "dynamicEfFactor": 8, - "vectorCacheMaxObjects": 1000000000000, - "flatSearchCutoff": 40000, - "distance": "cosine", - "pq": { - "enabled": false, - "segments": 0, - "centroids": 256, - "encoder": { - "type": "kmeans", - "distribution": "log-normal" - } - } - }, - "vectorIndexType": "hnsw", - "vectorizer": "text2vec-openai" -} -// END RETRIEVED CLASS SCHEMA -''' - -# Load data -import requests -import json -url = 'https://raw.githubusercontent.com/weaviate-tutorials/edu-datasets/main/jeopardy_100.json' -resp = requests.get(url) -data = json.loads(resp.text) -# Finished loading data - -# Import data - simple version -with client.batch( - batch_size=200, # Specify batch size - num_workers=2, # Parallelize the process -) as batch: - # Build data objects & add to batch - for i, row in enumerate(data): - question_object = { - "question": row["Question"], - "answer": row["Answer"], - "value": row["Value"], - "round": row["Round"], - } - batch.add_data_object( - question_object, - class_name="JeopardyQuestion" - ) -# END Import data - simple version - -# Check object count -assert client.query.aggregate("JeopardyQuestion").with_meta_count().do()["data"]["Aggregate"]["JeopardyQuestion"][0]["meta"]["count"] == 100 -# END Check object count - -# Import data again - to demonstrate what happens if duplicated -with client.batch( - batch_size=200, # Specify batch size - num_workers=2, # Parallelize the process -) as batch: - for i, row in enumerate(data): - question_object = { - "question": 
row["Question"], - "answer": row["Answer"], - "value": row["Value"], - "round": row["Round"], - } - batch.add_data_object( - question_object, - class_name="JeopardyQuestion" - ) -# END Import data again - to demonstrate what happens if duplicated -assert client.query.aggregate("JeopardyQuestion").with_meta_count().do()["data"]["Aggregate"]["JeopardyQuestion"][0]["meta"]["count"] == 200 - -# Cleanup -client.schema.delete_class("JeopardyQuestion") -classes = [c["class"] for c in client.schema.get()["classes"]] -assert "JeopardyQuestion" not in classes - -client.schema.create_class(class_obj) -test_class_addition(client) - - -# Import data with deterministic UUIDs -from weaviate.util import generate_uuid5 - -with client.batch( - batch_size=200, # Specify batch size - num_workers=2, # Parallelize the process -) as batch: - for i, row in enumerate(data): - question_object = { - "question": row["Question"], - "answer": row["Answer"], - "value": row["Value"], - "round": row["Round"], - } - batch.add_data_object( - question_object, - class_name="JeopardyQuestion", - uuid=generate_uuid5(question_object) - ) -# END Import data with deterministic UUIDs - -# Test -assert client.query.aggregate("JeopardyQuestion").with_meta_count().do()["data"]["Aggregate"]["JeopardyQuestion"][0]["meta"]["count"] == 100 -with client.batch( - batch_size=200, # Specify batch size - num_workers=2, # Parallelize the process -) as batch: - for i, row in enumerate(data): - question_object = { - "question": row["Question"], - "answer": row["Answer"], - "value": row["Value"], - "round": row["Round"], - } - batch.add_data_object( - question_object, - class_name="JeopardyQuestion", - uuid=generate_uuid5(question_object) - ) -assert client.query.aggregate("JeopardyQuestion").with_meta_count().do()["data"]["Aggregate"]["JeopardyQuestion"][0]["meta"]["count"] == 100 - -client.schema.delete_class("JeopardyQuestion") -classes = [c["class"] for c in client.schema.get()["classes"]] -assert "JeopardyQuestion" 
not in classes diff --git a/docs/academy/py/zero_to_mvp/103_schema_and_imports/images/academy_103_import_process_conceptual.png b/docs/academy/py/zero_to_mvp/103_schema_and_imports/images/academy_103_import_process_conceptual.png deleted file mode 100644 index e303b7f79..000000000 Binary files a/docs/academy/py/zero_to_mvp/103_schema_and_imports/images/academy_103_import_process_conceptual.png and /dev/null differ diff --git a/docs/academy/py/zero_to_mvp/103_schema_and_imports/images/academy_103_import_process_conceptual_simple.png b/docs/academy/py/zero_to_mvp/103_schema_and_imports/images/academy_103_import_process_conceptual_simple.png deleted file mode 100644 index decd7d919..000000000 Binary files a/docs/academy/py/zero_to_mvp/103_schema_and_imports/images/academy_103_import_process_conceptual_simple.png and /dev/null differ diff --git a/docs/academy/py/zero_to_mvp/103_schema_and_imports/index.mdx b/docs/academy/py/zero_to_mvp/103_schema_and_imports/index.mdx deleted file mode 100644 index d9aa33c75..000000000 --- a/docs/academy/py/zero_to_mvp/103_schema_and_imports/index.mdx +++ /dev/null @@ -1,49 +0,0 @@ ---- -title: P3_103 Schema and Imports (Python) -description: Define schema and import data into Weaviate to set up your dataset structure. -sidebar_position: 103 ---- - -import ZeroToMvpDeprecationNote from '/docs/academy/py/zero_to_mvp/_snippets/deprecation.md'; - - - -## Unit overview - -import ReactPlayer from 'react-player/lazy' - - -
- - - -The previous units introduced you to the fundamentals of Weaviate, its capabilities, and how to execute basic queries on existing data. Now that you're familiar with the various query types, it's time to explore how to populate your own Weaviate instance with data. - -In this unit, we'll examine how to use Weaviate to effectively structure your data so that you can retrieve the right information the way you want. We'll delve into defining a schema for your data and importing data into Weaviate. - -By the end of this unit, Weaviate's overall data architecture will start to become clearer in your mind. This will start to empower you to build a vector database that really suits your needs and goals. - -Let's get started. - -### Prerequisites - -- (**Required**) A Python (3) environment with `weaviate-client` installed. -- (**Required**) Complete [101A Weaviate Academy Preparation](../setup.mdx) -- (*Recommended*) Complete [Hello, Weaviate](../101_hello_weaviate/index.mdx) -- (*Recommended*) Complete [Queries 1](../102_queries_1/index.mdx) - -## Learning objectives - -import LearningGoalsExp from '/src/components/Academy/learningGoalsExp.mdx'; - - - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/zero_to_mvp/104_queries_2/10_bm25.mdx b/docs/academy/py/zero_to_mvp/104_queries_2/10_bm25.mdx deleted file mode 100644 index 838a38ba5..000000000 --- a/docs/academy/py/zero_to_mvp/104_queries_2/10_bm25.mdx +++ /dev/null @@ -1,213 +0,0 @@ ---- -title: BM25 (Keyword) searches -description: Discover how to implement BM25 queries in Weaviate to optimize search results effectively. 
---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PythonCodeExample from '!!raw-loader!./_snippets/10_bm25.py'; - -import imageUrl from '../../tmp_images/academy_placeholder.jpg'; - -[//]: # (Image alt) - -## Overview - -A BM25 search is one implementation of what is commonly called a 'keyword' search. Broadly speaking, it works by matching the search terms between the query and the data objects in the index. - -## About `bm25` queries - -### How it works - -When a user submits a BM25 query, Weaviate will look for objects that contain the search terms in the text properties of the objects. Then, it will rank the results based on how many times the search terms appear in the text properties of the objects. - -In this way, a BM25 query is different to keyword-based filtering, which simply includes or excludes objects based on the provided set of conditions. - -### `bm25` query syntax - -A BM25 query is shown below. Each BM25 query: - -- **Must** include a query string, which can be any length, -- **Can optionally** include a list of `properties` to search, -- **Can optionally** include weights for each searched property, and -- **Can optionally** request a `score` for each result. - - - - - - - -The above query will return the top 3 objects based on its BM25F score, based on the query string `"food"`. The query will search the `question` and `answer` properties of the objects, from which `question` property will be boosted by a factor of 3. - -
- See the JSON response - - - -
- -:::tip Exercise -Try varying the boost factor, and the query string. What happens to the results? -::: - -## Tokenization and `bm25` searches - -### Why tokenization matters - -In an earlier unit, we briefly discussed [the inverted index](../103_schema_and_imports/10_data_structure.mdx#-inverted-index), and that it stores a "tokenized" index of data. - -When a BM25 query is submitted, Weaviate will search each property according to its tokenization property. For example, if a property is tokenized with the `word` tokenization option, it will tokenize the query string into its constituent, lowercase, words, and search for each word in the index. On the other hand, if a property uses a `field` tokenization, Weaviate will look for the entire query string in the index. - -:::tip This applies to the inverted index only -This is different to any tokenization in the context of, for example, language models or vectorization models. Tokenization in the context of the current section only applies to the inverted index. -::: - -More concretely, let's take a look at some examples. - -### `word` tokenization - -In this example, we search through the `question` property with the query string `Jeopardy`. The `question` property is tokenized with the `word` tokenization option. - - - - - - - -The `word` tokenization keeps alpha-numeric characters in lowercase, and splits them by whitespace. Accordingly, the search results include those where the `question` property contains the string `Jeopardy!`, which is the title of the TV show. - -
- See the JSON response - - - -
- -Now, let's take a look at the same query, but with the `field` tokenization option. - -### `field` tokenization - -In this example, the query string remains the same (`Jeopardy`), however we are now searching the `round` property, which is tokenized with the `field` tokenization option. - - - - - - - -The `field` tokenization trims whitespace characters and then keeps the entire string as is. Accordingly, the search does not return any results, even though we know that `round` values include those such as `Jeopardy!` and `Double Jeopardy!`. - -
- See the JSON response - - - -
- -:::tip Exercise -Try changing the query to `Jeopardy!`. What happens to the results? -::: - -### Rules of thumb - -The [full list of tokenization options](/weaviate/config-refs/collections.mdx#tokenization) are `word`, `whitespace`, `lowercase` and `field`. A rule of thumb on when to use each option is to use `word` for long text where you want to retrieve partial matches, and `field` for short text where you only want to retrieve exact matches. The others are somewhere in between, and may be useful in specific situations, where for example you want case to matter (`whitespace`) or special characters to be respected (`lowercase`). - -## BM25F scoring - -The exact algorithm used for scoring and ranking the results is called the BM25F algorithm. The details are beyond the scope of this course, but the gist is that the BM25F algorithm is a variant of the BM25 algorithm, where the `F` stands for "field". It is used to score and rank results based on the fields that are searched. - -If you would like to delve into the details of the exact algorithm, you can review [this Wikipedia page](https://en.wikipedia.org/wiki/Okapi_BM25#Modifications). - -## Review - - - - - -### Key takeaways - -- BM25 search matches search terms between the query and data objects in the index and ranks results based on the frequency of those terms. -- A BM25 query must include a query string, and can optionally include a list of properties to search, weights for each searched property, and a request for a score for each result. -- BM25 queries are impacted by the tokenization of the properties being searched; for instance, `word` tokenization splits the query string into lowercase words and `field` tokenization searches for the entire query string. -- Consider your search use case for picking tokenization options. For example, use `word` for long text with partial matches, and `field` for short text with exact matches. 
-- BM25F scoring, where 'F' stands for 'field', is used to score and rank the search results based on the fields that are searched. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - -import Quiz from '/src/components/Academy/quiz.js' -export const Bm25Question = [{ - questionText: 'What does a BM25 search do?', - answerOptions: [ - { - answerText: 'Matches search terms between the query and data objects in the index and ranks results based on the frequency of those terms.', - isCorrect: true, - feedback: 'It is a keyword search with ranking based on term frequency.', - }, - { - answerText: 'Excludes objects based on the provided set of conditions.', - isCorrect: false, - feedback: 'This describes a filter.', - }, - { - answerText: 'Searches for exact matches of the entire query string in the index.', - isCorrect: false, - feedback: 'This is describing a form of tokenization (field tokenization), not BM25 search.', - }, - ] -}]; -export const wordTokenizationQuestion = [{ - questionText: 'What does the `word` tokenization option do?', - answerOptions: [ - { - answerText: 'Lowercases the query string and splits it by whitespace.', - isCorrect: false, - feedback: 'This is only partially true.', - }, - { - answerText: 'Indexes each string as-is.', - isCorrect: false, - feedback: 'This is the `field` tokenization.', - }, - { - answerText: 'Lowercases the query string, keeps alpha-numeric characters and splits it by whitespace.', - isCorrect: true, - feedback: 'Understanding different tokenization options and their impact can be very useful.', - }, - ] -}]; diff --git a/docs/academy/py/zero_to_mvp/104_queries_2/20_hybrid.mdx b/docs/academy/py/zero_to_mvp/104_queries_2/20_hybrid.mdx deleted file mode 100644 index 5435b8588..000000000 --- a/docs/academy/py/zero_to_mvp/104_queries_2/20_hybrid.mdx +++ /dev/null @@ -1,148 +0,0 @@ ---- -title: Hybrid searches -description: Combine vector and keyword search strategies for precision 
with hybrid queries in Weaviate. ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PythonCodeExample from '!!raw-loader!./_snippets/20_hybrid.py'; - - -import imageUrl from '../../tmp_images/academy_placeholder.jpg'; - - - -[//]: # (Image alt) - -## Overview - -A hybrid search combines `bm25` searches that you just learned about with a vector search, producing rankings from a combination of the two results. - -This can produce helpful results when a vector search or a keyword search alone is not producing desired results. For example, it may be useful when a vector search alone is producing too many irrelevant results, and you want particular keywords to weight the results a certain way. - -## About `hybrid` queries - -### How it works - -A hybrid search works by combining the results of a `bm25` search with the results of a vector search. More specifically, it uses a combination of each result's BM25F search ranking and its vector search ranking among the set of results. - -The sum of the inverse of the BM25F ranking and the vector search ranking is used to produce a final score for each result, with any weighting (`alpha`) applied if applicable. The final score is then used to rank the results. - -This has the effect of rewarding results that score high in at least one of the searches. For example, take the following five results: - -- Result 1: BM25F ranking = 5, vector search ranking = 1 -> Total score: 1.2 -- Result 2: BM25F ranking = 4, vector search ranking = 2 -> Total score: 0.75 -- Result 3: BM25F ranking = 3, vector search ranking = 3 -> Total score: 0.67 -- Result 4: BM25F ranking = 2, vector search ranking = 4 -> Total score: 0.75 -- Result 5: BM25F ranking = 1, vector search ranking = 5 -> Total score: 1.2 - -In this example, results 1 and 5 end up being the top results, because they scored high in at least one of the searches. 
On the other hand, result 3, which was middle-of-the-pack in both searches, ends up being the lowest-ranked result. - -So, hybrid search will bring to the top results that score high in at least one of the searches, while middling results will end up in the lower end of the re-ranking. - -### `hybrid` query syntax - -A hybrid query is shown below. Each hybrid query: - -- **Must** include a query string, which can be any length, -- **Can optionally** include a list of `properties` to search, -- **Can optionally** include an `alpha` value, -- **Can optionally** include a `vector` to search for, -- **Can optionally** request a `score` and an `explainScore` value for each result. - - - - - - - -The above query will return the top 3 objects based on its BM25F score and `nearText` similarity, based on the query string `"food"`. The query will search the `question` and `answer` properties of the objects for the BM25F score (while the object vectors remain unaffected by the `properties` selection). - -
- See the JSON response - - - -
- -## `hybrid` search parameters - -A `hybrid` search includes multiple parameters, some of which you may be familiar with from the earlier `bm25` search discussions. - -The `query` parameter and `properties` parameter are the same as in a `bm25` search, with the exception that currently, the boost parameter is not supported in a `hybrid` search. Some of the parameters, however, are unique to a `hybrid` search. - -[//]: # (### `vector`) - -[//]: # () -[//]: # (The `vector` parameter is optional. If you do not include a `vector` parameter, the `hybrid` search will generate a vector from the query string. If you do include a `vector` parameter, the `hybrid` search will use the vector you provide.) - -[//]: # () -[//]: # (In this way, you may be able to perform a hybrid search where the `bm25` search and the vector search are based on different concepts. For example, you could perform a `bm25` search with the query string `italian`, and have the vector search be based on a vector of `food`.) - -[//]: # () -[//]: # (:::warning) - -[//]: # (TODO - complete this section after I get responses on the slack comment/question) - -[//]: # (:::) - -### `alpha` - -The optional `alpha` parameter determines the weighting of the BM25 search ranking and the vector search ranking. If you do not include an `alpha` parameter, the `hybrid` search will use a default value of `0.5`, which weights each equally. - -Otherwise, an `alpha` value of 1 is the same as a pure vector search, whereas an `alpha` value of 0 is the same as a pure BM25 search. - -:::tip Exercise -Try varying the `alpha` parameter above. What happens to the results? -::: - -## Review - - - -### Review exercise - -### Key takeaways - -- A hybrid search combines `bm25` search with vector search, producing rankings from a combination of the two results. -- Hybrid search is helpful when a vector search or a keyword search alone is not producing desired results. 
-- Hybrid search orders its search results by summing the inverse of the vector and `bm25` rankings. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - -import Quiz from '/src/components/Academy/quiz.js' -export const hybridRankingQuestion = [{ - questionText: 'How do hybrid searches order its search results?', - answerOptions: [ - { - answerText: 'By multiplying the vector similarity with the BM25 score', - isCorrect: false, - feedback: 'It does not do that, unfortunately.', - }, - { - answerText: 'By averaging the vector and BM25 search rankings', - isCorrect: false, - feedback: 'It does not do that, unfortunately.', - }, - { - answerText: 'By summing the inverse of the vector and BM25 rankings', - isCorrect: true, - feedback: 'So it has the effect of rewarding results that score high in at least one of the searches.', - }, - ] -}]; diff --git a/docs/academy/py/zero_to_mvp/104_queries_2/30_generative.mdx b/docs/academy/py/zero_to_mvp/104_queries_2/30_generative.mdx deleted file mode 100644 index 6ccf196f8..000000000 --- a/docs/academy/py/zero_to_mvp/104_queries_2/30_generative.mdx +++ /dev/null @@ -1,242 +0,0 @@ ---- -title: Generative searches -description: Learn to use generative AI for crafting advanced queries with Weaviate's Python SDK. ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PythonCodeExample from '!!raw-loader!./_snippets/30_generative.py'; - -import imageUrl from '../../tmp_images/academy_placeholder.jpg'; - -[//]: # (Image alt) - -## Overview - -The core concept behind generative search is that instead of just simply fetching data from the database, Weaviate can transform the data before delivering it to you. - -This ability to transform data makes generative search a powerful tool that can transform your relationship with data. 
Instead of a database simply being a store of information, it can work with your data to deliver outputs based on the data and a set of instructions. - -## Configuration - -### Enable modules - -To use generative search, a `generative-xxx` module must be enabled in the Weaviate instance. - -If you are using WCD, generative modules are enabled by default ([see docs](/cloud)). Otherwise, you must configure your Weaviate instance to make sure that a generative module is enabled. - -### Configure classes - -If only one generative module is enabled for the Weaviate instance, Weaviate will automatically use that module for all generative tasks. - -On the other hand, if multiple generative modules are configured, you must define for each class which generative model to use, such as shown below. - -```json -{ - "classes": [ - { - "class": "JeopardyQuestion", - "moduleConfig": { - "generative-openai": { - "model": "gpt-3.5-turbo" - } - } - } - ] -} -``` - -## About generative searches - -### How it works - -A generative search can be best thought of as two steps that are conveniently combined to one process. The two steps are to: -1. Perform a search; and then -1. Call a generative model using the search results and a user-provided prompt. - -All generated outputs are then added to the search results before being returned to the user. - -There are two different types of generative searches available, which are `single prompt` and `grouped task`. - -- A `single prompt` search generates **a response for each result** in the results set, using each corresponding result with the user-provided prompt. -- A `grouped task` search generates **one response for the whole result set**, by using the entire results set with the user-provided prompt. - -### Generative search syntax - -A generative search adds a step to the search process. Accordingly, the syntax for a generative search requires specifying the prompt type (`single prompt` or `grouped task`) as well as a search query. 
- -#### Single prompt - -To carry out a single prompt search, you must provide a prompt that contains at least one object property. The provided properties will be populated by Weaviate based on the search results. - -A single prompt search can be carried out as follows: - - - - - - - -The above query searches for top 3 `Article` objects using a `nearText` similarity search. Then, Weaviate sends each result to a generative model with the provided prompt, whose outputs are returned to the user. - -A `single prompt` search will generate an output for each object found in search. Accordingly, each object in the response will include the generated output in the `_additional` properties set. - -
- See the JSON response - - - -
- -#### Grouped Task - -A grouped task search does not require any properties to be specified. It can be carried out as follows: - - - - - - - -The above query searches for top 3 `Article` objects using a `nearText` similarity search, just as we did above. In this case, however, the search results are concatenated and sent to the generative model along with the user-provided prompt. - -In other words, each `grouped task` search will invoke one generative output per task. - -Thus, a `grouped task` search will generate only one output for the entire task. The generated output is returned as a part of the first object in the `_additional` properties set. - -
- See the JSON response - - - -
- -## Object properties - -You already saw that `single prompt` generative searches require properties to be specified. You can also specify specific properties to be used with `grouped task` searches. - -### Why specify properties? - -In `single prompt` examples, each property serves to form the model prompt, replacing placeholder text like (`{summary}`) with the retrieve summary text. - -You can also specify the properties to be used for each `grouped task`, so that the generative model only receives the data you want to pass to it. - -#### Context window length - -Another reason for specifying the properties is to reduce the chance of exceeding the model context length. Generative models are typically transformer-based, and many have a limited context window. - -As a result, only passing the required properties to the generative module may allow you to include results from more objects without exceeding the context window limit. - -### How to specify properties - -:::tip Generative search properties unrelated to returned properties -In both `single prompt` and `grouped task` searches, the properties to be used in the generative search do not need to be specified in the properties to be returned. -::: - -You saw earlier that for `single prompt` searches, you can specify in the prompt the specific properties to be used. - -For `grouped task` searches, the properties to be used must be passed as an additional parameter. Take a look at the following example, where we ask the generative model to simply repeat the provided prompt: - - - - - - - - -
- See the JSON response - - - -
- -## Generative search parameters - -### Optional model parameters - -Generative modules use what are called "large language models", or LLMs, to produce these outputs from search results and prompts. - -Weaviate allows you to tune the behavior of these models through additional, optional parameters made available by their providers. - -While we cannot cover every single parameter, some groups of commonly available parameters are described below. - -:::tip Parameter names will vary -Each model will use different parameter names, so you should consult the exact module documentation and the corresponding model documentation. -::: - -- `model`: Determines the actual language model to be used. -- `temperature`, `k` and `p`: Determine how "random" the model will behave. At one extreme, the model will behave deterministically, whereas at the other extreme, it will produce more unpredictable (potentially incoherent) outputs. Some providers such as OpenAI recommend using only one of these settings. -- `max_tokens`: Determines the maximum length of a generated output. The longer the value, the more likely you are to potentially exceed the context limit in combination with the input length. -- `xxx_penalty`: Determines how much to penalise certain aspects such as the same tokens appearing again, or the number of times that the same tokens appear. - -This example specifies various parameters for the `generative-openai` module: - -```json -{ - "classes": [ - { - "class": "JeopardyQuestion", - "moduleConfig": { - "generative-openai": { - "model": "gpt-3.5-turbo", - "temperatureProperty": 0.3, - "maxTokensProperty": 512, - "frequencyPenaltyProperty": 0.1, - "presencePenaltyProperty": -0.1 - } - } - } - ] -} -``` - -As a starting point, we recommend that you try using the default options, including the model, if possible. Then if something is not working to your satisfaction, you could try a different model parameter or approach. 
- -## Review - -### Key takeaways - -- Generative search transforms data before delivery, turning a database into a more active participant in data processing. -- To use generative search, a generative module must be enabled in the Weaviate instance; generative modules are enabled by default in WCD. -- A generative search involves performing a search and then calling a generative model using the search results and a user-provided prompt. -- Single prompt search generates a response for each result, while grouped task search generates one response for the whole result set. -- Object properties are used in generative searches; they form the model prompt in single prompt examples and can be specified in grouped tasks. -- Generative modules use large language models (LLMs), and Weaviate exposes optional parameters for tuning their behavior. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/zero_to_mvp/104_queries_2/90_wrap_up.mdx b/docs/academy/py/zero_to_mvp/104_queries_2/90_wrap_up.mdx deleted file mode 100644 index 05abd5bd6..000000000 --- a/docs/academy/py/zero_to_mvp/104_queries_2/90_wrap_up.mdx +++ /dev/null @@ -1,26 +0,0 @@ ---- -title: Wrap-up -description: Query Wrap-up - Part 2 ---- - -## Unit review - -In this unit, you learned about additional types of queries, including keyword (BM25), hybrid, generative and QnA searches. Combined with the tools that you learned earlier in [Queries 1](../102_queries_1/index.mdx), you are now armed with a variety of search tools. - -Each of these tools allow you to interrogate the data differently. Being aware of how each type works will allow you to formulate the right query for your tasks. - -BM25 searches for example can be used for precise keyword searches, and a hybrid search allows you to combine results of both BM25 and vector searches. Meanwhile, generative searches and QnA searches transform your data before it is delivered to the user. 
These searches leverage the power of language models, bringing your data to life and allowing Weaviate to go further than simple data retrieval. - -### Learning outcomes - -Having finished this unit, you should be able to: -- Perform BM25 and hybrid searches. -- Differentiate between vector, BM25 and hybrid searches. -- Transform data before delivery with generative searches. -- Extract answers from data with QnA searches. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/zero_to_mvp/104_queries_2/_40_qna.mdx b/docs/academy/py/zero_to_mvp/104_queries_2/_40_qna.mdx deleted file mode 100644 index eab2a6391..000000000 --- a/docs/academy/py/zero_to_mvp/104_queries_2/_40_qna.mdx +++ /dev/null @@ -1,152 +0,0 @@ ---- -title: Question and Answering (QnA) ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; -import PythonCodeExample from '!!raw-loader!./_snippets/40_qna.py'; - -import imageUrl from '../../tmp_images/academy_placeholder.jpg'; - -[//]: # (Image alt) - -## Overview - -A question and answering (QnA) module does exactly what its name suggests. It will answer a question from you, based on the data present in the search results. - -This is a more specific type of data transformation than one performed by a generative module. A QnA module looks to extract an answer to the question from the search data. - -To this end, a QnA module may not provide an answer at all. - -## Configuration - -### Enable modules - -To use QnA functionality, a `qna-xxx` module must be enabled in the Weaviate instance. - -If you are using WCD, a `qna` module is enabled by default ([see the documentation](/cloud)). Otherwise, you must configure your Weaviate instance to make sure that a `qna-xxx` module is enabled. 
- -This is outside the scope of this unit, but you can refer to the [module configuration](/weaviate/modules/reader-generator-modules/index.md) for information on how to configure each module. - -### Configure classes - -If only one qna module is enabled for the Weaviate instance, Weaviate will automatically use that module for all qna tasks. - -On the other hand, if multiple qna modules are configured, you must define for each class which qna model to use, such as shown below. - -```json -{ - "classes": [ - { - "class": "JeopardyQuestion", - "moduleConfig": { - "qna-openai": { - "model": "gpt-3.5-turbo-instruct" - } - } - } - ] -} -``` - -Use of QnA functionality also requires that the target class be configured with a vectorizer (`text2vec`) module. - -## About QnA queries - -### How it works - -Similarly to generative modules, a QnA search involves two steps, which are to: -1. Perform a search; and then -1. Attempt to extract an answer using the search results and the question. - -Depending on whether a suitable answer was found, the `answer` sub-property under `_additional` properties may contain the answer. Some models will also return the position of the answer in the text. - -If an answer is not found, the `answer` sub-property will indicate so. - -### QnA syntax - -In a generative search, the same question text is used to both: -- Perform the search, and -- Extract the answer. - -So, in the example below, the query `"How many championships does Lewis Hamilton have?"` is used to find the nearest object, from which Weaviate attempts to extract an answer to the question. - - - - - - - -In this query, the article includes an answer, and you can see that it has been extracted by the model. - -
- See the JSON response - - - -
- -### How to specify properties - -You can specify the object properties in which the QnA module is to search through for the answer. Similarly to the case of a generative query, this may be useful if you want to reduce the length of the input, or be very specific about where the information should come from. - -:::note `nearText` search unaffected -Specifying properties to search only affects the answer extraction part of the query, as the underlying object vectors do not change. -::: - - - - - - - -And in this query, Weaviate is not able to answer the question as the required information is not available in the `title` field. - -
- See the JSON response - - - -
- -### Object limits - -As this is also a two-step search, you can specify the number of objects for Weaviate to initially retrieve before attempting to extract the answer. - -By setting the number of objects, you may increase the chance of retrieving the object that contains the answer to the specific question. - -## Review - -### Key takeaways - -- Question and Answer (QnA) search is another two-step search, which attempts to extract an answer to a specific question from the retrieved data before delivery. -- To perform a QnA search, a `qna-xxx` module must be enabled in the Weaviate instance. The `qna-openai` module is enabled by default in WCD. -- The QnA module will look for an answer in each retrieved object, returning the answer as an additional property. -- If the QnA module does not identify an answer, it will indicate so in the response. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/zero_to_mvp/104_queries_2/_snippets/10_bm25.py b/docs/academy/py/zero_to_mvp/104_queries_2/_snippets/10_bm25.py deleted file mode 100644 index 264330d35..000000000 --- a/docs/academy/py/zero_to_mvp/104_queries_2/_snippets/10_bm25.py +++ /dev/null @@ -1,152 +0,0 @@ -# ===== Instantiate Weaviate client w/ auth config ===== -import weaviate -import json - -client = weaviate.Client( - url="https://WEAVIATE_INSTANCE_URL", # Replace with your Weaviate endpoint - auth_client_secret=weaviate.auth.AuthApiKey(api_key="YOUR-WEAVIATE-API-KEY"), # Replace with your API Key for the Weaviate instance. Delete if authentication is disabled. 
- additional_headers={ - "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY", - }, -) - -# ===== Generic BM25 Query ===== - -# GenericBM25Query -response = ( - client.query - .get("JeopardyQuestion", ["question", "answer"]) - # highlight-start - .with_bm25( - query="food", # Query string - properties=["question^2", "answer"] # Searched properties, including boost for `question` - ) - .with_additional("score") # Include score in the response - # highlight-end - .with_limit(3) - .do() - ) - -print(json.dumps(response, indent=2)) -# END GenericBM25Query - -# Tests -assert "JeopardyQuestion" in response["data"]["Get"] -assert len(response["data"]["Get"]["JeopardyQuestion"]) == 3 -assert response["data"]["Get"]["JeopardyQuestion"][0].keys() == {"question", "answer", "_additional"} -assert response["data"]["Get"]["JeopardyQuestion"][0]["_additional"].keys() == {"score"} -# End test - - -expected_response = ( -# Expected GenericBM25Query results -{ - "data": { - "Get": { - "JeopardyQuestion": [ - { - "_additional": { - "score": "4.0038033" - }, - "answer": "cake", - "question": "Devil's food & angel food are types of this dessert" - }, - { - "_additional": { - "score": "3.8706005" - }, - "answer": "a closer grocer", - "question": "A nearer food merchant" - }, - { - "_additional": { - "score": "3.2457707" - }, - "answer": "food stores (supermarkets)", - "question": "This type of retail store sells more shampoo & makeup than any other" - } - ] - } - } -} -# END Expected GenericBM25Query results -) - -# ===== BM25 - `word` Tokenization Example ===== - -# BM25WithWordTokenization -response = ( - client.query - .get( - class_name="JeopardyQuestion", - properties=["question", "round"] - ) - .with_bm25( - "Jeopardy", - properties=["question"] - ) - .with_limit(2) - .do() -) - -print(json.dumps(response, indent=2)) -# END BM25WithWordTokenization - -expected_response = ( -# Expected BM25WithWordTokenization results -{ - "data": { - "Get": { - "JeopardyQuestion": [ - { - "question": 
"Capistrano swallows, Undeliverable mail, \"Jeopardy!\" champs", - "round": "Jeopardy!" - }, - { - "question": "This opera star & \"Celebrity Jeopardy!\" contestant began life as Belle Silverman", - "round": "Double Jeopardy!" - } - ] - } - } -} -# END Expected BM25WithWordTokenization results -) - -assert response == expected_response - - -# ===== BM25 - `field` Tokenization Example ===== - -# BM25WithFieldTokenization -response = ( - client.query - .get( - class_name="JeopardyQuestion", - properties=["question", "round"] - ) - .with_bm25( - "Jeopardy", - properties=["round"] - ) - .with_limit(2) - .do() -) - -print(json.dumps(response, indent=2)) -# END BM25WithFieldTokenization - -expected_response = ( -# Expected BM25WithFieldTokenization results -{ - "data": { - "Get": { - "JeopardyQuestion": [] - } - } -} -# END Expected BM25WithFieldTokenization results -) - -assert response == expected_response - diff --git a/docs/academy/py/zero_to_mvp/104_queries_2/_snippets/20_hybrid.py b/docs/academy/py/zero_to_mvp/104_queries_2/_snippets/20_hybrid.py deleted file mode 100644 index 9bb12310c..000000000 --- a/docs/academy/py/zero_to_mvp/104_queries_2/_snippets/20_hybrid.py +++ /dev/null @@ -1,77 +0,0 @@ -# ===== Instantiate Weaviate client w/ auth config ===== -import weaviate -import json - -client = weaviate.Client( - url="https://WEAVIATE_INSTANCE_URL", # Replace with your Weaviate endpoint - auth_client_secret=weaviate.auth.AuthApiKey(api_key="YOUR-WEAVIATE-API-KEY"), # Replace with your API Key for the Weaviate instance. Delete if authentication is disabled. 
- additional_headers={ - "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY", - }, -) - -# ===== Generic Hybrid Query ===== - -# GenericHybridQuery -response = ( - client.query - .get("JeopardyQuestion", ["question", "answer"]) - # highlight-start - .with_hybrid( - query="food", # Query string - properties=["question", "answer"], # Searched properties - vector=None # Manually provide a vector; if not, Weaviate will vectorize the query string - ) - .with_additional(["score", "explainScore"]) # Include score & explainScore in the response - # highlight-end - .with_limit(3) - .do() - ) - -print(json.dumps(response, indent=2)) -# END GenericHybridQuery - -# Tests -assert "JeopardyQuestion" in response["data"]["Get"] -assert len(response["data"]["Get"]["JeopardyQuestion"]) == 3 -assert response["data"]["Get"]["JeopardyQuestion"][0].keys() == {"question", "answer", "_additional"} -assert response["data"]["Get"]["JeopardyQuestion"][0]["_additional"].keys() == {"score", "explainScore"} -# End test - - -expected_response = ( -# Expected GenericHybridQuery results -{ - "data": { - "Get": { - "JeopardyQuestion": [ - { - "_additional": { - "explainScore": "(bm25)\n(hybrid) Document df958a90-c3ad-5fde-9122-cd777c22da6c contributed 0.003968253968253968 to the score\n(hybrid) Document df958a90-c3ad-5fde-9122-cd777c22da6c contributed 0.012295081967213115 to the score", - "score": "0.016263336" - }, - "answer": "a closer grocer", - "question": "A nearer food merchant" - }, - { - "_additional": { - "explainScore": "(vector) [0.022335753 -0.027532013 -0.0061008437 0.0023294748 -0.00041679747 -0.007862403 -0.018513374 -0.037407625 -0.004291675 -0.012575763]... 
\n(hybrid) Document ec776112-e651-519d-afd1-b48e6237bbcb contributed 0.012096774193548387 to the score", - "score": "0.012096774" - }, - "answer": "Famine", - "question": "From the Latin for \"hunger\", it's a period when food is extremely scarce" - }, - { - "_additional": { - "explainScore": "(vector) [0.022335753 -0.027532013 -0.0061008437 0.0023294748 -0.00041679747 -0.007862403 -0.018513374 -0.037407625 -0.004291675 -0.012575763]... \n(hybrid) Document 98807640-cd16-507d-86a1-801902d784de contributed 0.011904761904761904 to the score", - "score": "0.011904762" - }, - "answer": "Tofu", - "question": "A popular health food, this soybean curd is used to make a variety of dishes & an ice cream substitute" - } - ] - } - } -} -# END Expected GenericHybridQuery results -) diff --git a/docs/academy/py/zero_to_mvp/104_queries_2/_snippets/30_generative.py b/docs/academy/py/zero_to_mvp/104_queries_2/_snippets/30_generative.py deleted file mode 100644 index 49bc178fb..000000000 --- a/docs/academy/py/zero_to_mvp/104_queries_2/_snippets/30_generative.py +++ /dev/null @@ -1,207 +0,0 @@ -# ===== Instantiate Weaviate client w/ auth config ===== -import weaviate -import json - -client = weaviate.Client( - url="https://WEAVIATE_INSTANCE_URL", # Replace with your Weaviate endpoint - auth_client_secret=weaviate.auth.AuthApiKey(api_key="YOUR-WEAVIATE-API-KEY"), # Replace with your API Key for the Weaviate instance. Delete if authentication is disabled. 
- additional_headers={ - "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY", - }, -) - -# ===== Basic Single Prompt Generative Query ===== - -# SinglePromptQuery -response = ( - client.query - .get("Article", ["title", "summary"]) - .with_near_text({"concepts": ["housing prices"]}) - .with_limit(3) - # highlight-start - .with_generate( - single_prompt="Provide a two-bullet point summary of the article, whose title is {title} and body {summary}" - ) - # highlight-end - .do() -) - -print(json.dumps(response, indent=2)) -# END SinglePromptQuery - -# Tests -assert "Article" in response["data"]["Get"] -assert len(response["data"]["Get"]["Article"]) == 3 -assert response["data"]["Get"]["Article"][0].keys() == {"title", "summary", "_additional"} -assert response["data"]["Get"]["Article"][0]["_additional"].keys() == {"generate"} -assert "singleResult" in response["data"]["Get"]["Article"][0]["_additional"]["generate"].keys() -# End test - -expected_response = """ -# Expected SinglePromptQuery results -{ - "data": { - "Get": { - "Article": [ - { - "_additional": { - "generate": { - "error": null, - "singleResult": "- Real house prices have risen by 5% on average in the latest 12-month period among the 25 countries tracked by The Economist, the quickest in over a decade.\n- House prices in suburban locations are rising faster than in cities due to the expectation that commuting may no longer be daily, reversing a decade-long trend." - } - }, - "summary": "As restrictions have eased, house prices have started to go through the roof. Among the 25 countries that The Economist tracks, real house prices rose by 5% on average in the latest 12-month period, the quickest in over a decade. The expectation that commuting may no longer be daily has caused house prices in suburban locations to rise faster than in cities\u2014reversing a decade-long trend. One reason is that house prices do not look as if they threaten financial stability. 
If house prices in America fell by one-quarter, its 33 biggest banks would still have 50% more capital than they had going into the crisis of 2007-09.", - "title": "House prices are going ballistic" - }, - { - "_additional": { - "generate": { - "error": null, - "singleResult": "- Politicians are taking action to address housing shortages, and consultants in Auckland have detected a genuine interest in boosting housing supply.\n- Evidence suggests that autocratic planning systems may be more effective at increasing housing supply, as seen in Switzerland where house prices have risen less than in any other rich country." - } - }, - "summary": "Consultants to the government in Auckland detect a genuine interest in boosting housing supply. The part of the country with the most elastic housing supply, Pine Bluff, a midsized city in Arkansas, has an average house price of $90,000. Some evidence seems to back up the view that economists\u2019 obsession with housing supply is misguided. Autocratic planning systems do a better job of boosting housing supply. In the past century Swiss house prices have risen by less than those in any other rich country.", - "title": "Supply - Politicians are finally doing something about housing shortages" - }, - { - "_additional": { - "generate": { - "error": null, - "singleResult": "- American house prices rose by 11% in the year to January, the fastest pace for 15 years, and real house prices have risen by an average of 5% across the 25 countries tracked by The Economist.\n- House prices outside Germany's seven biggest cities rose by 11% last year, compared with 6% within them, while house prices in central London and Sydney rose by just 4% and 3% last year, respectively, and those in Manhattan fell by 4%." - } - }, - "summary": "American house prices rose by 11% in the year to January, the fastest pace for 15 years. 
Across the 25 countries tracked by The Economist, real house prices have risen by an average of 5% in the latest 12-month period. At first glance, the robustness of house prices in the face of the economic turmoil inflicted by covid-19 might seem baffling: property prices typically move in tandem with the economy. House prices outside Germany\u2019s seven biggest cities rose by 11% last year, compared with 6% within them. By contrast, house prices in central London and Sydney rose by just 4% and 3% last year, respectively; those in Manhattan fell by 4%.", - "title": "House prices in the rich world are booming" - } - ] - } - } -} -# END Expected SinglePromptQuery results -""" - -# ===== Basic Grouped Generative Query ===== - -# GroupedTaskQuery -response = ( - client.query - .get("Article", ["title", "summary"]) - .with_near_text({"concepts": ["housing prices"]}) - .with_limit(3) - # highlight-start - .with_generate( - grouped_task="Provide any common threads between these articles, if any" - ) - # highlight-end - .do() -) - -print(json.dumps(response, indent=2)) -# END GroupedTaskQuery - -# Tests -assert "Article" in response["data"]["Get"] -assert len(response["data"]["Get"]["Article"]) == 3 -assert response["data"]["Get"]["Article"][0].keys() == {"title", "summary", "_additional"} -assert response["data"]["Get"]["Article"][0]["_additional"].keys() == {"generate"} -assert "groupedResult" in response["data"]["Get"]["Article"][0]["_additional"]["generate"].keys() -# End test - -expected_response = """ -# Expected GroupedTaskQuery results -{ - "data": { - "Get": { - "Article": [ - { - "_additional": { - "generate": { - "error": null, - "groupedResult": "All three articles discuss the recent rise in house prices in various countries, with a focus on the impact of the COVID-19 pandemic on the housing market. The articles also touch on the factors driving the increase in prices, such as changes in commuting patterns and supply shortages. 
Additionally, the articles mention the potential risks and concerns associated with the rapid rise in house prices, including the threat to financial stability and the impact on affordability for buyers." - } - }, - "summary": "As restrictions have eased, house prices have started to go through the roof. Among the 25 countries that The Economist tracks, real house prices rose by 5% on average in the latest 12-month period, the quickest in over a decade. The expectation that commuting may no longer be daily has caused house prices in suburban locations to rise faster than in cities\u2014reversing a decade-long trend. One reason is that house prices do not look as if they threaten financial stability. If house prices in America fell by one-quarter, its 33 biggest banks would still have 50% more capital than they had going into the crisis of 2007-09.", - "title": "House prices are going ballistic" - }, - { - "_additional": { - "generate": null - }, - "summary": "Consultants to the government in Auckland detect a genuine interest in boosting housing supply. The part of the country with the most elastic housing supply, Pine Bluff, a midsized city in Arkansas, has an average house price of $90,000. Some evidence seems to back up the view that economists\u2019 obsession with housing supply is misguided. Autocratic planning systems do a better job of boosting housing supply. In the past century Swiss house prices have risen by less than those in any other rich country.", - "title": "Supply - Politicians are finally doing something about housing shortages" - }, - { - "_additional": { - "generate": null - }, - "summary": "American house prices rose by 11% in the year to January, the fastest pace for 15 years. Across the 25 countries tracked by The Economist, real house prices have risen by an average of 5% in the latest 12-month period. 
At first glance, the robustness of house prices in the face of the economic turmoil inflicted by covid-19 might seem baffling: property prices typically move in tandem with the economy. House prices outside Germany\u2019s seven biggest cities rose by 11% last year, compared with 6% within them. By contrast, house prices in central London and Sydney rose by just 4% and 3% last year, respectively; those in Manhattan fell by 4%.", - "title": "House prices in the rich world are booming" - } - ] - } - } -} -# END Expected GroupedTaskQuery results -""" - -# ===== Grouped Generative Query w/ Specific properties ===== - -# GroupedTaskWithProperties -response = ( - client.query - .get("Article", ["title"]) - .with_near_text({"concepts": ["housing prices"]}) - .with_limit(3) - # highlight-start - .with_generate( - grouped_task="Repeat the provided prompt, exactly", - grouped_properties=["title"] - ) - # highlight-end - .do() -) - -print(json.dumps(response, indent=2)) -# END GroupedTaskWithProperties - -# Tests -assert "Article" in response["data"]["Get"] -assert len(response["data"]["Get"]["Article"]) == 3 -assert response["data"]["Get"]["Article"][0].keys() == {"title", "_additional"} -assert response["data"]["Get"]["Article"][0]["_additional"].keys() == {"generate"} -assert "groupedResult" in response["data"]["Get"]["Article"][0]["_additional"]["generate"].keys() -# End test - -expected_response = """ -# Expected GroupedTaskWithProperties results -{ - "data": { - "Get": { - "Article": [ - { - "_additional": { - "generate": { - "error": null, - "groupedResult": "[{\"title\":\"House prices are going ballistic\"},{\"title\":\"Supply - Politicians are finally doing something about housing shortages\"},{\"title\":\"House prices in the rich world are booming\"}]" - } - }, - "title": "House prices are going ballistic" - }, - { - "_additional": { - "generate": null - }, - "title": "Supply - Politicians are finally doing something about housing shortages" - }, - { - "_additional": 
{ - "generate": null - }, - "title": "House prices in the rich world are booming" - } - ] - } - } -} -# END Expected GroupedTaskWithProperties results -""" diff --git a/docs/academy/py/zero_to_mvp/104_queries_2/_snippets/40_qna.py b/docs/academy/py/zero_to_mvp/104_queries_2/_snippets/40_qna.py deleted file mode 100644 index 1b8006648..000000000 --- a/docs/academy/py/zero_to_mvp/104_queries_2/_snippets/40_qna.py +++ /dev/null @@ -1,112 +0,0 @@ -# ===== Instantiate Weaviate client w/ auth config ===== -import weaviate -import json - -client = weaviate.Client( - url="https://WEAVIATE_INSTANCE_URL", # Replace with your Weaviate endpoint - auth_client_secret=weaviate.auth.AuthApiKey(api_key="YOUR-WEAVIATE-API-KEY"), # Replace with your API Key for the Weaviate instance. Delete if authentication is disabled. - additional_headers={ - "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY", - }, -) - -# ===== SimpleQnA Query ===== - -# SimpleAskQuery -ask = { - "question": "How many championships does Lewis Hamilton have?", -} - -response = ( - client.query - .get("WikiArticle", ["title", "_additional {answer {hasAnswer property result startPosition endPosition} }"]) - .with_ask(ask) - .with_limit(1) - .do() -) - -print(json.dumps(response, indent=2)) -# END SimpleAskQuery - -# Tests -assert "WikiArticle" in response["data"]["Get"] -assert len(response["data"]["Get"]["WikiArticle"]) == 1 -assert response["data"]["Get"]["WikiArticle"][0].keys() == {"title", "_additional"} -assert response["data"]["Get"]["WikiArticle"][0]["_additional"].keys() == {"answer"} -# End test - -""" -# Expected SimpleAskQuery results -{ - "data": { - "Get": { - "WikiArticle": [ - { - "_additional": { - "answer": { - "endPosition": 0, - "hasAnswer": true, - "property": "", - "result": " Lewis Hamilton has seven World Drivers' Championship titles.", - "startPosition": 0 - } - }, - "title": "Lewis Hamilton" - } - ] - } - } -} -# END Expected SimpleAskQuery results -""" - -# ===== QnA Query with Properties ===== - 
-# AskQueryWithProperties -ask = { - "question": "How many championships does Lewis Hamilton have?", - "properties": ["title"] -} - -response = ( - client.query - .get("WikiArticle", ["title", "_additional {answer {hasAnswer property result startPosition endPosition} }"]) - .with_ask(ask) - .with_limit(1) - .do() -) - -print(json.dumps(response, indent=2)) -# END AskQueryWithProperties - -# Tests -assert "WikiArticle" in response["data"]["Get"] -assert len(response["data"]["Get"]["WikiArticle"]) == 1 -assert response["data"]["Get"]["WikiArticle"][0].keys() == {"title", "_additional"} -assert response["data"]["Get"]["WikiArticle"][0]["_additional"].keys() == {"answer"} -# End test - -""" -# Expected AskQueryWithProperties results -{ - "data": { - "Get": { - "WikiArticle": [ - { - "_additional": { - "answer": { - "endPosition": 0, - "hasAnswer": true, - "property": "", - "result": " Lewis Hamilton has seven World Drivers' Championship titles.", - "startPosition": 0 - } - }, - "title": "Lewis Hamilton" - } - ] - } - } -} -# END Expected AskQueryWithProperties results -""" diff --git a/docs/academy/py/zero_to_mvp/104_queries_2/index.mdx b/docs/academy/py/zero_to_mvp/104_queries_2/index.mdx deleted file mode 100644 index a940d73f9..000000000 --- a/docs/academy/py/zero_to_mvp/104_queries_2/index.mdx +++ /dev/null @@ -1,51 +0,0 @@ ---- -title: P3_104 Queries 2 (Python) -description: Continue exploring Weaviate queries with advanced techniques and custom filters. -sidebar_position: 104 ---- - -import ZeroToMvpDeprecationNote from '/docs/academy/py/zero_to_mvp/_snippets/deprecation.md'; - - - -## Unit overview - -import imageUrl from '../../tmp_images/academy_placeholder.jpg'; - -[//]: # (Image alt) - - - -In this unit, you will learn about even more types of queries that you can run with Weaviate. - -We will start by looking how to use keyword searches with the BM25F ranking algorithm, as well as hybrid searches that combines keyword searches with vector searches. 
- -Then, we will look at generative searches and question and answer (QnA) searches. These searches allow you to not only retrieve data objects, but to transform the results before returning them to you. - -By the end of this unit, you will have a suite of query types to choose from, so that you can retrieve the right information in the form that you want. - -Let's get started. - -### Prerequisites - -- (**Required**) A Python (3) environment with `weaviate-client` installed. -- (**Required**) Complete [101A Weaviate Academy Preparation](../setup.mdx) -- (*Recommended*) Complete [Hello, Weaviate](../101_hello_weaviate/index.mdx) -- (*Recommended*) Complete [Queries 1](../102_queries_1/index.mdx) -- (*Recommended*) Complete [Schema and Imports](../103_schema_and_imports/index.mdx) - -## Learning objectives - -import LearningGoalsExp from '/src/components/Academy/learningGoalsExp.mdx'; - - - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/zero_to_mvp/_000_template/_TEMPLATE_10_body.mdx b/docs/academy/py/zero_to_mvp/_000_template/_TEMPLATE_10_body.mdx deleted file mode 100644 index bde2962e6..000000000 --- a/docs/academy/py/zero_to_mvp/_000_template/_TEMPLATE_10_body.mdx +++ /dev/null @@ -1,83 +0,0 @@ ---- -title: Academy main body example ---- - -import imageUrl from '../../tmp_images/academy_placeholder.jpg'; - -Image alt - -## TOP LEVEL HEADING - -Some text - -### Theory subhead -### Practical subhead - -Some text - -### Theory subhead -### Practical subhead - -Some text - -## TOP LEVEL HEADING - -Some text - -### Theory subhead -### Practical subhead - -Some text - -### Theory subhead -### Practical subhead - -Some text - - - -## Review - - - -Any quiz questions - -### Review exercise - -:::note Exercise -Try out ... 
-::: - -### Key takeaways - -:::info -Add summary -::: - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - -import Quiz from '/src/components/Academy/quiz.js' -const varName = [{ - questionText: 'questionText', - answerOptions: [ - { - answerText: 'answerOne', - isCorrect: false, - feedback: 'feedbackOne', - }, - { - answerText: 'answerTwo', - isCorrect: false, - feedback: 'feedbackTwo', - }, - { - answerText: 'answerThree', - isCorrect: false, - feedback: 'feedbackThree', - }, - ] -}]; \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/_000_template/_TEMPLATE_90_wrap_up.mdx b/docs/academy/py/zero_to_mvp/_000_template/_TEMPLATE_90_wrap_up.mdx deleted file mode 100644 index 01d547c13..000000000 --- a/docs/academy/py/zero_to_mvp/_000_template/_TEMPLATE_90_wrap_up.mdx +++ /dev/null @@ -1,17 +0,0 @@ ---- -title: Wrap-up ---- - -## Unit review - -In this unit, you have ... -### Learning outcomes - -Now, you should be able to: -... - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/zero_to_mvp/_000_template/_TEMPLATE_index.mdx b/docs/academy/py/zero_to_mvp/_000_template/_TEMPLATE_index.mdx deleted file mode 100644 index 414e3bd04..000000000 --- a/docs/academy/py/zero_to_mvp/_000_template/_TEMPLATE_index.mdx +++ /dev/null @@ -1,28 +0,0 @@ ---- -title: TITLE -sidebar_position: 101 ---- - - - - -:::warning TODO -Intro video here -::: - -## Learning objectives - -import LearningGoalsExp from '/src/components/Academy/learningGoalsExp.mdx'; - - - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/py/zero_to_mvp/_104_vectorization_essentials/50_cross_references.mdx b/docs/academy/py/zero_to_mvp/_104_vectorization_essentials/50_cross_references.mdx deleted file mode 100644 index 7f861c56d..000000000 --- 
a/docs/academy/py/zero_to_mvp/_104_vectorization_essentials/50_cross_references.mdx +++ /dev/null @@ -1,29 +0,0 @@ ---- -title: Cross references ---- - -## Cross references - -### about them - -### how to set them - -### how import w them - -### Query examples - -## Review - -### Review exercise - -:::caution TODO -Add review exercises -::: - -### Key takeaways - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/_104_vectorization_essentials/70_import_example_2.mdx b/docs/academy/py/zero_to_mvp/_104_vectorization_essentials/70_import_example_2.mdx deleted file mode 100644 index 1a69552bc..000000000 --- a/docs/academy/py/zero_to_mvp/_104_vectorization_essentials/70_import_example_2.mdx +++ /dev/null @@ -1,32 +0,0 @@ ---- -title: Examples ---- - -## Case studies - -### Dataset used -Here, we will use a [small portion](https://github.com/weaviate-tutorials/edu-datasets/blob/main/winemag_tiny.csv) of the dataset of [wine reviews from Kaggle](https://www.kaggle.com/datasets/zynicide/wine-reviews). - -This tiny dataset consists of 50 rows of wine reviews from around the world. 
We will build a schema - -### Explore vectorization options - -### Explore indexing options - -### Image vectorization - -## Review - -### Review exercise - -:::caution TODO -Add review exercises -::: - -### Key takeaways - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/_104_vectorization_essentials/index.mdx b/docs/academy/py/zero_to_mvp/_104_vectorization_essentials/index.mdx deleted file mode 100644 index 404feaedf..000000000 --- a/docs/academy/py/zero_to_mvp/_104_vectorization_essentials/index.mdx +++ /dev/null @@ -1,28 +0,0 @@ ---- -title: 103 Vectorization Essentials -sidebar_position: 103 ---- - -## Unit overview - -import imageUrl from '../../tmp_images/academy_placeholder.jpg'; - -Image alt - - - -Welcome! This unit builds on the previous introductory unit on queries to provide more context and background on the theory, including a look under the hood. - -We will begin by talking about why vector searches work, and what helps them to work so well. This discussion will touch on deep learning models that power vector searches, including why they are so important. This unit will also show you how Weaviate turns data into vectors, as well as how any why you could customize this behavior. We will also discuss groups of vectorizers that are available with Weaviate and rules of thumb on vectorizer selection, so that you can make good, informed decisions to get started with. 
- -### Prerequisites - -- (**Required**) A running instance of Weaviate - - We recommended a `sandbox` instance in [Weaviate Cloud](/cloud) - - With sample dataset "JeopardyQuestions" imported (from [`weaviate-demo-datasets`](https://pypi.org/project/weaviate-demo-datasets/)) -- (*Recommended*) Complete [Hello, Weaviate](../101_hello_weaviate/index.mdx) -- (*Recommended*) Complete [Queries 1](../102_queries_1/index.mdx) - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - diff --git a/docs/academy/py/zero_to_mvp/_104_vectorization_essentials/intro_vectors.mdx b/docs/academy/py/zero_to_mvp/_104_vectorization_essentials/intro_vectors.mdx deleted file mode 100644 index c268cecee..000000000 --- a/docs/academy/py/zero_to_mvp/_104_vectorization_essentials/intro_vectors.mdx +++ /dev/null @@ -1,155 +0,0 @@ ---- -title: Vectors and meaning -sidebar_position: 10 ---- - -import imageUrl from '../../tmp_images/academy_placeholder.jpg'; - -## Vectors and vector search - -Image alt - -So far, we've explained the phrase "vector search" based on what it does - which is a similarity-based search. We've also used it somewhat interchangeably with "semantic search" in this context. Now, let's pause to dig into the details a little more. - -### Vectors and vector search - -![](https://mlabonne.github.io/blog/images/tensor/scalar.png) - -The name "vector search" comes from the fact that we use "vectors" to perform these similarity-based searches. In other words, it is an implementation of a similarity-based search. - -So what, exactly, is a vector? - -A vector in this context is a set of numbers. To be precise, a vector is **an one-dimensional array of numbers**, such as `1, 5, 5`, `0.591, -0.132, 0, 0.105`, or `0, 0, 0, 0, 0, 0`. - -:::note Vector representation -A vector is typically shown as its constituent numbers wrapped in square brackets. 
So, the above vectors may be typically represented as `[1, 5, 5]`, `[0.591, -0.132, 0, 0.105]` and `[0, 0, 0, 0, 0, 0]` respectively. -::: - -A vector itself is a relatively small concept. It's a collection, or a one-dimensional array, of numbers. This may not seem like much in itself. But the real power of vectors comes from the fact that they can be used to represent meaning. So how can a series of numbers represent meaning? - -This is **the** big idea, and one that perhaps takes the most getting used to, if you are new to the concept. Come with us on this journey, and we'll do our best to explain. - -:::info Vector / tensor / array -If you have seen words like "tensor", or "array", in a similar context they generally refer to the same thing - with perhaps slight differences depending on whether they might be referring to particular software libraries, or the number of dimensions. -::: - -:::tip Key takeaway: -The real power of vectors comes from the fact that they can be used to represent meaning. This insight and development, along with modern developments in deep learning, is what powers vector searches. -::: - -### Everyday examples of vectors - -![](../../tmp_images/a_isometric_view_of_a_world_map_with_a_pin_on_it_0.jpg) -![](https://i.stack.imgur.com/ikf0N.jpg) - -One way to think about it is that each number in a vector represents a particular aspect of meaning. Then, the vector itself represents the combination of all of these aspects. - -In fact, many of you may have already used vectors to describe aspects of meaning. For example, you may have used a vector to: -- Describe the position of an object in space by sending a location pin to somebody. This is a vector that contains numbers for the latitude and longitude, or -- Describe the color of an object by a vector of numbers that represent the red, green and blue (and perhaps transparency) components of the color. 
- -In each of these examples, each number in the vector represents a particular aspect of the object, such as its longitude or how green the object is. The entire vector combines these aspects to represent the "meaning", such as its location or its color. - -Extending this idea, the above two vectors might be combined to one vector that contains its latitude, longitude, red, green, blue and transparency. This vector would then represent the object's location and color. - -In other words, a vector can represent any property of an object by enumerating a property such as how far north it is, how far west it is, how red it is, how green it is, how blue it is, and how transparent it is. - - - -## Using vectors to represent meaning - -Image alt - -### Why use vectors to represent meaning? - -![](https://miro.medium.com/v2/resize:fit:1400/format:webp/1*SYiW1MUZul1NvL1kc1RxwQ.png) - -The semantic use of vectors extends this idea further to represent meaning of words, sentences, or even entire documents. - -Again, each number in a vector represents a particular aspect of meaning. Imagine how you would try to represent words like "lion", "grapefruit", "cat" and "cow". You might have each numbers within a vector capture different aspects, such as: -- How "furry" the object is -- How "big" the object is -- How "dangerous" the object is -- How "edible" the object is - -And so on. - -Then, the vector itself would represent the combination of all of these aspects. So, a vector for the word "lion" might be `[0.8, 0.9, 0.9, 0.1]`, while a vector for the word "grapefruit" might be `[0.1, 0.3, 0.1, 0.9]`. - -Each number in a vector represents a particular aspect of meaning, and the entire vector represents the combination of all of these aspects. - -A "real" vector that is generated by modern language models are simply longer. A vector that is used to represent the word "lion" might have a much longer length, such as hundreds or thousands of numbers, but the idea is the same. 
Each number represents some aspect of a meaning, and the resulting entire vector can be used to represent the meaning of the word. - -### How are vectors actually generated? - -![](../../tmp_images/a_machine_with_a_conveyer_belt_producing_many_long_numbers_0.jpg) - -Vectors are generated by machine learning models. These models are trained on large amounts of data, and are able to learn the relationships between words and their meanings. - -These models use the training data to infer relationships between words and their meanings. For example, a model might learn that the word "lion" is similar to the word "tiger" because they are both big cats. It might also learn that the word "lion" is similar to the word "cow" because they are both animals. And then, they use this knowledge to find-tune individual numbers in vectors such that they best represent the meaning of the words. - -This is why choosing the right model is so important. The model you choose will have a significant impact on the quality of the vectors, and what it means to be "similar" to another word. - -A model that is trained based on how words are used in everyday English is going to be different from a model that is trained on how words are used in scientific papers, or in particular domains such as medicine or law. - -We will cover this in more detail elsewhere in the course, in regards to how to choose the right model for your use case. - - - - - -## Review - -:::warning TODO -Video here -::: - -### Review exercise - -Can you describe, in your own sentence, XXX? - -:::warning TODO -Input box for user to put answer in and get back a similarity score & our definition? -?? 
-::: - -### Key takeaways - -:::info -Add summary -::: - -import Quiz from '/src/components/Academy/quiz.js' -const vectorDefinition = [ - { - questionText: 'What is a vector in the context of data science?', - answerOptions: [ - { answerText: 'An array of letters.', isCorrect: false, feedback: 'It is not an array of letters.'}, - { answerText: 'An array of numbers.', isCorrect: true, feedback: 'Note: Typically they are floating point numbers.'}, - { answerText: 'An array of symbols.', isCorrect: false, feedback: 'It is not an array of symbols.'}, - { answerText: 'A number with direction and magnitude.', isCorrect: false, feedback: 'This is a definition in science or mathematics, but not so much in the data science context.'}, - ], - }, -]; -const vectorsAndMeaning = [ - { - questionText: 'Which of these is not a good example of vectors representing meaning?', - answerOptions: [ - { answerText: 'An RGB value representing color.', isCorrect: false, feedback: 'Actually, this is a good example. An RGB value uses three numbers for (R)ed, (G)reen and (B)lue to represent color.'}, - { answerText: 'A location coordinate representing longitude, latitude and altitude.', isCorrect: false, feedback: 'Actually, this is a good example. These three numbers would accurately and precisely describe the location.'}, - { answerText: 'A 1024-dimensional vector generated by a language model to represent the word "butterfly".', isCorrect: false, feedback: 'Actually, this is a good example. 
This is in fact a typical way in which modern vector would be generated.'}, - { answerText: 'None, these are all good examples.', isCorrect: true, feedback: 'All of these are perfectly reasonable examples of vectors representing meaning.'}, - ], - }, -]; -const vectorGeneration = [ - { - questionText: 'How are vectors generated?', - answerOptions: [ - { answerText: 'Manually inputting numbers into an array.', isCorrect: false, feedback: 'This might be theoretically possible, but highly impractical.'}, - { answerText: 'Training machine learning models on large amounts of data.', isCorrect: true, feedback: 'This is typically how they are generated.'}, - { answerText: 'Automatically generating random numbers.', isCorrect: false, feedback: 'While this would work, these vectors would not be "meaningful". Interestingly, this is how they are typically "initialized" during training.'}, - { answerText: 'Copying numbers from other vectors.', isCorrect: false, feedback: 'This would work, but would not produce unique vectors.'}, - ], - }, -]; diff --git a/docs/academy/py/zero_to_mvp/_104_vectorization_essentials/vector_parameters.mdx b/docs/academy/py/zero_to_mvp/_104_vectorization_essentials/vector_parameters.mdx deleted file mode 100644 index 615a12d1e..000000000 --- a/docs/academy/py/zero_to_mvp/_104_vectorization_essentials/vector_parameters.mdx +++ /dev/null @@ -1,120 +0,0 @@ ---- -title: Effective vector search -sidebar_position: 40 ---- - -import imageUrl from '../../tmp_images/academy_placeholder.jpg'; - -## Vector search theory - -Image alt - -### Distance metrics - -## Vectorizer overview - -Image alt - -### Rules of thumb - -### Setting vectorizers - -## Additional search methods - -### Search operators - -Just as you need to provide search terms to perform a lexical search, Weaviate requires a source vector to perform a vector search. - -These can be provided as arguments in GraphQL, using `nearVector`, `nearObject`, or `nearText` for example. 
- -These are provided to Weaviate as arguments to the class (e.g. `Get`) in the GraphQL query. - -For example, the following query adds a `nearVector` operator to look for similar objects in the `Question` class: - -```graphql -{ - Get { - Question ( - limit: 1, - nearVector: {vector: } - ) { - question - answer - } - } -} - -print(json.dumps(response, indent=2)) -``` - -When a vector for "unusual animals" is used as a the input, the query returns: - -```json -{ - "data": { - "Get": { - "Question": [ - { - "answer": "raccoons", - "question": "The crab-eating one of these masked animals eats fish, fruit & frogs, too" - } - ] - } - } -} -``` - -Search operators can also be used with the `Aggregate` and `Explore` functions. They are key inputs that specify the way in which the vector space is searched. We will review these in more detail in a following section of this unit. - -Image alt - -### `NearObject` - -### `NearVector` - -## Vector search theory - -Image alt - -### Distance metrics - -## Review - -Image alt - -### Review exercise - -Can you describe, in your own sentence, XXX? - -:::warning TODO -Input box for user to put answer in and get back a similarity score & our definition? -?? 
-::: - -### Key takeaways - -:::info -Add summary -::: - -import Quiz from '/src/components/Academy/quiz.js' -const varName = [{ - questionText: 'questionText', - answerOptions: [ - { - answerText: 'answerOne', - isCorrect: false, - feedback: 'feedbackOne', - }, - { - answerText: 'answerTwo', - isCorrect: false, - feedback: 'feedbackTwo', - }, - { - answerText: 'answerThree', - isCorrect: false, - feedback: 'feedbackThree', - }, - ] -}]; \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/_104_vectorization_essentials/vector_search.mdx b/docs/academy/py/zero_to_mvp/_104_vectorization_essentials/vector_search.mdx deleted file mode 100644 index fd0dbcbdf..000000000 --- a/docs/academy/py/zero_to_mvp/_104_vectorization_essentials/vector_search.mdx +++ /dev/null @@ -1,144 +0,0 @@ ---- -title: Vector search, explained -sidebar_position: 20 ---- - -## About vector search - -:::warning TODO -Intro video here -::: - -### How does vector search work? - -![](../../tmp_images/person_holding_a_magnifying_glass_in_front_of_a_big_pile_of_numbers_1.jpg) - -Earlier on, we mentioned that a vector can be used to represent the meaning of a word. In an earlier section, we also mentioned that a vector search is a fantastic way of carrying out similarity-based searches. - -You can probably see how these two ideas converge. A vector search relies on these numerical representation of meaning to carry out similarity-based searches. The closer two vectors are to each other, the more similar the words they represent. - -A vector search therefore works by providing a vector-based query object to a vector index. The vector index then returns a list of results that are most similar to the query object. - -It may be that a vector index takes as input an object such as a text passage or an image which it converts to a vector. Or it may receive a vector as the query object directly. - -### How is vector similarity measured? 
- -![](../../tmp_images/scale_of_justice_with_words_on_either_side_0.jpg) - -Broadly speaking, vector similarity measures how close the two sets of numbers that make up each vector are to each other. Depending on your relationship with linear algebra, this may sound like a very abstract concept. - -But the intuition is this: the closer the two vectors are to each other, the more similar the words they represent. - -In practice, there are many different ways to measure vector similarity. The most common methods include: -- Cosine similarity -- Euclidean distance -- Dot product -- Manhattan distance - -The choice of similarity measure will depend on the use case and the model. In Weaviate, the default metric unless specified is `cosine similarity`. We will cover these choices in more detail in another section. - -## What are semantic, and lexical searches? - -:::warning TODO -Intro video here -::: - -### What is a semantic search? - -![](../../tmp_images/confused_people_looking_at_two_similar_piles_of_documents__1.jpg) - -"Semantic" refers to something being "related to meaning". It is often used in the field of linguistics (study of languages) to contrast to "syntactic", which refers to structure or grammar. Thus "semantic search" refers to a search that is based on meaning. - -Vector search is currently the dominant way of performing semantic searches. Because vectors can efficiently and accurate capture meanings of words, sentences and entire documents, they are a great way of carrying out semantic searches. - -We note that "semantic" search also can refer to searches that look for similarities in concepts that are represented by other media such as images, audio or video. As you will see later on, semantic searches refer to comparing particular aspects of these media types, such as the depicted concepts, objects, or tone. - -Accordingly, we will use the terms "semantic search" and "vector search" interchangeably in this course unless otherwise specified. 
- -### What is a lexical search? - -![](../../tmp_images/person_holding_a_magnifying_glass_in_front_of_a_big_pile_of_words_1.jpg) - -Traditional search systems use "lexical" searches, based on a "vocabulary" of the search system or database at hand. - -A lexical search relies on matching exact strings or substrings and using various operators to manipulate sets of information that either meets or does not meet certain criteria. - -For example, looking through a database of recipes to retrieve all objects containing the ingredient "chili" is a lexical search. You might typically combine multiple conditions. The following query looks for entries with "chili" that where added after the year 2010. - -```sql -SELECT * FROM recipe_table WHERE (ingredients LIKE '%chili%') AND (entry_year > 2010) -``` - -While this enquiry will return an exact answer, it suffers from being inflexible. - -For one, it would not retrieve recipes where another string is used (e.g. `ghost pepper`) that is synonymous (has the same meaning) with chili. It would also not handle typos well, for example if the entry was mislabelled as `chilli`, or if the entry was mistyped as such. - -On the other hand, semantic searches manage these issues with a little more nuance. - -:::note Jargon ("lexical", "semantic", "syntactic") -We try to avoid jargon unless necessary. But where we do use them, we will include explanations for those who have not encountered them. - -Because vector search as we use them has some roots in *natural language processing*, terms like *lexical* and *semantic* are commonly used. These are terms that come from linguistics which are used commonly in vector search context. The term *vector* comes from the world of mathematics, as you will see later on. 
🤓 -::: - -### Vector search vs lexical searches - -![](../../tmp_images/person_holding_a_magnifying_glass_in_front_of_a_big_pile_of_numbers_1.jpg) -![](../../tmp_images/person_holding_a_magnifying_glass_in_front_of_a_big_pile_of_words_1.jpg) - -When facing a query of a recipes database with the word `chili` for ingredients, a vector database returns entities that include that exact word in the recipe data. - -This means that a search for `chili` would include in its results entities with semantically related terms such as `ghost pepper`, `jalapeno`, `habanero` or `carolina reaper`, all of which would be ignored by a lexical search. - -A modern vector search can also take the context into account, such as the context of a word as a part of its parent sentence. So a vector search will differentiate between whether the recipe calls for these ingredients, or whether the recipe asks the cook to "*not* use any chili". - -Also importantly, a vector database can determine the degree of similarity between items, such as between a query and each result. Accordingly, the results can be ranked by a meaningful metric that describes the degree of similarity, thereby improving the chance of finding more relevant results faster. - - - - - -## Review - -:::warning TODO -Video here -::: - -### Review exercise - -Can you describe, in your own sentence, XXX? - -:::warning TODO -Input box for user to put answer in and get back a similarity score & our definition? -?? 
-::: - -### Key takeaways - -:::info -Add summary -::: - -import Quiz from '/src/components/Academy/quiz.js' -const lexicalSearch = [ - { - questionText: 'What type of search is based on word or token matches?', - answerOptions: [ - { answerText: 'Lexical search.', isCorrect: true, feedback: 'They are called this because they are "lexical" or vocabulary-based searches.'}, - { answerText: 'Vector search.', isCorrect: false, feedback: 'Vector search uses similarity in vectors.'}, - { answerText: 'Semantic search.', isCorrect: false, feedback: 'Semantic search looks for similarity in meaning.'}, - { answerText: 'None of the above.', isCorrect: false, feedback: 'The truth is out there.'}, - ], - }, -]; -const vectorSearchMethod = [ - { - questionText: 'How does vector search return relevant results?', - answerOptions: [ - { answerText: 'It looks for objects with the highest vector similarity.', isCorrect: true, feedback: 'Because vectors capture meaning, objects with the highest similarity are the most relevant results.'}, - { answerText: 'It looks for objects with the smallest vectors.', isCorrect: false, feedback: 'The magnitude of the vector is largely not relevant.'}, - { answerText: 'It looks for objects with the largest vectors.', isCorrect: false, feedback: 'The magnitude of the vector is largely not relevant.'}, - { answerText: 'It looks for objects with the highest character overlap.', isCorrect: false, feedback: 'This is not quite right. 
It is not about character overlap, but rather about meaning.'}, - ], - }, -]; diff --git a/docs/academy/py/zero_to_mvp/_106_data_import/index.md b/docs/academy/py/zero_to_mvp/_106_data_import/index.md deleted file mode 100644 index 973e4cd4b..000000000 --- a/docs/academy/py/zero_to_mvp/_106_data_import/index.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 106 Data Import -sidebar_position: 106 ---- - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - diff --git a/docs/academy/py/zero_to_mvp/_107_crud_operations/index.md b/docs/academy/py/zero_to_mvp/_107_crud_operations/index.md deleted file mode 100644 index e61d0199d..000000000 --- a/docs/academy/py/zero_to_mvp/_107_crud_operations/index.md +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 107 CRUD Operations -sidebar_position: 107 ---- - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - diff --git a/docs/academy/py/zero_to_mvp/_108_modules/index.mdx b/docs/academy/py/zero_to_mvp/_108_modules/index.mdx deleted file mode 100644 index 2515e437e..000000000 --- a/docs/academy/py/zero_to_mvp/_108_modules/index.mdx +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 108 Modules -sidebar_position: 108 ---- - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - diff --git a/docs/academy/py/zero_to_mvp/_109_tmp_leftovers/102_arguments.mdx b/docs/academy/py/zero_to_mvp/_109_tmp_leftovers/102_arguments.mdx deleted file mode 100644 index b585b1406..000000000 --- a/docs/academy/py/zero_to_mvp/_109_tmp_leftovers/102_arguments.mdx +++ /dev/null @@ -1,144 +0,0 @@ ---- -title: Search filters ---- - -:::tip Nested filters -You can nest `where` filters as well by nesting each item within `operands`. Try it yourself, or take a look at this example: - -
- - Nested where filter example - - - - - -```python -response = client.query.get( - "JeopardyQuestion", - ["question", "answer", "points"] -).with_limit(2).with_near_text( - {"concepts": "Intergalactic travel"} -).with_additional( - ["distance", "id"] -).with_where({ - "operator": "And", - "operands": [ - { - "path": ["question"], - "operator": "Like", - "valueText": "*rocket*" - }, - { - "operator": "Or", - "operands": [ - { - "path": ["points"], - "operator": "Equal", - "valueInt": 200 - }, - { - "path": ["points"], - "operator": "Equal", - "valueInt": 1000 - } - ] - } - ] -}).do() - -print(json.dumps(response, indent=2)) -``` - - - - -```graphql -{ - Get { - JeopardyQuestion ( - limit: 2 - nearText: { - concepts: ["Intergalactic travel"], - } - where: { - operator: And, - operands: [ - { - path: ["question"], - operator: Like, - valueText: "*rocket*" - } - { - operator: Or, - operands: [ - { - path: ["points"], - operator: Equal, - valueInt: 200 - }, - { - path: ["points"], - operator: Equal, - valueInt: 1000 - } - ] - }, - ] - - } - ) { - question - answer - points - _additional { - distance - id - } - } - } -} -``` - - - - -
- -it should produce a result like this: - -
- See the JSON response - -```json -{ - "data": { - "Get": { - "JeopardyQuestion": [ - { - "_additional": { - "distance": 0.18400955, - "id": "48fe1f8f-8e09-5aee-afe9-1728ab2fe8a0" - }, - "answer": "space shuttles", - "points": 200, - "question": "These transports, first sent up in 1981, lift off like a rocket & land like a plane" - }, - { - "_additional": { - "distance": 0.24946856, - "id": "c00decd4-4cf1-5b03-a789-a57077e082fb" - }, - "answer": "Huntsville", - "points": 1000, - "question": "A campus for the University of Alabama is here, nicknamed \"Rocket City, U.S.A.\"" - } - ] - } - } -} -``` - -
- -::: diff --git a/docs/academy/py/zero_to_mvp/_109_tmp_leftovers/_graphql.mdx b/docs/academy/py/zero_to_mvp/_109_tmp_leftovers/_graphql.mdx deleted file mode 100644 index 4ae1a93c2..000000000 --- a/docs/academy/py/zero_to_mvp/_109_tmp_leftovers/_graphql.mdx +++ /dev/null @@ -1,159 +0,0 @@ ---- -title: A little more about GraphQL -sidebar_position: 99 ---- - -import imageUrl from '../../tmp_images/academy_placeholder.jpg'; - -## About GraphQL - -Image alt - -Here, we'll pause a little bit to talk about why we use GraphQL, and what to keep in mind when using GraphQL with Weaviate. - -### Query structure - -One of the key benefits of GraphQL is that it allows the user to specify the exact data that they want to retrieve. This means that the user can request only the data that they need, reducing over-fetching or under-fetching of data. - -Let's review the fundamental syntax of GraphQL in Weaviate. - -```graphql -{ - { - { - - } - } -} -``` - -You can see that a Weaviate GraphQL query contains multiple components. If you were to query a database of Wikipedia articles objects, the fields in the above query might be as follows: - -* ``: The function that you want to use, such as `Get`, `Aggregate`, or `Explore`. -* ``: The class that you want to retrieve data from, such as `WikiArticle`. -* ``: The properties that you want to retrieve, such as `title`, `wiki_summary`. - -Each field can also accept arguments, such as `limit` to limit the number of objects to be returned, or `where` to apply a filter. - -The data flow with a GraphQL query can be conceptualized like this: - -![](https://kinsta.com/wp-content/uploads/2022/09/graphql.png) - -This specificity may seem challenging at first. However, it is a powerful feature that allows you to build highly customized queries, and to retrieve only the data that you need. - -It is also highly efficient, as what may have taken multiple requests in a REST API can be done in a single GraphQL query. 
- -### Query syntax - -For the most part GraphQL is very flexible in terms of formatting, such that [whitespaces](https://spec.graphql.org/October2021/#sec-White-Space) and [commas](https://spec.graphql.org/October2021/#sec-Insignificant-Commas) are typically insignificant. For instance, the queries below are interpreted identically: - -This query: - -```graphql -{ - Get { - JeopardyQuestion (limit: 1) { - question - answer - } - } -} -``` - -Is the same as: - -```graphql -{Get{JeopardyQuestion(limit:1){question answer}}} -``` - -Or even (notice the comma): - -```graphql -{Get{JeopardyQuestion(limit:1){question,answer}}} -``` - -However, importantly, names in GraphQL [are case-sensitive](https://spec.graphql.org/October2021/#sec-Names), so `Get` and `JeopardyQuestion` are different from `get` and `jeopardyquestion`. - - - - - -## Review - -Image alt - -Each section should have a quick recap/review video of about a minute. - -### Review exercise - -**Aspirational section.** Would like to have the user provide a freeform input, and get back a similarity score and our definition. - -:::info TODO -Can you describe, in your own sentence, XXX? - -Input box -> vectorizer -> compare to model answer. 
-::: - -### Key takeaways - -:::info -Add summary -::: - -import Quiz from '/src/components/Academy/quiz.js' -const graphQLSyntax = [{ - questionText: 'Which of the following is true about GraphQL?', - answerOptions: [ - { - answerText: 'It is not case-sensitive.', - isCorrect: false, - feedback: 'GraphQL is case sensitive.', - }, - { - answerText: 'Whitespaces are typically ignored.', - isCorrect: true, - feedback: 'This is also the case with commas.', - }, - { - answerText: 'GraphQL always returns the same fields in the response.', - isCorrect: false, - feedback: 'As you have seen in our Get queries previously, you must specify the fields to be returned in GraphQL.', - }, - ] -}]; diff --git a/docs/academy/py/zero_to_mvp/_109_tmp_leftovers/_leftovers.mdx b/docs/academy/py/zero_to_mvp/_109_tmp_leftovers/_leftovers.mdx deleted file mode 100644 index defa4f335..000000000 --- a/docs/academy/py/zero_to_mvp/_109_tmp_leftovers/_leftovers.mdx +++ /dev/null @@ -1,531 +0,0 @@ -## 102 - ---- -title: Introduction to Queries -sidebar_position: 10 ---- - -import imageUrl from '../../tmp_images/academy_placeholder.jpg'; - -## Running Weaviate queries - -Image alt - -In this unit, we will focus on various strategies to query Weaviate to retrieve information. - -### With the Python client - -Although Weaviate uses GraphQL under the hood for queries, you will not need to directly enter GraphQL in this unit. Instead, we will be using the Python client to perform and learn about queries. - -But note that we also show raw GraphQL queries as well as the Python code in this unit to help learn the underlying query structures. 
- -So please be aware that when we show a query such as this: - -```python -response = client.query.get( - "JeopardyQuestion", - ["question", "answer"] -).with_near_text( - {"concepts": ["intergalactic travel"]} -).with_limit(2).do() - -print(json.dumps(response, indent=2)) -``` - -This is what happens under the hood: - -```graphql -{ - Get { - JeopardyQuestion ( - nearText: { - concepts: ["intergalactic travel"] - } - limit: 2 - ) { - question - answer - } - } -} -``` - -We will show them in separate tabs going forward where applicable, like so: - - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -This tab will show Python code. -``` - - - - -```graphql -This tab will show GraphQL code. -``` - - - - -### Raw GraphQL queries - -The Weaviate Python client can run raw GraphQL queries. You can also use the [Weaviate Query app](/cloud/tools/query-tool). - -In Python, you can run a GraphQL query directly with: - -```python -query = ''' -{ - Get { - JeopardyQuestion ( - nearText: { - concepts: ["intergalactic travel"] - } - limit: 2 - ) { - question - answer - } - } -} -''' - -response = client.query.raw(query) -print(json.dumps(response, indent=2)) -``` - -Or, the Weaviate Console provides a graphical interface that you can connect to your Weaviate instance and run queries. - - - -### GraphQL syntax - -You will see terms such as `Function`, `Class`, `properties` and `Argument` in this units. They come from Weaviate's GraphQL syntax, which takes this form: - -```graphql -{ - { - () { - - } - } -} -``` - -Where: -- `` is the action, -- `` is the target data collection, -- `` specifies any options, and -- `` describes the data to be retrieved. - -So keep these terms in mind. - - - -## Query functions - -Image alt - -The primary query functions in Weaviate are `Get`, `Aggregate` or `Explore`. - -These three functions are used to `Get` objects, `Aggregate` information and `Explore` vector spaces. 
- -Let's review each one briefly, before we learn about them in detail in the following sections. - -### `Get` objects - -The `Get` function is used in Weaviate to retrieve data objects. For many use cases, `Get` will be the most common type of query function used in Weaviate. - -Take a look at the following: - - - - -```python -response = client.query.get( - "JeopardyQuestion", - ["question", "answer"] -).with_limit(1).do() - -print(json.dumps(response, indent=2)) -``` - - - - -```graphql -{ - Get { - JeopardyQuestion (limit: 1) { - question - answer - } - } -} -``` - - - -This is a `Get` query to `JeopardyQuestion` class, with a limit of maximum `1` returned object, with the object's `["question", "answer"]` properties. - -It should produce a response like the following: - -```json -{ - "data": { - "Get": { - "JeopardyQuestion": [ - { - "answer": "Amazon.com", - "question": "On July 16, 1995 this company made its first sale, a science textbook" - } - ] - } - } -} -``` - -You can see that the response includes a data object from the `JeopardyQuestion` class, and values of its `question` and `answer` properties. - - -================= - -Notice the presence of the `groupBy` argument. - -`groupBy` is an optional argument that can be used to group results of the query. The `Aggregate` function can also return metadata about the results, such as the number of results in each group. 
- -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; - - - - -```python -response = client.query.aggregate( - , -).with_group_by_filter( - -).with_fields( - "groupedBy {path value}" # requires `groupBy` argument - -).with_meta_count( # optional -)..do() -``` - - - - -```graphql -{ - Aggregate (groupBy:[]) { # `groupBy`: optional argument - groupedBy { # requires `groupBy` argument - path - value - } - meta { # optional - count - } - ( - - ) { - - } - } -} -``` - - - - -### `Aggregate` information - -Whereas `Get` returns objects, `Aggregate` returns summary statistics or aggregates from the result set. - -As such you can use `Aggregate` to obtain summary values such as counts, sums, means and so on from groups of objects. Take a look at the following: - - - - -```python -response = client.query.aggregate( - "JeopardyQuestion", -).with_near_text( - {"concepts": ["Animals in movies"]} -).with_object_limit( - 10 -).with_meta_count().with_fields("value {maximum minimum mean}").do() - -print(json.dumps(response, indent=2)) -``` - - - - -```graphql -{ - Aggregate { - JeopardyQuestion ( - objectLimit: 10 - nearText: { - concepts: ["Animals in movies"] - } - ) { - meta { - count - } - points { - maximum - minimum - mean - } - } - } -} -``` - - - - -The previous query will grab 10 closest objects to the query text in `JeopardyQuestion` class, and return the maximum, minimum and mean `value` as well as the count. - -It should produce a response like the following: - -```json -{ - "data": { - "Aggregate": { - "JeopardyQuestion": [ - { - "meta": { - "count": 10 - }, - "points": { - "maximum": 1000, - "mean": 320, - "minimum": 0 - } - } - ] - } - } -} -``` - -Notice that even though the query related to a maximum of 10 objects, the result was *one* set of aggregated values. - -### `Explore` vector spaces - -`Explore` allows you to navigate the entire vector space of objects stored in Weaviate. 
- -This means that you can use `Explore` to find objects that are similar to a given object or vector, regardless of the class of the object. As such, `Explore` is very useful when you may not be sure of what class to search for. - -In relation to our dataset, a query using the `Explore` function would return a set of objects from both the `JeopardyQuestion` and `Category` classes. - -We will explore the `Explore` function in more detail in a separate, following unit. - - - -## Review - -Image alt - -### Review exercise - -Can you describe, in your own sentence, XXX? - -:::warning TODO -Input box for user to put answer in and get back a similarity score & our definition? -?? -::: - - - -## Sub-properties for `Aggregate` - -As we have seen, the `meta` property is available for all data types, and can be used with the `count` sub-property to return the number of retrieved objects. - -In addition to this, there are a number of other sub-properties that can be used with `Aggregate` queries. Their availability, however, depend on the data type that is being queried. - -Let's take a look at some of these sub-properties, based on the `Question` class. - -### Example - -Try the following query: - - - - -```python -response = client.query.aggregate( - "JeopardyQuestion", -).with_fields( - "round {type topOccurrences {value occurs}}" -).with_near_text( - {"concepts": ["Intergalactic travel"]} -).with_object_limit(10).with_meta_count().do() - -print(json.dumps(response, indent=2)) -``` - - - - -```graphql -{ - Aggregate { - JeopardyQuestion ( - nearText: { - concepts: ["Intergalactic travel"], - distance: 0.2 - } - ) { - meta { - count - } - round { - type - topOccurrences { - value - occurs - } - } - } - } -} -``` - - - - -Take a note of the requested properties in the GraphQL query, and consider what types of values they might return. How might these properties change according to the nature of the source data? - -Now, try it out yourself. 
The query should return something like this: - -
- See the JSON response - -```json -{ - "data": { - "Aggregate": { - "JeopardyQuestion": [ - { - "meta": { - "count": 10 - }, - "round": { - "topOccurrences": [ - { - "occurs": 5, - "value": "Double Jeopardy!" - }, - { - "occurs": 4, - "value": "Jeopardy!" - }, - { - "occurs": 1, - "value": "Final Jeopardy!" - } - ], - "type": "text" - } - } - ] - } - } -} -``` - -
- -
- Explain this query - -The sub-properties under `round` in the query requests additional aggregations here, resulting in the top occurring answers being returned as well as their counts. - -
- -### Available sub-properties - -Sub-properties allow further granular aggregations of data from Weaviate. These properties that can be queried will vary according to the data type that is being aggregated. - -Along with the `string` data type, `text` properties may be queried to retrieve one or more of: - -- count -- type -- topOccurrences - -However, `int` data types, for example, can be aggregated to retrieve: - -- count -- type -- minimum -- maximum -- mean -- median -- mode -- sum - -:::note Exercise -Try out the above query again, with these changes. -- Can you aggregate the results based on the data from the `answer` property? -- Try modifying the query to group the results by `round`, and then aggregate top occurrences from `answer`. -::: - -:::info -Add summary -::: - -import Quiz from '/src/components/Academy/quiz.js' -const queryMethods = [{ - questionText: 'Which of these is not a valid way of running queries with Weaviate?', - answerOptions: [ - { - answerText: 'Sending a HTTP GET request.', - isCorrect: true, - feedback: 'In Weaviate, the REST interface is not used for queries.', - }, - { - answerText: 'Using the Weaviate Console.', - isCorrect: false, - feedback: 'You can use the Weaviate Console to directly enter GraphQL queries', - }, - { - answerText: 'With the Weaviate Python client.', - isCorrect: false, - feedback: 'In fact, you can send raw GraphQL queries or use native Python methods to perform queries with the Python client.', - }, - ] -}]; -const functionExplanations = [{ - questionText: 'Which of the following are correct?', - answerOptions: [ - { - answerText: 'You can use the Get function to retrieve summary information about a group of objects.', - isCorrect: false, - feedback: 'The Get function is used to retrieve individual objects.', - }, - { - answerText: 'The Aggregate function will return objects from Weaviate.', - isCorrect: false, - feedback: 'The Aggregate function will return summary, or aggregated, information about 
retrieved objects.', - }, - { - answerText: 'The Get function can retrieve objects from multiple classes in one query.', - isCorrect: false, - feedback: 'Each Get query can only search one class of objects.', - }, - { - answerText: 'None of the above.', - isCorrect: true, - feedback: 'All of the above are false!', - }, - ] -}]; \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/_109_tmp_leftovers/tmp.mdx b/docs/academy/py/zero_to_mvp/_109_tmp_leftovers/tmp.mdx deleted file mode 100644 index 55e72ac65..000000000 --- a/docs/academy/py/zero_to_mvp/_109_tmp_leftovers/tmp.mdx +++ /dev/null @@ -1,77 +0,0 @@ -# Leftover information - -### `Get` with cross-references - -And before we wrap up this section, let's take a look at how to perform a `Get` query with cross-references. While we have not yet covered cross-references in detail in this unit, for now it is sufficient to know that they are a way to connect objects to each other. - -In the example below, we will perform a `Get` query on the `Question` class, but this time we will include the `hasCategory` property in the query. This property is a cross-reference to the `Category` class, and will be used to return the category of each question. - -```graphql -{ - Get { - Question ( - limit: 1 - nearText: { - concepts: ["Seafaring technology from Scandanavia."], - } - ) { - question - answer - hasCategory { - ... on Category { - title - _additional { - id - } - } - } - _additional { - distance - id - } - } - } -} -``` - -This produces a result like the following: - -
- Query results - -```json -{ - "data": { - "Get": { - "Question": [ - { - "_additional": { - "distance": 0.14927602, - "id": "5d9d5c8d-13e7-5791-b2f9-3724a1a1d301" - }, - "answer": "Oslo", - "hasCategory": [ - { - "_additional": { - "id": "fec50326-dfa1-53c9-90e8-63d0240bd933" - }, - "title": "MUSEUMS" - } - ], - "question": "1,200-year-old Viking ships are on display at the Vikingskiphuset in this Norwegian capital" - } - ] - } - } -} -``` - -
- -Where the `hasCategory` property is now an array of objects, containing objects of type `Category` that the Question is related to and their requested properties. The `title` property of the `Category` class is returned as well, and the `_additional` field contains the `id` of the category. - -Cross-references provide you with a great deal of flexibility and power in your data model, and are a great way to connect objects to each other. We will cover them in more detail in our unit on schemas. - -:::info `...` pattern -You may have noticed the `... on Category` pattern in the above query. This syntax is called a "fragment" in GraphQL terms. Populated this way, the fragment specifies that the `hasCategory` property is of type `Category`, and that the results should be returned as such. -::: diff --git a/docs/academy/py/zero_to_mvp/_999_vectorizer_selection_1/combining_models.md b/docs/academy/py/zero_to_mvp/_999_vectorizer_selection_1/combining_models.md deleted file mode 100644 index d4b1d11c3..000000000 --- a/docs/academy/py/zero_to_mvp/_999_vectorizer_selection_1/combining_models.md +++ /dev/null @@ -1,12 +0,0 @@ ---- -title: Combining models -sidebar_position: 3 ---- - -Inspired by discussions here https://weaviate.slack.com/archives/C017EG2SL3H/p1677534406507339 - -- How to combine models for things like Generative search + vectorizer - -- CLIP + Q&a - -etc etc \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/_999_vectorizer_selection_1/common_model_types.md b/docs/academy/py/zero_to_mvp/_999_vectorizer_selection_1/common_model_types.md deleted file mode 100644 index 3aa1c8070..000000000 --- a/docs/academy/py/zero_to_mvp/_999_vectorizer_selection_1/common_model_types.md +++ /dev/null @@ -1,100 +0,0 @@ ---- -title: Common model types -sidebar_position: 2 ---- - -## Types of vector representations - -:::warning TODO -Intro video here -::: - -When we talk about vectors, we are typically referring to vectors that are derived by 
machine-learning models. More specifically, we refer to vectors that are derived from neural networks, called "dense" vectors. - -However, there are other vector representations that are used to represent meaning, especially in relation to textual meaning. They include: - -- One-hot encoding -- TF-IDF (term frequency-inverse document frequency) vectors -- BM25 vectors - -Let's take a brief look at each one, as well as dense vectors. - -### One-hot encoding - -One-hot encodings represent text as a collection of 0s and 1s, where each 1 represents the presence of a word in the text. Sometimes this is also referred to as a "bag of words" representation. - -Accordingly, this representation ends up being very sparse, with most of the vector being 0s. This is because most words are not present in a given text. - -A limitation of this method is that it is not able to capture similarity of words, as each word is simply represented as being present or not present. Additionally, it is not able to take into account the relative importance of words in a text. - -### TF-IDF vectors - -A TF-IDF representation improves on the one-hot encoding by taking into account the relative importance of words in a text. - -TF-IDF stands for "term frequency-inverse document frequency". It is a numerical statistic that is intended to reflect how important a word is to a document in a collection or corpus. - -The TF-IDF value increases proportionally to the number of times a word appears in the document, but is offset by the frequency of the word in the corpus. This means that common words across all documents such as "the", "a", "is" and "are" are penalized, whereas words that are rare across all documents are given more weight. - -Intuitively, this means that TF-IDF is able to capture the relative importance of words in a text by weighting rare words more heavily. - -### BM25 vectors - -BM25 vectors are similar to TF-IDF vectors, but they are able to take into account the length of a document. 
This is important because longer documents will have more words, and therefore a higher TF-IDF score, even if they are not more relevant than shorter documents. - -Accordingly, BM25 vectors normalize the TF-IDF score by the length of the document. - -### Dense Vectors - -#### Word vectors - -Word vectors are a type of vector representation that is derived from neural networks. They are able to capture the meaning of words by learning the context in which they appear. - -Models such as "word2vec" and "GloVe" popularized this type of representational learning. One key shortcoming of word-based vectors is that they are not able to take into account local context, such as the context of a word as a part of its parent sentence. - -This meant that where words need to be disambiguated, such as in the case of homonyms, word vectors were not able to capture the meaning of the word in the context of the sentence. (For example, the word "bank" can mean a financial institution or a river bank.) - -:::note Word vectors + weighting -Word vectors in a text can be combined with a weighting method such as TF-IDF or BM25 to capture the relative importance of words in the text. The resulting vector can be used to represent the entire text. -::: - -#### Transformer-derived vectors - -Most modern vector database use vectors that are derived from what are called "transformer" models. - -Transformers are a type of neural network that are able to take into account the context of its parent sentence in determining the meaning of each word. This means that they are able to disambiguate words that have multiple meanings, such as the word "bank" in the example above. - -Their current key shortcoming is their resource-intensive nature, especially as the input size (e.g. text length) increases. 
- - - - - -## By media type - -## Text vectorizer types - -## Multi-media vectorizers - -import Quiz from '/src/components/Academy/quiz.js' -const sparseOrDense = [ - { - questionText: 'From the folloowing, select the correct statement about sparse and dense vectors.', - answerOptions: [ - { answerText: 'One-hot encoding & word vectors: sparse, Transformer-derived: dense.', isCorrect: false, feedback: 'Word vectors are not sparse.'}, - { answerText: 'Document vector generated from BM25-weighted word vectors: sparse.', isCorrect: false, feedback: 'Word vectors are dense. Accordingly, a document vector generated by weighting BM25 scores are also dense.'}, - { answerText: 'One-hot encoding: sparse, TF-IDF based bag of words: dense.', isCorrect: false, feedback: 'Bag-of-words vectors are sparse. Accordingly, a vector that is based on TF-IDF weighting is also sparse.'}, - { answerText: 'One-hot encoding: sparse, Word vectors & transformer-derived: dense.', isCorrect: true, feedback: 'This is the only correct answer.'}, - ], - }, -]; -const wordVecVsTransformer = [ - { - questionText: 'Select the correct statement.', - answerOptions: [ - { answerText: 'One-hot encoding & word vectors: sparse, Transformer-derived: dense.', isCorrect: false, feedback: 'Word vectors are not sparse.'}, - { answerText: 'Document vector generated from BM25-weighted word vectors: sparse.', isCorrect: false, feedback: 'Word vectors are dense. Accordingly, a document vector generated by weighting BM25 scores are also dense.'}, - { answerText: 'One-hot encoding: sparse, TF-IDF based bag of words: dense.', isCorrect: false, feedback: 'Bag-of-words vectors are sparse. 
Accordingly, a vector that is based on TF-IDF weighting is also sparse.'}, - { answerText: 'One-hot encoding: sparse, Word vectors & transformer-derived: dense.', isCorrect: true, feedback: 'This is the only correct answer.'}, - ], - }, -]; \ No newline at end of file diff --git a/docs/academy/py/zero_to_mvp/_999_vectorizer_selection_1/index.mdx b/docs/academy/py/zero_to_mvp/_999_vectorizer_selection_1/index.mdx deleted file mode 100644 index f3166c7cb..000000000 --- a/docs/academy/py/zero_to_mvp/_999_vectorizer_selection_1/index.mdx +++ /dev/null @@ -1,8 +0,0 @@ ---- -title: 105 Vectorizer Selection 1 -sidebar_position: 105 ---- - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - diff --git a/docs/academy/py/zero_to_mvp/_snippets/deprecation.md b/docs/academy/py/zero_to_mvp/_snippets/deprecation.md deleted file mode 100644 index 894e4cbac..000000000 --- a/docs/academy/py/zero_to_mvp/_snippets/deprecation.md +++ /dev/null @@ -1,5 +0,0 @@ -:::info -This course was written for the Weaviate Python client API (`v3`), and is now deprecated. - -If you are new to Weaviate, we recommend you start with one of the 100-level courses written with the `v4` client API, such as those for working with [text data](/academy/py/starter_text_data/index.md), [your own vectors](/academy/py/starter_custom_vectors/index.md), or [multimodal data](/academy/py/starter_multimodal_data/index.md). 
-::: diff --git a/docs/academy/py/zero_to_mvp/_snippets/setup.py b/docs/academy/py/zero_to_mvp/_snippets/setup.py deleted file mode 100644 index 5dbd6853b..000000000 --- a/docs/academy/py/zero_to_mvp/_snippets/setup.py +++ /dev/null @@ -1,87 +0,0 @@ -# ===== Basic instantiation ===== -import weaviate -import os - -client = weaviate.Client( - url="https://edu-demo.weaviate.network", - auth_client_secret=weaviate.auth.AuthApiKey(api_key="learn-weaviate"), # A read-only API Key for the Weaviate instance -) - -print(client.is_ready()) # This should return `True` -# ===== END Basic instantiation ===== -assert client.is_ready() - -# ===== Instantiate to edu-demo with OpenAI ===== -import weaviate - -client = weaviate.Client( - url="https://edu-demo.weaviate.network", - auth_client_secret=weaviate.auth.AuthApiKey(api_key="learn-weaviate"), # A read-only API Key for the Weaviate instance - additional_headers={ - "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY" # Replace with your OPENAI API key - } -) -# ===== END Instantiate to edu-demo with OpenAI ===== -assert client.is_ready() - -# ===== Arbitrary code example ===== -import json - -response = client.query.get( - "JeopardyQuestion", - ["question", "answer"] -).with_near_text( - {"concepts": ["intergalactic travel"]} -).with_limit(2).do() - -print(json.dumps(response, indent=2)) -# ===== END Arbitrary code example ===== -assert len(response["data"]["Get"]["JeopardyQuestion"][0]) == 2 - -# ===== Fuller arbitrary code example ===== -import weaviate -import json - -client = weaviate.Client( - url="https://edu-demo.weaviate.network", - auth_client_secret=weaviate.auth.AuthApiKey(api_key="learn-weaviate"), # A read-only API Key for the Weaviate instance - additional_headers={ - "X-OpenAI-Api-Key": "YOUR-OPENAI-API-KEY" # Replace with your OPENAI API key - } -) - -response = client.query.get( - "JeopardyQuestion", - ["question", "answer"] -).with_near_text( - {"concepts": ["intergalactic travel"]} -).with_limit(2).do() - 
-print(json.dumps(response, indent=2)) -# ===== END Fuller arbitrary code example ===== -assert len(response["data"]["Get"]["JeopardyQuestion"][0]) == 2 - -# ===== GraphQL example with Python execution ===== -gql_query = """ -# ===== Equivalent GraphQL example ===== -{ - Get { - JeopardyQuestion ( - nearText: { - concepts: ["intergalactic travel"] - } - limit: 2 - ) { - question - answer - } - } -} -# ===== END Equivalent GraphQL example ===== -""" - -gql_response = client.query.raw(gql_query) -print(json.dumps(gql_response, indent=2)) -# ===== END GraphQL example with Python execution ===== - -assert response == gql_response diff --git a/docs/academy/py/zero_to_mvp/index.md b/docs/academy/py/zero_to_mvp/index.md deleted file mode 100644 index bb484e3ef..000000000 --- a/docs/academy/py/zero_to_mvp/index.md +++ /dev/null @@ -1,28 +0,0 @@ ---- -title: (v3) Zero to MVP -description: Build a Weaviate MVP from scratch, covering setup, data queries, and schema. ---- - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; -import CourseUnits from '/src/components/Academy/courseUnits.jsx'; -import { courseData } from '/src/components/Academy/courseData.js' - -## Course overview - -import ZeroToMvpDeprecationNote from '/docs/academy/py/zero_to_mvp/_snippets/deprecation.md'; - - - -This course is designed to get you started with Weaviate, so that you can go from being new to Weaviate to building an MVP-level product with Weaviate in a short period of time. - -Along the way, you'll develop intuitions about not only how Weaviate works, but also how vectors work, and how vector searches work. You'll also learn how to use Weaviate's client library so that you can get going in a language that you are familiar with. - -By the time you're done with these short units, you'll be able to build your own instance of Weaviate with your own data, and have a suite of search tools at your disposal so that you can get the data you want in the format you want it. 
- -## Learning objectives - - - -## Units - - diff --git a/docs/academy/py/zero_to_mvp/setup.mdx b/docs/academy/py/zero_to_mvp/setup.mdx deleted file mode 100644 index 21267ab29..000000000 --- a/docs/academy/py/zero_to_mvp/setup.mdx +++ /dev/null @@ -1,192 +0,0 @@ ---- -title: P3_101A Weaviate Academy Preparation -description: Set up Weaviate from scratch and start building your first project. -sidebar_position: 101.5 ---- - -import Tabs from '@theme/Tabs'; -import TabItem from '@theme/TabItem'; -import PythonCodeExample from '!!raw-loader!./_snippets/setup.py'; -import FilteredTextBlock from '@site/src/components/Documentation/FilteredTextBlock'; - -import ZeroToMvpDeprecationNote from '/docs/academy/py/zero_to_mvp/_snippets/deprecation.md'; - - - -## Overview - -Follow this short guide to make sure that you are set up to use Weaviate with the Python client. - - - -## Python & Python client - -If you have not yet set up Python and the Weaviate Python client, follow the instructions below. - -### Install Python - -You can install Python 3 in a variety of ways. One easy way is to use an appropriate installer for your system as per instructions on [Python.org](https://www.python.org/downloads/). - -### Install the Weaviate client - -:::tip Virtual environments -It is best practice to use virtual environments to isolate projects. - -If you're not familiar with virtual environments, we highly recommend reading up on them - this tutorial on [FreeCodeCamp](https://www.freecodecamp.org/news/how-to-setup-virtual-environments-in-python/) is a good resource, as is this article on [RealPython](https://realpython.com/python-virtual-environments-a-primer/), which goes a little more in-depth. 
-::: - -You will also need the following libraries: - -``` -weaviate-client -``` - -Activate your virtual environment, and install the Weaviate client with: - -```shell -pip install weaviate-client -``` - -### Running your Python code - -We recommend running Python code in a Jupyter notebook. You can install the library by running - -```shell -pip install notebook -``` - -Then running the shown code in a Jupyter notebook. Of course, you can also run it in any way that you prefer. - -:::tip New to Jupyter? -To learn how to use Jupyter notebooks, [try this tutorial](https://realpython.com/jupyter-notebook-introduction/). -::: - -## Weaviate - -### Instance access - -We will be accessing a pre-built instance of Weaviate throughout the course, located at `https://edu-demo.weaviate.network`. - -You can connect to the pre-built instance with a read-only Weaviate API key. - -Confirm that you can run the following code in your Python environment. - - - -If everything works, the code returns `True`. - -### Inference API Key - -We will be using OpenAI's API, so you will need an API key. - -If you haven't yet, create an OpenAI account, and create a [free API key here](https://platform.openai.com/api-keys). - -Then, instantiate the client as shown below, replacing `YOUR-OPENAI-API-KEY` with your own API key for OpenAI. - - - -## Code examples - -### Client instantiation - -For brevity, many code examples in the Weaviate Academy will not show the instantiation of the client. So, where you see examples such as: - - - -You will need to instantiate the client as shown in the previous section. So, the above code would become: - - - -### GraphQL - -Weaviate Academy units are written to be used with a client library such as Python. So, you will not need to run GraphQL or REST queries directly. - -But note that we also show raw GraphQL/REST queries as well as the Python code where relevant to help you learn the underlying query structures. 
- -So please be aware that when we show a query such as this: - - - -This is what happens under the hood: - - - -We will show them in separate tabs going forward where applicable, like so: - - - - -```python -This tab will show Python code. -``` - - - - -```graphql -This tab will show GraphQL code. -``` - - - - -### Raw GraphQL queries - -The Weaviate Python client can run raw GraphQL queries. You can also use the [Weaviate Query app](/cloud/tools/query-tool). - -In Python, you can run a GraphQL query directly with: - - - - - - - -### Key takeaways - -- You have installed the Weaviate client library in your preferred environment. -- You have an OpenAI inference key. -- You have some way of running Python code. -- You know how to instantiate the Weaviate Python client, and run the shown example code. -- The Academy units will show Python code, but also raw GraphQL or REST snippets where applicable. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/010_ai_models_deep_dive/10_introduction.mdx b/docs/academy/theory/010_ai_models_deep_dive/10_introduction.mdx deleted file mode 100644 index 8a91de427..000000000 --- a/docs/academy/theory/010_ai_models_deep_dive/10_introduction.mdx +++ /dev/null @@ -1,99 +0,0 @@ ---- -title: Overview -description: An overview of AI models ---- - -## Examples of AI model use - -We know that AI models can tackle tasks that were previously seen as exclusively human activities. Let's take a few concrete examples. - -### Image generation - -AI models can generate images matching your text description, according to your imagination. - -
- -| Prompt | Generated image | -| --- | --- | -| `a sleek looking robot cowboy riding on a big, cute, cartoon unicorn, on the moon.` | ![a sleek looking robot cowboy riding on a big, cute, cartoon unicorn, on the moon.](./_img/10_generated_unicorn.png) | - -
Image generated by DALL-E, an AI model by OpenAI
-
- -### Language understanding - -They can also engage in conversations with us on matters big and small: - -
- -| A technical conversation with AI | A casual conversation with AI | -| --- | --- | -| ![Conversation with Claude about AI models](./_img/10_chat_example_1.png) | ![Conversation with Claude about unicorns](./_img/10_chat_example_2.png) | - -
Conversations with Claude, an AI model by Anthropic
-
- -### Code completion - -In fact, many of us already use them as a companion in our day-to-day workflow, even with tasks as complex as coding. - -
- -![Copilot examples](./_img/10_copilot_example.png) - -
Using Microsoft Copilot in VSCode
-
- -### Table: Types of AI models - -Here’s a broader overview of AI models, summarizing their inputs and output types, with concrete examples. - -
- -| Type | Inputs | Output format | Output significance | Example models / use | Example input | Example output | -| --- | --- | --- | --- | --- | --- | --- | -| Large (or vision) language model | Text (or text + image) | Text | Answers the input text | claude-3-7-sonnet, gemini-2.0 | Where is the capital of France? | The capital of France is Paris | -| Image generation model | Text | Image | Image of the input | Stable diffusion, Dall-E | Image of a cute red dog | ![Image of a dog](./_img/10_cute_dog.png) | -| Embedding model | Text | Array of numbers | Capture semantic meaning | Word2vec, sentence-transformers | Image of a cute red dog | [0.0149, 0.8932, …, 0.0589] | -| Image classifier | Image | Array of numbers | Likelihood of category | ResNet, disease detection models | ![Image of a dog](./_img/10_cute_dog.png) | [0.958, 0.042] | -| Regression model | Numbers | Number | Numerical prediction | Housing price prediction, Electricity demand forecasting | [4, 950, 1985, 2010, 10011, 0] | 2585913 | - -
Overview of AI models
-
- -## Do AI models think like humans? - -In this day and age, AI models indeed can perform all these amazing tasks (and more), many at a human level quality, and at breakneck speed. But AI models are very different to humans in one key way. - -AI *loves* statistics. Or, more accurately, it’s powered entirely by the magic of statistics. - -At its core, AI models rely on statistics to turn these different types of inputs into outputs. Before we discuss any of the internals, it may be useful to think of AI models’ jobs as doing (incredibly complex) pattern-matching. This is why some have even described large language models as [stochastic parrots](https://en.wikipedia.org/wiki/Stochastic_parrot). - -But there’s no doubt that they are very impressive tools. And to get the most out of these models, some understanding of their internals will be very useful. - -## What is an "AI model"? - -Once you begin to navigate the world of AI models, you will come across more than a few different jargon. Even at the very top level, you will hear about terms like “machine learning”, “deep learning” and “neural network”, as well as “AI models” which we’ve been using. - -So - what are the differences between these? Here’s one way to look at these terms: - -- **Artificial Intelligence (AI)** is a broad term, encompassing any technology that enables machines to mimic human intelligence. -- **Machine Learning (ML)** is a specific technique to build AI systems. ML systems are not rule-based. Instead, they learn from data to identify patterns which derives its predictive capabilities. -- **Deep Learning (DL)** is a subset of machine learning based on artificial neural networks with multiple (hence "deep") layers. Most recent AI models in areas like computer vision and natural language processing were driven by deep learning models. -- **Neural Networks (NN)** describes the architecture of deep learning models. 
They are made up of layers of interconnected nodes (neurons) that process information through the network, before producing an answer. - -As you can see, they vary in terms of technical specificity as well as depth. For our course, we will use “AI model” as an umbrella term for our course. Think of ML, DL and NN as specific tools with which AI functionality is provided. - -In our context, the most common AI model types to use are generative models and embedding models. Generative models help us to analyze and transform information, and embedding models help us to retrieve the best contextual information to analyze and transform. (As a note, these models will almost always be deep learning models.) - -Don’t let this possible simplicity fool you though. There is a lot of nuance even within these model types, and a lot of potential pitfalls as builders. - -This also means a lot of opportunity to get it right, and get them to work for you and your tasks. Soon, we will take a look at these model types (and some others that you may come across). - -But, there is also a lot that these models have in common. Let’s take a look at some of those aspects. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/010_ai_models_deep_dive/20_inside_ai_models.mdx b/docs/academy/theory/010_ai_models_deep_dive/20_inside_ai_models.mdx deleted file mode 100644 index b86ef2ec8..000000000 --- a/docs/academy/theory/010_ai_models_deep_dive/20_inside_ai_models.mdx +++ /dev/null @@ -1,143 +0,0 @@ ---- -title: AI models in general -description: A look inside AI models ---- - -import ThemedImage from '@theme/ThemedImage'; - -A big percentage of modern AI models are deep learning models. Many generative and embedding models fit into this category, as do many regression models or classification models. - -So, let’s take a look at some of their common features. 
- -## Deep learning models - -You may have seen a visual representation of a deep learning model like this. - - - -import NNBasicLight from './_img/nn_explained_01_basic.png'; -import NNBasicDark from './_img/nn_explained_01_basic_dark.png'; - - - -This might not look like much, but it’s actually quite an accurate representation of a deep learning model. Let’s dive into the details. - -If we were to add details to this diagram, it may look like this. The model takes some input (array `[x1, x2, x3]` in this example) and transforms it to an output (array `[y1, y2, y3]` in this example). - -import NNBasicLabellingLight from './_img/nn_explained_05_basic_labelling.png'; -import NNBasicLabellingDark from './_img/nn_explained_05_basic_labelling_dark.png'; - - - -This transformation is carried out by a set of “nodes”, typically called the “hidden layers”. - -## Inputs and outputs - -We mentioned above that the inputs and outputs can be in a variety of formats, and represent a range of different aspects. And that’s exactly the case here. - -While the example shows an array of three values as an input and the output, the inputs and outputs can comprise any length of values. - -A regression model may take inputs such as `[, , , , , , , ...]` and output a value in dollars. A classification model, on the other hand, might take in the same inputs but instead output an array of probabilities like `[0.05, 0.65, 0.20, 0.10]`, representing the likelihood of the house falling into different price brackets such as "budget," "mid-range," "luxury," or "premium luxury." - -import NNInputsOutputsLight from './_img/nn_explained_10_inputs_and_outputs.png'; -import NNInputsOutputsDark from './_img/nn_explained_10_inputs_and_outputs_dark.png'; - - - -One restriction of these models is that both inputs and outputs must be numerical. This is so that these inputs can be transformed into outputs. (We will discuss how to represent objects such as texts, or image inputs as numbers.) 
- -So, how do these models convert inputs into outputs, exactly? As it turns out, through a series of calculations such as those shown below. This diagram shows how the value of a particular node (`h11`, for node on row 1, column 1) is calculated. - -import NNNodeCalcsLight from './_img/nn_explained_20_node_calculation.png'; -import NNNodeCalcsDark from './_img/nn_explained_20_node_calculation_dark.png'; - - - -As you can see, the calculation is relatively simple. Some form of linear algebra is performed to calculate the value of the node from values of the connected nodes from the preceding layer. This is where the “weights” and “biases” are applied in carrying the values forward. - -Then, an “activation function” is applied to introduce non-linearity, allowing the network to learn complex patterns. This enables AI models to capture the sophisticated relationships in data that would not be possible only with combinations of linear layers. (In fact, without activation functions, the additional layers will simply be wasted!) - -And that’s really all a deep learning model does. Except - it does it at scale. - -## Parameters - -As you can see here, each node’s value is determined by the previous layer, and its weights and biases. Each of these weights and biases is called a parameter. Sum up all the parameters in a model, and we have the model’s parameter count, which is used as a shorthand for the model’s size. - -import NNTotalParamsLight from './_img/nn_explained_25_total_parameters.png'; -import NNTotalParamsDark from './_img/nn_explained_25_total_parameters_dark.png'; - - - -Large AI models have billions of these. The largest version of Meta’s Llama3 models has 405 billion parameters, and the original GPT-4 model was rumored to have ~1.8 trillion parameters. - -And that is how a model turns a bunch of inputs into outputs under-the-hood. 
- -import NNFinalOutputLight from './_img/nn_explained_30_final_output.png'; -import NNFinalOutputDark from './_img/nn_explained_30_final_output_dark.png'; - - - -As it turns out, once a model is given enough parameters, these parameters can be tuned to imbue these models with incredible power. - -:::info So, are larger models always better? - -Not quite. Even if you weren’t building and training these models, using, larger models involve overhead. Larger models require more resources (memory) to run, slower to generate outputs, and are therefore more expensive. While larger models may be more capable on average, it may be possible to find smaller models that work just as well for a given purpose. -
- -We will discuss this a lot more in a later section related to AI model selection. - -::: - -Now, it may seem somewhat intuitive that a model can take a series of numbers describing features of a house, and predict its value (regression) or the probabilities of it belonging to a particular class. - -But how does this concept lead to generative AI models such as `gpt` models, or embedding models? It’s actually more similar than you might think. - -Let’s take a look at this in the next section. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/010_ai_models_deep_dive/30_generative_models.mdx b/docs/academy/theory/010_ai_models_deep_dive/30_generative_models.mdx deleted file mode 100644 index 4694b100f..000000000 --- a/docs/academy/theory/010_ai_models_deep_dive/30_generative_models.mdx +++ /dev/null @@ -1,134 +0,0 @@ ---- -title: Generative AI models -description: A look inside generative AI models ---- - -import ThemedImage from '@theme/ThemedImage'; - -:::info Generative models and Weaviate - -Weaviate integrates with many generative models so that they can be used conveniently in conjunction with the stored data, such as for generating summaries from retrieved data. In many ways, much of the details you see in this module are abstracted away when you use Weaviate. - -::: - -Many of us may use AI models through a chat-type application such as `Claude.ai`, `ChatGPT` or `Meta AI`. These applications are also powered by deep learning models. Models such as `claude-3.5`, `gpt-4` and `llama-3` models respectively. - -This type of model is generally called “generative AI”, or a “generative model”. - -## How generative AI models work - -In the previous section, we saw how deep learning models work. 
Here is a reminder: - -import NNInputsOutputsLight from './_img/nn_explained_10_inputs_and_outputs.png'; -import NNInputsOutputsDark from './_img/nn_explained_10_inputs_and_outputs_dark.png'; - - - -Deep learning models take a numerical input, transform it through its “hidden layers”, and produce a numerical output. But generative models turn input texts into output texts - seemingly a very different task. - -There are two very important factors that allow generative models to do this. One is tokenization, and the other is auto-regressive generation. - -### Tokenization - -Inputs to deep learning models are numerical while generative models take text inputs. Tokenization is the first step in converting text into a series of numerical values. - -Take an input such as `Explain tokenization like I am five.`. A generative model may “tokenize” this input like so: - -import Tokenization from './_img/tokenization.png'; - -Neural Network Node Calculations - -Each part is a unique “token”, which can be thought as a minimum unit of understanding for the model. The model can then replace each token with a unique integer. Once it adds a special token at the start and the end of the input, that sequence becomes the model’s input. The input might be: - -`[200264, 176289, 6602, 2860, 1299, 357, 939, 6468, 13, 200265]` - -Keep in mind that each model will have its own tokenizer that behaves differently. - -### Auto-regressive generation - -At the opposite end, a generative model is configured to output one token. The architecture of a generative model thus looks like this: - -import NNGenerativeModels from './_img/nn_explained_40_generative_models.png'; -import NNGenerativeModelsDark from './_img/nn_explained_40_generative_models_dark.png'; - - - -But, they don’t seem that way when we interact with them. Generative models can output texts of varying lengths. - -How do generative models go from producing one token at a time, to seemingly producing a coherent sequence of text? 
As it turns out, most (text) generative models achieve this like how many of us perform difficult tasks - by putting one ~~foot~~ token in front of the other, as shown in the next diagram. - -import NNAutoRegressiveModels from './_img/nn_explained_45_auto_regressive.png'; -import NNAutoRegressiveModelsDark from './_img/nn_explained_45_auto_regressive_dark.png'; - - - -In a generative model, each generated token becomes a part of the input sequence, while the previous input sequence moves down by one spot (and the last one dropped). This continues onwards, until the model generates a special token indicating that it is finished generating. - -This is how a model like `gpt-4` can generate coherent answers to a question. Each token is based on the user input and all of the tokens generated up to that point. - -So, when a model is said to have an allowable “context length”, it refers to the maximum length of input tokens (`[x1, x2, …, xn]`). And in many cases, the output length is also limited. This is so that the output tokens don’t cause the model to “forget” the initial input, as the autoregression part of the input grows larger. - -### Why use generative models? - -By performing this simple task of generating one token in front of the other, generative models can perform a remarkable set of tasks at a very high level. - -We’ve seen generative models demonstrate generalized capabilities across domains, such as: - -- Write and debug code in multiple programming languages -- Converse with nuanced understanding -- Summarize complex documents or translate between languages -- Explain difficult concepts at various levels of complexity -- Create content from stories to marketing copy - -Additionally, generative models can perform "zero-shot" or "few-shot" learning, meaning they can tackle tasks they weren't explicitly trained for with minimal or no examples. 
-
-This allows generative models to be used in place of specialized models in some cases, short-cutting lengthy model development processes.
-
-Somewhat unbelievably, all these abilities are derived from the single goal that a generative model has - to predict the next token.
-
-As users of generative models, it is important to keep in mind that the model is simply performing pattern-matching, regardless of how impressive or complex it is, using its billions of parameters. This can help us to not imbue models with unwarranted magical qualities and to treat its outputs with healthy skepticism.
-
-Those are the basics of a generative model. We will learn more about these models later, such as discussing examples of these models, as well as model evaluation and selection. For now, let’s move on to learn about a different type of model, called an embedding model.
-
-:::tip Advanced topics
-
-In this section, we largely talked about aspects related to large language models, where a text input is used to generate text outputs.
-
- -Many modern generative models are actually multi-modal. There are "vision language models" that can take not just text, but also images as parts of their inputs. And other types of generative models such as Stable Diffusion, or Sora can produce visual outputs, such as images or videos. -
- -These models fundamentally work similarly to what was described above, but with modality-specific aspects that allow them to go even further. - -::: - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/010_ai_models_deep_dive/40_embedding_models.mdx b/docs/academy/theory/010_ai_models_deep_dive/40_embedding_models.mdx deleted file mode 100644 index 460dfe902..000000000 --- a/docs/academy/theory/010_ai_models_deep_dive/40_embedding_models.mdx +++ /dev/null @@ -1,136 +0,0 @@ ---- -title: Embedding models -description: A look inside embedding models ---- - -import ThemedImage from '@theme/ThemedImage'; - -:::info Embedding models and Weaviate - -Embeddings are critical to vector databases such as Weaviate, as they enable vector/semantic search. Vector databases such as Weaviate allow users to store and search through millions, or even billions, of these embeddings with ease. - -::: - -If generative models are the celebrities of the AI world, embedding models may be its plumbing; they’re not glamorous, but critical parts of the infrastructure. - -Let's take a look at what embedding models are, and why they are so important. - -## How embedding models work - -An embedding model seeks to capture a “meaning” of the input provided to it. The output embedding can then be used later for tasks such as classification, clustering, or most commonly, information retrieval. - -Commonly used embedding models include Cohere’s `embed-multilingual-v3.0`, OpenAI’s `text-embedding-3-large` and Snowflake’s `snowflake-arctic-embed-l-v2.0`. - -These names tend to be more generic and descriptive. Combined with their less glamorous status, these models may not be as well-known. - -But they are no less interesting than generative models, and often share many similarities with them. For example, both of them take a text input and convert it to a numerical format with a tokenizer. 
One key difference is that where a generative model outputs a token at a time, an embedding model outputs a fixed length (or shape) of numbers. - -import NNEmbeddingModels from './_img/nn_explained_50_embedding_models.png'; -import NNEmbeddingModelsDark from './_img/nn_explained_50_embedding_models_dark.png'; - - - -The concept of numerically representing a meaning may be foreign at first. Let's take a step back and look at a simpler example. - -### An analogy - color encoding - -An analog for this approach can be found in how we represent colors as numbers - whether it be the RGB, CMYK or HSL system. - -Each system can represent any color as a series of numbers. Take the official web color definition of “red” as an example. In RGB it is `[255, 0, 0]`, in CMYK it is `[0, 100, 100, 0]`, and in HSL it is `[0, 100%, 50%]`. - -In other words, each system is a standardized method of representing a color as a set of numbers. - -Embedding models work similarly. For example, the phrase “You’re a wizard, Harry.”, may be represented as: - -`[0.021, -0.103, 0.036, 0.088, -0.022, ..., 0.056]` - -Where the actual sequence length may be quite large, such as 256, 1024, or 1536 values typically. - -To be clear, modern embedding models are far more complex than algorithms that convert colors to RGB values. However, the principle is the same - each system consistently converts its inputs into a sequence of numbers. - -How is this useful? As it turns out, for a variety of tasks. - -## Why use embedding models? - -Earlier, we likened embedding models to numerical systems like RGB that can encode color. The key benefit of embedding models is similar to that of color encoding systems; they enable meaningful comparisons of the source object. - -Going back to RGB colors, “crimson” is `[220, 20, 60]`. You can see that it is quite similar to red’s RGB value of `[255, 0, 0]`, and very different to, say, the RGB value of “aqua”, which is `[0, 255, 255]`. 
- -| | R | G | B | -| --- | --- | --- | --- | -| Red | 255 | 0 | 0 | -| Crimson | 220 | 20 | 60 | -| Aqua | 0 | 255 | 255 | - -In fact, we can quantify the similarity to a single number. We can use a commonly used metric called a “cosine” similarity. Here is an example implementation using Python: - -```python -import numpy as np - -def cosine_similarity(a: list, b: list) -> float: - # Calculate the cosine similarity between two input lists - return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b)) -``` - -We can put the results of our comparison into a table, into what is called a similarity matrix. Here, each table cell value is the similarity between its corresponding row and the column. - -| | Red | Crimson | Aqua | -| --- | --- | --- | --- | -| Red | 1 | 0.961 | 0 | -| Crimson | 0.961 | 1 | 0.247 | -| Aqua | 0 | 0.247 | 1 | - -The key takeaway from this demonstration is that the system allows us to compare how similar each color is. Conversely, we now have a way in which we can identify the most similar color to a given color in a bag of colors. - -## Applications of embedding models - -Going back to our embedding models, the value of embedding models is that it allows us to identify the object with the most similar meaning to a given object, in a set of objects. - -Take these three pieces of text: - -``` -1. "You're not wizened, Harry." -2. "Harry can wield magic." -3. "Ron is not a great driver." -``` - -Which one might be most similar to `"You're a wizard, Harry."`? - -Most of us would probably answer 2. But why? And how would you get a program to answer in the same way? Note that 2 only includes one overlapping word with the query. - -That’s the task that embeddings enable. Using embeddings and cosine similarity, we see that: - -| | Rank | Cosine distance | -| --- | --- | --- | -| Harry can wield magic. | 1 | 0.238 | -| You're not wizened, Harry. | 2 | 0.274 | -| Ron is not a great driver. 
| 3 | 0.803 | - -This concept of similarity is used in semantic search. In modern AI systems, semantic search is a critical component of retrieval augmented generation (RAG), helping to complement generative systems by providing them with accurate, up-to-date context to work with. - -Applications of embeddings go even further. Embeddings are used in other AI systems such as recommenders, clustering, classification, and more. - -:::tip Advanced topics - -In this section, we largely talked about aspects related to text embedding models, where a text input is used to generate a vector embedding. -
- -Just like generative models, the world of embedding models is very big and interesting. Multi-modal embedding models can take various input types and produce compatible embeddings in the same space. And modern embeddings may come in different formats, using techniques such as multi-vector embeddings (e.g. ColBERT) or adaptive-length embeddings. -
- -We may touch on them later on, when we get to further in-depth discussions about specific modalities or model selection. - -::: - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/010_ai_models_deep_dive/50_model_training.mdx b/docs/academy/theory/010_ai_models_deep_dive/50_model_training.mdx deleted file mode 100644 index 2ed22595c..000000000 --- a/docs/academy/theory/010_ai_models_deep_dive/50_model_training.mdx +++ /dev/null @@ -1,94 +0,0 @@ ---- -title: "Briefly: Model training" -description: A very brief introduction to how AI models are trained ---- - -import ThemedImage from '@theme/ThemedImage'; - -In this section, let's very briefly discuss how AI models are trained. This is a critical part of understanding how AI models work, and how they can be used. We will not go into the details of training, but rather provide a high-level overview to help you understand the process. - -## What is model training? - -So far, we’ve learned that AI models, or more specifically, deep learning models, work by applying a massive number of transformations to an input to produce an output. These transformations are primarily applied through `weights`, `biases` and `activation functions`, as shown below in an example calculation for a node. - -import NNNodeCalcsLight from './_img/nn_explained_20_node_calculation.png'; -import NNNodeCalcsDark from './_img/nn_explained_20_node_calculation_dark.png'; - - - -We also learned that the `weights` and `biases` are collectively called `parameters` of a deep learning model. As the name indicates, `parameters` define the behavior of a deep learning model (in contrast to the architecture of a deep learning model). - -:::info - -*Parameter*: n. 
an arbitrary constant whose value characterizes a member of a system - -Source: Merriam-Webster Dictionary - -::: - -Two deep learning models, even with the same architecture, can have different parameters to perform a different job. - -Given that these models commonly have millions, or billions, of parameters, it is basically impossible to hard-code these parameters to achieve the desired goal. Instead, these parameters are “trained” numerically, which is what leads them to be called “deep *learning*” models. - -## How are models trained? - -At a high level, the training process is just like that of any other machine learning model. To a layperson, it is similar to “goal seek” in Microsoft Excel, or even more basic, [the “hot and cold” game](https://en.wikipedia.org/wiki/Hunt_the_thimble). - -In deep learning, each training “iteration” is used to compare the model’s output to the “ideal” output (the function that measures this difference is called a “loss function”). Then, a lot of fancy math is used to update the parameters such that the model gets a little bit better at its job. - -:::note Why gloss over the “fancy math”? - -We used the term “fancy math” to whimsically indicate that there is a lot of complex mathematics as well as computation involved. This is, of course, an extremely interesting and important area of AI. -
- -The key here is to use “gradient descent”, updating parameters over time based on the gradient (derivative) of the loss function. This technique calculates the direction in which parameters should change to reduce the model's error, and then makes small adjustments in that direction. -
- -But it is slightly outside the scope of our discussion here, and perhaps not as critical for those looking to simply use models. If you are interested in learning about model training in detail, there are a lot of great resources out there. -
- -Here are a couple of our favorites: -- [Video: Let’s reproduce GPT-2 by Andrej Karpathy](https://www.youtube.com/watch?v=l8pRSuU81PU) -- [Fine-tune large language models: Hugging Face](https://huggingface.co/learn/nlp-course/en/chapter11/1) - -::: - - -import NNBackpropagation from './_img/nn_explained_55_backprop.png'; -import NNBackpropagationDark from './_img/nn_explained_55_backprop_dark.png'; - - - -You will see that this process works backwards, propagating the learnings from the model output back to the parameters. That’s why this is called “back propagation”, or “backprop”. - -In a model training process, this type of update would be one “iteration”. Many, many, such iterations are carried out throughout the lifecycle of model training. It is the combination of the large size of these models, and the number of iterations required for training that makes it so expensive, and time-consuming, to train these models. - -## Models: Build vs. buy - -Training a large language model from scratch can take months and cost millions of dollars in computing resources. Modern GPUs are uniquely suited for this application, which is why they have been in high demand in the recent past. - -Modern AI models are trained in multiple stages, called “pre-training” and “fine-tuning”. A simplified explanation is that pre-training gives a model its general capabilities, while fine-tuning adapts it for specific domain or application capabilities. - -In a majority of cases, users will be selecting models off-the-shelf, although fine-tuning a model can be a good solution for the right use case, as long as it is done with the sufficient expertise and goal in mind. We will discuss this more in our later section on model evaluation and selection. - -Next, let’s start to get into how these models can actually be used. We will look at how to access such models, such as through commercial inference providers or by performing local inference. 
And we’ll review their pros and cons, as well as begin to interpret details of various models by reading model cards. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/010_ai_models_deep_dive/60_using_models.mdx b/docs/academy/theory/010_ai_models_deep_dive/60_using_models.mdx deleted file mode 100644 index 3d06186ea..000000000 --- a/docs/academy/theory/010_ai_models_deep_dive/60_using_models.mdx +++ /dev/null @@ -1,500 +0,0 @@ ---- -title: Using AI models -description: Examples of using AI models for inference ---- - -import ThemedImage from '@theme/ThemedImage'; - -Up until this point of the module, we have been largely discussing theoretical aspects of AI models. Hopefully, this has helped you to develop, or solidify, a foundational understanding of these models. - -But we have not forgotten that AI models are more than just impressive scientific advancements. They are tools for us to use so that we can be more productive. Let’s now take that understanding and start to translate it to practical usage. - -In this section, we begin to show examples of AI model usage. Once a model is built, the step of using, or running, a model to produce outputs is also called “performing inference”. - -## Inference: The basics - -import NNInference from './_img/nn_explained_80_inference.png'; -import NNInferenceDark from './_img/nn_explained_80_inference_dark.png'; - - - -The available model inference options can be quite extensive, from the inference modality to the model provider and model itself. As a result, the permutations of possible decisions can easily be overwhelming. - -So, this section is designed to give you an organized overview of popular choices. - -First we will cover different ways or modes of using these models - whether to use an inference provider, or perform local inference. 
Then, within each mode, we will show some examples of performing inference through popular providers or software libraries. - -This should remove some of the intimidation and mystique from the range of options, and set the groundwork for our later discussions on model evaluation and selection. - -:::note This section does not use Weaviate. - -You may already know that Weaviate integrates with model providers to make inference easier. In this section, however, we will access the model ourselves. This will help you to understand what is going on under-the-hood when Weaviate performs inference on your behalf later on. - -::: - -## Inference via service providers - -The lowest-friction method for accessing modern AI models is to use a web API provided by an inference service provider. - -Thanks to the explosion of popularity for certain types of AI models, there are numerous inference providers (and APIs) available that almost anyone can sign up and use. - -Popular inference providers include Anthropic, AWS, Cohere, Google, Microsoft, OpenAI, and many, many more. - -Not all models are available at all inference providers. Some inference providers also develop their own, proprietary models, while others specialize in simply providing an inference service. - -Let’s take a look at examples of performing inference through Cohere. - -:::info Will this cost me anything? - -At the time of writing, Cohere offered API access that was free of charge, with caveats / limitations. Please review the provider’s latest terms and conditions for details. We note that if you do use paid services, the volume of inference performed for these sections is very small, and costs will be relatively small (under US$1). - -::: - -### Preparation - -For this section, we will use Cohere. [Cohere](https://cohere.com/) develops a range of generative, embedding and re-ranker models. Cohere’s models are available through other inference services, as well as Cohere itself. 
Here, we will use the Cohere API directly. - -:::info -At the time of writing, Cohere offered a trial key which is available free of charge. -::: - -To use the Cohere API, [sign up for an account](https://cohere.com/), and then navigate to their dashboard. There, you should be able to navigate to a section for `API keys`, where you can manage your API keys. - -Create a trial key, which should be sufficient for this usage. Set the API key as an environment variable, with the name `COHERE_API_KEY`. - -Install the Cohere SDK in your preferred environment with your preferred package manager. For example: - -```bash -pip install cohere -``` - -### Embedding model usage - -The following snippet will convert a series of text snippets (`source_texts`) into embeddings: - -```python -import cohere -import os - -cohere_api_key = os.getenv("COHERE_API_KEY") -co = cohere.ClientV2(api_key=cohere_api_key) - -source_texts = [ - "You're a wizard, Harry.", - "Space, the final frontier.", - "I'm going to make him an offer he can't refuse.", -] - -response = co.embed( - texts=source_texts, - model="embed-english-light-v3.0", - input_type="search_document", - embedding_types=["float"], -) - -source_embeddings = [] -for e in response.embeddings.float_: - print(len(e)) # This will be the length of the embedding vector - print(e[:5]) # This will print the first 5 elements of the embedding vector - source_embeddings.append(e) # Save the embedding for later use -``` - -Note that for saving source texts to search through later on, we specify the input type `search_document` here. - -This should output something like this (note the exact numbers may vary): - -``` -384 -[0.024459839, 0.039001465, -0.013053894, 0.016342163, -0.049926758] -384 -[-0.0051002502, 0.017578125, -0.0256958, 0.023513794, 0.018493652] -384 -[-0.076660156, 0.04244995, -0.07366943, 0.0019054413, -0.007736206] -``` - -Printing for each vector its length (dimensionality) and the first few dimensions. 
- -Then, to find the piece of text that best matches a query (let’s say: `intergalactic voyage`), we would first embed the query text: - -```python -# Get the query embedding: -query_text = "Intergalactic voyage" - -response = co.embed( - texts=[query_text], - model="embed-english-light-v3.0", - input_type="search_query", - embedding_types=["float"], -) - -query_embedding = response.embeddings.float_[0] - -print(len(query_embedding)) -print(query_embedding[:5]) -``` - -This should produce: - -``` -384 -[-0.007019043, -0.097839355, 0.023117065, 0.0049324036, 0.047027588] -``` - -Indicating that our query vector is the same dimensionality as the document vector, and that each dimension has a similar format. - -To perform a vector search: - -```python -# Find the most similar source text to the query: -import numpy as np - -# Calculate the dot product between the query embedding and each source embedding -dot_products = [np.dot(query_embedding, e) for e in source_embeddings] - -# Find the index of the maximum dot product -most_similar_index = np.argmax(dot_products) - -# Get the most similar source text -most_similar_text = source_texts[most_similar_index] - -print(f"The most similar text to '{query_text}' is:") -print(most_similar_text) -``` - -This should produce the output: - -``` -The most similar text to 'Intergalactic voyage' is: -Space, the final frontier. -``` - -Hopefully, you will agree that this makes good intuitive sense. If you are curious, try varying the source texts, and/or the query texts. - -The embedding model does its best to capture meaning, but it isn’t perfect. A particular embedding model will work better with particular domains or languages. - -### Generative model usage - -Now, let’s use one of Cohere’s large language models. 
We will ask it to explain how a large language model works: - -```python -import cohere -import os - -cohere_api_key = os.getenv("COHERE_API_KEY") -co = cohere.ClientV2(api_key=cohere_api_key) - -messages = [ - { - "role": "user", - "content": "Hi there. Please explain how language models work, in just a sentence or two.", - } -] - -response = co.chat( - model="command-r-plus", - messages=messages, -) - -print(response.message.content[0].text) - -``` - -The response may look something like this (note the exact output may vary): - -``` -Language models are artificial intelligence systems that generate and understand human language by analyzing vast amounts of text data and learning patterns, structures, and context to create responses or translations. These models use machine learning algorithms to create statistical representations of language, enabling them to produce human-like text output. -``` - -If you’ve seen a web interface such as Claude AI or ChatGPT, you would be familiar with multi-turn conversations. - -In an API, you can achieve the same result by simply providing the preceding conversations to the LLM: - -```python -import cohere -import os - -cohere_api_key = os.getenv("COHERE_API_KEY") -co = cohere.ClientV2(api_key=cohere_api_key) - -messages = [ - { - "role": "user", - "content": "Hi there. Please explain how language models work, in just a sentence or two.", - } -] - -# Initial response from the model -response = co.chat( - model="command-r-plus", - messages=messages, -) - -# Append the initial response to the messages -messages.append( - { - "role": "assistant", - "content": response.message.content[0].text, - } -) - -# Provide a follow-up prompt -messages.append( - { - "role": "user", - "content": "Ah, I see. 
Now, can you write that in a Haiku?", - } -) - -response = co.chat( - model="command-r-plus", - messages=messages, -) - -# This response will take both the initial and follow-up prompts into account -print(response.message.content[0].text) -``` - -The response may look like this: - -``` -Language models, oh -Patterns and words, they dance -New text, probabilities. -``` - -Notice that because the entire message history was included, the language model correctly responded, using the message history as context. - -This is quite similar to what happens in applications such as Claude AI or ChatGPT. As you type in your answers, the entire message history is being used to perform model inference. - -You’ve now seen how model inference works using Cohere’s web-based APIs. In this pattern, the models are hosted online and run remotely. Next, we’ll take a look at an example where we run these models locally. - -## Local inference - -In many cases, it may be desirable or even required to perform AI model inference using a local (or on-premise) model. - -This may be brought on by a variety of reasons. You may wish (or need) to keep the data local (e.g. compliance or security), or have a custom-trained proprietary model. Or, the economics may be preferable for local inference over commercial inference APIs. - -While there are arguably fewer options for local inference than those offered by inference providers, the range of choices is still quite wide. There are huge number of publicly available models as well as software libraries to make the process easier. As well as general deep learning libraries such as PyTorch or TensorFlow, libraries such as Hugging Face Transformers, Ollama and ONNX Runtime make it easier to perform local inference. In the case of Ollama and ONNX Runtime, at reasonable speeds without any hardware (GPU / TPU) acceleration. - -Let’s take a look at examples of performing inference through Ollama. 
- -:::info Model licenses - -Just like any other product, AI models often come with a particular license that details what you can and cannot do. - -When it comes to publicly available models, keep in mind that not all of them allow commercial usage. Consult each model’s license to evaluate for yourself whether it is suitable for your use case. - -::: - -### Preparation - -For this section, we will use Ollama. -[Ollama](https://ollama.com/) is an open-source framework for running and deploying AI models locally. It provides an easy way to download, set up, and interact with a variety of open-source models like Llama, Mistral, and Snowflake embedding models. Ollama offers both a command-line interface and a REST API, as well as programming language-specific SDKs. - -:::info System requirements - -We will be performing local inference in this section. Even though we will use relatively small models, these AI models require somewhat significant system resources. We recommend using a modern computer, with at least 16 GB of RAM. - -A GPU is not required. - -::: - -To use Ollama, go to the site and follow the download and installation instructions. - -Then, pull the required models. We will use the 1 billion parameter Gemma3 generative model, and the 110 million parameter Snowflake Arctic embedding model. - -Once you have Ollama installed, pull the models with: - -```bash -ollama pull gemma3:1b -ollama pull snowflake-arctic-embed:110m -``` - -Now, check that the models are loaded by running: - -```bash -ollama list -``` - -The resulting output should include the `gemma3:1b` and `snowflake-arctic-embed:110m` models. - -Install the Ollama Python library in your preferred environment with your preferred package manager. 
For example: - -```bash -pip install ollama -``` - -### Embedding model usage - -The following snippet will convert a series of text snippets (`source_texts`) into embeddings: - -```python -import ollama - -source_texts = [ - "You're a wizard, Harry.", - "Space, the final frontier.", - "I'm going to make him an offer he can't refuse.", -] - -response = ollama.embed(model='snowflake-arctic-embed:110m', input=source_texts) - -source_embeddings = [] -for e in response.embeddings: - print(len(e)) # This will be the length of the embedding vector - print(e[:5]) # This will print the first 5 elements of the embedding vector - source_embeddings.append(e) # Save the embedding for later use -``` - -This should output something like this (note the exact numbers may vary): - -``` -768 -[-0.030614788, 0.01759585, -0.001181114, 0.025152, 0.005875709] -768 -[-0.039889574, 0.05197108, 0.036466435, 0.012909834, 0.012069418] -768 -[-0.04942698, 0.05466185, -0.007884168, -0.00252788, -0.0025294009] -``` - -Printing for each vector its length (dimensionality) and the first few dimensions. (Note the number of dimensions here are different to the Cohere example, as the two models vary in their *dimensionality*.) - -Let’s follow the same steps. First, we find the piece of text that best matches a query (let’s say: `intergalactic voyage`), we would first embed the query text: - -```python -# Get the query embedding: -query_text = "Intergalactic voyage" - -response = ollama.embed(model='snowflake-arctic-embed:110m', input=query_text) - -query_embedding = response.embeddings[0] - -print(len(query_embedding)) -print(query_embedding[:5]) -``` - -Producing a result such as: - -``` -768 -[-0.043455746, 0.05260946, 0.025877617, -0.017234074, 0.027434561] -``` - -Again, here our query vector is the same dimensionality as the document vector, and that each dimension has a similar format. 
- -To perform a vector search: - -```python -# Find the most similar source text to the query: -import numpy as np - -# Calculate the dot product between the query embedding and each source embedding -dot_products = [np.dot(query_embedding, e) for e in source_embeddings] - -# Find the index of the maximum dot product -most_similar_index = np.argmax(dot_products) - -# Get the most similar source text -most_similar_text = source_texts[most_similar_index] - -print(f"The most similar text to '{query_text}' is:") -print(most_similar_text) -``` - -Note the snippet to compare embeddings is identical to that used in the Cohere example. This should produce the output: - -``` -The most similar text to 'Intergalactic voyage' is: -Space, the final frontier. -``` - -Happily for us, the Snowflake model also identified the same space-related passage as the closest one out of the candidates. - -### Generative model usage - -Now, let’s move to try using a large language model with Ollama, using the `gemma3:1b` model. We will once again ask to explain how a large language model works: - -```python -from ollama import chat -from ollama import ChatResponse - -messages = [ - { - "role": "user", - "content": "Hi there. Please explain how language models work, in just a sentence or two.", - } -] - -response: ChatResponse = chat(model='gemma3:1b', messages=messages) - -print(response.message.content) -``` - -The response may look something like this (note the exact output may vary): - -``` -Language models, like me, are trained on massive amounts of text data to predict the next word in a sequence, essentially learning patterns and relationships within language to generate text that seems coherent and relevant. -``` - -As before, we can perform a multi-turn conversation: - -```python -from ollama import chat -from ollama import ChatResponse - -messages = [ - { - "role": "user", - "content": "Hi there. 
Please explain how language models work, in just a sentence or two.", - } -] - -# Initial response from the model -response: ChatResponse = chat(model='gemma3:1b', messages=messages) - -# Append the initial response to the messages -messages.append( - { - "role": "assistant", - "content": response.message.content, - } -) - -# Provide a follow-up prompt -messages.append( - { - "role": "user", - "content": "Ah, I see. Now, can you write that in a Haiku?", - } -) - -response: ChatResponse = chat(model='gemma3:1b', messages=messages) - -# This response will take both the initial and follow-up prompts into account -print(response.message.content) -``` - -The response (for me) looked like this: - -``` -Words flow, patterns bloom, -Digital mind learns to speak, -Meaning takes new form. -``` - -Although the specific response and syntax were different, the general workflow and principles were the same between using an inference provider and a local model. - -So the question is - how do you go about making these choices? - -As alluded to earlier, this is an important, but huge topic, which we will tackle later. But we will tackle some of the factors in the next section to get you started. We’ll look at how to broadly choose between an inference provider and a local model, as well as how to read model cards. 
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/010_ai_models_deep_dive/70_strategy.mdx b/docs/academy/theory/010_ai_models_deep_dive/70_strategy.mdx deleted file mode 100644 index 71a9e704f..000000000 --- a/docs/academy/theory/010_ai_models_deep_dive/70_strategy.mdx +++ /dev/null @@ -1,131 +0,0 @@ ---- -title: Navigating the landscape -description: How to navigate the vast, complex world of AI models ---- - -## Access mode selection - -When it comes to deploying AI models in your applications, choosing between commercial inference providers and local inference is a good step to narrow down your range of choices. - -Each approach has distinct advantages and trade-offs that should be considered based on your specific use case. - -### Inference service providers - -Examples: AWS, Cohere, Google, Microsoft Azure, OpenAI, etc. - -**Advantages:** - -- No infrastructure management or hardware investment required -- Access to proprietary, provider-specific models - - Regular model updates & improvements -- Automatic scaling to handle varying workloads - -**Disadvantages:** - -- Generally higher costs than local or self-managed inference -- Data privacy considerations (data leaves your environment) -- Dependency on third-party service availability - - What happens if the provider ceases to operate -- Limited customization options - -### Local Inference - -Example: Hugging Face Transformers / Accelerate, llama.cpp, Ollama, PyTorch Serve, TensorFlow Serving - -**Advantages:** - -- Control data transmissions -- No internet dependency for inference operations -- Ability to deploy custom trained models - -**Disadvantages:** - -- Higher upfront costs for hardware procurement -- Unlikely to be able to access the latest, most powerful models -- Responsibility for model updates and maintenance -- Potential performance constraints based on available hardware - -### Decision Factors - -Consider the 
following questions when making your selection: - -1. **Data sensitivity**: Does your application process confidential or regulated data? If your data cannot leave your premises, this may limit your options to particular inference providers, or local models. -2. **Inference volume**: How many requests do you expect to process daily/monthly? For many, the up-front costs for local models may be prohibitive. -3. **Latency requirements**: How time-sensitive are your model responses? If this is high, it may preclude many small-scale users from using local models. -4. **Budget constraints**: Is your priority upfront savings or long-term cost optimization? -5. **Technical resources**: Do you have the expertise to manage local model deployment, and potentially training? - -Note that there isn’t a one-size-fits-all solution. - -However, it may be that for getting started, using a commercial inference provider may be an easier, lower-friction choice. - -## How to read model cards - -Model “cards” are to AI models what product labels or specification sheets are to regular products. - -Model cards are supplied by the model provider to help you understand what the model is and how to best use it. - -**Examples of model cards** - -Model cards come in a variety of formats. 
See, for example, the following cards for embedding models: - -- [https://huggingface.co/Cohere/Cohere-embed-english-v3.0](https://huggingface.co/Cohere/Cohere-embed-english-v3.0) -- [https://docs.cohere.com/v2/docs/cohere-embed](https://docs.cohere.com/v2/docs/cohere-embed) -- [https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0](https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0) - -And some cards for generative AI models: - -- [https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct](https://huggingface.co/meta-llama/Llama-3.3-70B-Instruct) -- [https://ollama.com/library/llama3.3:70b](https://ollama.com/library/llama3.3:70b) -- [https://ai.google.dev/gemma/docs/core/model_card_3](https://ai.google.dev/gemma/docs/core/model_card_3) -- [https://build.nvidia.com/microsoft/phi-4-multimodal-instruct/modelcard](https://build.nvidia.com/microsoft/phi-4-multimodal-instruct/modelcard) - -Even from this small selection, you can see that these cards vary according to the hosting site and the model provider. This volume of information can be overwhelming, especially at first. - -import ModelCards from './_img/model_cards.png'; - -Model Cards - -We will get into these in more detail in later courses. But here are some key parameters to look out for. - -1. **Basic model information** - - Model name and version - - Model type (generative, embedding, etc.) - - Model size (parameter count) and architecture - - Training data overview - - Costs, if accessed through an API, or hardware requirements -2. **Technical specifications** - - Dimensionality (for embedding models) - - Context length (for generative models) - - Supported languages or modalities -3. **Performance metrics** - - Benchmark results - - Known strengths and limitations - - Downstream performance -4. **Usage information** - - Intended use cases - - Implementation guidelines or code snippets -5. 
**Legal and ethical considerations** - - License type and usage restrictions - - Potential biases or fairness considerations - -Just reviewing these parameters will take you a long way towards identifying models that will suit your needs. - -In the absence of other information, a good starting point would be to filter for: - -- The right model type -- Suitable modality, language and context length or dimensionality -- Ability to access model (inference provider / local inference) -- License suitability - -Then, select candidate models from a reputable model provider, or based on their benchmark performance. - -Ultimately you may wish to evaluate the model’s, and your application’s, performance yourself. - -But following these simple heuristics will get you a long way towards selecting a good baseline model or a set of baseline models. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/10_chat_example_1.png b/docs/academy/theory/010_ai_models_deep_dive/_img/10_chat_example_1.png deleted file mode 100644 index 4acea1134..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/10_chat_example_1.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/10_chat_example_2.png b/docs/academy/theory/010_ai_models_deep_dive/_img/10_chat_example_2.png deleted file mode 100644 index 827526443..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/10_chat_example_2.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/10_copilot_example.png b/docs/academy/theory/010_ai_models_deep_dive/_img/10_copilot_example.png deleted file mode 100644 index fbdd72aa8..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/10_copilot_example.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/10_cute_dog.png 
b/docs/academy/theory/010_ai_models_deep_dive/_img/10_cute_dog.png deleted file mode 100644 index b119a577a..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/10_cute_dog.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/10_generated_unicorn.png b/docs/academy/theory/010_ai_models_deep_dive/_img/10_generated_unicorn.png deleted file mode 100644 index f00a76cc1..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/10_generated_unicorn.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/model_cards.png b/docs/academy/theory/010_ai_models_deep_dive/_img/model_cards.png deleted file mode 100644 index 45ecf3d96..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/model_cards.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_01_basic.png b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_01_basic.png deleted file mode 100644 index 4c381868b..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_01_basic.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_01_basic_dark.png b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_01_basic_dark.png deleted file mode 100644 index 55031837e..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_01_basic_dark.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_05_basic_labelling.png b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_05_basic_labelling.png deleted file mode 100644 index 3a547fee7..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_05_basic_labelling.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_05_basic_labelling_dark.png 
b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_05_basic_labelling_dark.png deleted file mode 100644 index 8748a902c..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_05_basic_labelling_dark.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_10_inputs_and_outputs.png b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_10_inputs_and_outputs.png deleted file mode 100644 index 5af800cc0..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_10_inputs_and_outputs.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_10_inputs_and_outputs_dark.png b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_10_inputs_and_outputs_dark.png deleted file mode 100644 index 8da006b7d..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_10_inputs_and_outputs_dark.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_20_node_calculation.png b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_20_node_calculation.png deleted file mode 100644 index 408ff3307..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_20_node_calculation.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_20_node_calculation_dark.png b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_20_node_calculation_dark.png deleted file mode 100644 index 229748857..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_20_node_calculation_dark.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_25_total_parameters.png b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_25_total_parameters.png deleted file mode 100644 index 17d3fd818..000000000 
Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_25_total_parameters.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_25_total_parameters_dark.png b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_25_total_parameters_dark.png deleted file mode 100644 index f0760a239..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_25_total_parameters_dark.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_30_final_output.png b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_30_final_output.png deleted file mode 100644 index e0f9026cd..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_30_final_output.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_30_final_output_dark.png b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_30_final_output_dark.png deleted file mode 100644 index 146da2fb2..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_30_final_output_dark.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_40_generative_models.png b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_40_generative_models.png deleted file mode 100644 index 7f878f1b4..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_40_generative_models.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_40_generative_models_dark.png b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_40_generative_models_dark.png deleted file mode 100644 index 8a54645c0..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_40_generative_models_dark.png and /dev/null differ diff --git 
a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_45_auto_regressive.png b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_45_auto_regressive.png deleted file mode 100644 index 7a2ba4f09..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_45_auto_regressive.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_45_auto_regressive_dark.png b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_45_auto_regressive_dark.png deleted file mode 100644 index 86223a488..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_45_auto_regressive_dark.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_50_embedding_models.png b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_50_embedding_models.png deleted file mode 100644 index 244e3ec6f..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_50_embedding_models.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_50_embedding_models_dark.png b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_50_embedding_models_dark.png deleted file mode 100644 index 882af08c2..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_50_embedding_models_dark.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_55_backprop.png b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_55_backprop.png deleted file mode 100644 index 8fd9095fd..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_55_backprop.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_55_backprop_dark.png b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_55_backprop_dark.png deleted file mode 100644 
index a372dbc00..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_55_backprop_dark.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_80_inference.png b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_80_inference.png deleted file mode 100644 index 43b6927e4..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_80_inference.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_80_inference_dark.png b/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_80_inference_dark.png deleted file mode 100644 index eecc0f80d..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/nn_explained_80_inference_dark.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/_img/tokenization.png b/docs/academy/theory/010_ai_models_deep_dive/_img/tokenization.png deleted file mode 100644 index d2e2dbee7..000000000 Binary files a/docs/academy/theory/010_ai_models_deep_dive/_img/tokenization.png and /dev/null differ diff --git a/docs/academy/theory/010_ai_models_deep_dive/index.mdx b/docs/academy/theory/010_ai_models_deep_dive/index.mdx deleted file mode 100644 index 6447fdb99..000000000 --- a/docs/academy/theory/010_ai_models_deep_dive/index.mdx +++ /dev/null @@ -1,33 +0,0 @@ ---- -title: "10 AI models: A gentle deep dive" -description: A practical introduction to AI models for software engineers or AI builders. -sidebar_position: 10 # Like a subject number (e.g. CS101) ---- - -## Unit overview - - - -The world of AI models may be best described as a vast, tall tower of knowledge that is also rapidly expanding. None of you will be surprised to hear that this course will not be able to cover the field comprehensively. - -What we intend do, however, is to give you a detailed overview of things that matter for AI builders. 
- -### Prerequisites - -- None - -## Learning objectives - -import LearningGoalsExp from '/src/components/Academy/learningGoalsExp.mdx'; - - - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/101_hello_weaviate/10_intro_weaviate.mdx b/docs/academy/theory/101_hello_weaviate/10_intro_weaviate.mdx deleted file mode 100644 index 05871e614..000000000 --- a/docs/academy/theory/101_hello_weaviate/10_intro_weaviate.mdx +++ /dev/null @@ -1,173 +0,0 @@ ---- -title: Introduction to Weaviate -description: Introduction to Weaviate Theory ---- - -## What is Weaviate? - -import ReactPlayer from 'react-player/lazy' - - -
- -Weaviate is an open-source [vector database](https://weaviate.io/blog/what-is-a-vector-database). But what does that mean? Let's unpack it here. - -### Vector database - -Weaviate is a fantastic tool for retrieving the information you need, quickly and accurately. It does this by being an amazing **vector database**. - -You may be familiar with traditional databases such as relational databases that use SQL. A database can catalog, store and retrieve information. A **vector** database can carry out these tasks also, with the key difference being that they can perform these tasks based on similarity. - -#### How traditional searches work - -Imagine that you are searching a relational database containing articles on cities, to retrieve a list of "major" European cities. Using SQL, you might construct a query like this: - -```sql -SELECT city_name wiki_summary -FROM wiki_city -WHERE (wiki_summary LIKE '%major European city%' OR - wiki_summary LIKE '%important European city%' OR - wiki_summary LIKE '%prominent European city%' OR - wiki_summary LIKE '%leading European city%' OR - wiki_summary LIKE '%significant European city%' OR - wiki_summary LIKE '%top European city%' OR - wiki_summary LIKE '%influential European city%' OR - wiki_summary LIKE '%notable European city%') - (… and so on) -``` - -Which would return cities that contained any of these strings (`major`, `important`, `prominent`, ... etc) in the `wiki_summary` column. - -This works well in many circumstances. However, there are two significant limitations with this approach. - -#### Limitations of traditional search - -Using this type of search requires you to identify terms that *may* have been used to describe the concept, which is no easy feat. - -What's more, this doesn't solve the problem of how to rank the list of resulting objects. - -With the above search query, an entry merely containing a mention of a different European city (i.e. 
not very relevant) would be given equal weighting to an entry for Paris, or Rome, which would be highly relevant. - -A vector database makes this job simpler by enabling searches based on similarity. - -#### Examples of vector search - -Instead of searching for an exact match, you could perform a query to find objects that are "nearest" to "Major European city". - -What it would then return is a list of entries that are *ranked by their similarity* to the query. - -In other words, the results would reflect their similarity to the idea, or meaning, of "Major European city". - -What's more, Weaviate "indexes" the data based on their similarity, making this type of data retrieval lightning-fast. - -Weaviate can help you to do all this, and actually a lot more. Another way to think about Weaviate is that it supercharges the way you use information. - -:::info Vector vs semantic search -A vector search is also referred to as a "semantic search" because it returns results based on the similarity of meaning (therefore "semantic"). -::: - -### Open-source - -Weaviate is open-source. In other words, its [codebase is available online](https://github.com/weaviate/weaviate) for anyone to see and use[(1)](#-notes). - -And that is *the* codebase, regardless of how you use it. So whether you run Weaviate on your own computer, on a cloud computing environment, or through our managed service [Weaviate Cloud (WCD)](https://console.weaviate.cloud/), you are using the exact same technology. - -So, if you want, you can run Weaviate for free on your own device, or use our managed service for convenience. You can also take comfort in that you can see exactly what you are running, and be a part of the open-source community, as well as to shape its development. - -It also means that your knowledge about Weaviate is fungible, between local, cloud, or managed instances of Weaviate. 
So anything you learn here about Weaviate using WCD will be equally applicable to running it locally, and vice versa. 😉 - -### Information, made dynamic - -We are used to thinking of information as static, like a book. But with Weaviate and modern AI-driven language models, we can do much more than just retrieve static information but easily build on top of it. Take a look at these examples: - -#### Question answering - -Given a list of Wikipedia entries, you could ask Weaviate: - -:::note We asked Weaviate: -When was Lewis Hamilton born? -::: - -And it would answer with: - -:::note Weaviate responded: -Lewis Hamilton was born on January 7, 1985. ([check for yourself](https://en.wikipedia.org/wiki/Lewis_Hamilton)) -::: - -#### Generative search - -Or you can synthesize passages using retrieved information with Weaviate: - -Here is one, where we searched Weaviate for an entry on a "racing driver", and produce the result in the format of: - -:::note We asked Weaviate: -Write a fun tweet encouraging people to read about this: ## \{title} by summarizing highlights from: ## \{wiki_summary} -::: - -Which produces: - -:::note Weaviate responded: -Check out the amazing story of Lewis Hamilton, the 7-time Formula One World Drivers' Championship winner! From his humble beginnings to becoming one of the world's most influential people, his journey is an inspiring one. #LewisHamilton #FormulaOne #Motorsport #Racing -::: - -We will cover these and many more capabilities, such as vectorization, summarization and classification, in our units. - -For now, keep in mind that Weaviate is a vector database at its core which can also leverage AI tools to do more with the retrieved information. - -## Review - -In this section, you learned about what Weaviate is and how it works at a very high level. You have also been introduced to what vector search is at a high level, that it is a similarity-based search method. 
- -### Review exercises - - - - - - - -### Key takeaways - -- Weaviate is an open source vector database. -- The core Weaviate library is the same whether you run it locally, on the cloud, or with WCD. -- Vector searches are similarity-based searches. -- Weaviate can also transform your data after retrieving it before returning it to you. - -## Notes - -(1) Subject to terms of its license, of course. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - -import Quiz from '/src/components/Academy/quiz.js' -export const weaviateOpenSource = [ - { - questionText: 'What is the difference in the Weaviate codebase between local and cloud deployments?', - answerOptions: [ - { answerText: 'Cloud deployments always include additional modules.', isCorrect: false, feedback: 'Cloud deployments of Weaviate do not include any special, or additional, modules.'}, - { answerText: 'Local deployments are optimized for GPU use.', isCorrect: false, feedback: 'GPU usage can be enabled for inference whether locally or remotely deployed.'}, - { answerText: 'Cloud deployments are optimized for scalability.', isCorrect: false, feedback: 'We agree that cloud deployments should be optimized for scalability. 
But the Weaviate codebase is built for scalability regardless of deployment location.'}, - { answerText: 'None, they are the same.', isCorrect: true, feedback: 'They are the same, open-source codebase available on GitHub.'}, - ], - }, -]; -export const vectorSearchDefinition = [ - { - questionText: 'What is the best description of vector search?', - answerOptions: [ - { answerText: 'Vector search is a directional search.', isCorrect: false, feedback: 'The definition of "vector" in this context is not direction-related.'}, - { answerText: 'Vector search is a similarity-based search.', isCorrect: true, feedback: 'It searches a data collection or database for proximity in its representation of "meaning".'}, - { answerText: 'Vector search is a number-based search.', isCorrect: false, feedback: 'This is partially true, but not the best answer. While there are numbers involved, that description does not quite capture the key concept of vector searches.'}, - ], - }, -]; diff --git a/docs/academy/theory/101_hello_weaviate/15_overview_vectors.mdx b/docs/academy/theory/101_hello_weaviate/15_overview_vectors.mdx deleted file mode 100644 index fe8a4191a..000000000 --- a/docs/academy/theory/101_hello_weaviate/15_overview_vectors.mdx +++ /dev/null @@ -1,110 +0,0 @@ ---- -title: Vectors - An overview -description: Vector Concepts in Weaviate Theory ---- - -## What is a vector? - -import ReactPlayer from 'react-player/lazy' - - -
- -We've covered that Weaviate is a [vector database](https://weaviate.io/blog/what-is-a-vector-database), and that a vector search is similarity-based. But, what is a vector? - -A vector in this context is just a series of numbers like `[1, 0]` or `[0.513, 0.155, 0.983, ..., 0.001, 0.932]`. Vectors like these are used to capture meaning. - -This might seem like an odd concept. But in fact, you may have already used vectors to capture meaning without realizing it. If you have tried photo editing, or used MS Paint you might have encountered the RGB color system. - -### How do numbers represent meaning? - -The RGB system uses groups of three numbers to represent colors. For example: - -- (255, 0, 0) = red -- (80, 200, 120) = emerald - -In these examples, each number can be thought of as a dial for how red, green or blue a color is. - -Now, imagine having hundreds, or even thousands, of these dials. That’s how vectors are used to represent meaning. Modern machine learning models such as GPT-x, or those used with Weaviate, use vectors to represent some "essence", or "meaning" of objects. This can be done for any object type, such as text, code, images, videos and more. - -## Vector embeddings in Weaviate - - - -The vector representation of an object's meaning is called a "vector embedding". - -Weaviate enables vector searches by indexing and storing data objects and their corresponding vector embeddings. The vector embeddings come from machine learning models. - -In plain terms, Weaviate processes and organizes your data in such a way that objects can be retrieved based on their similarity to a query. To perform these tasks at speed, Weaviate does two things that traditional databases do not. Weaviate: - -- Quantifies similarity -- Indexes vector data - -These operations enable Weaviate to do what it does. - -### Quantifying similarity - -As we've mentioned, vector searches are similarity-based, but what does that actually mean? 
How do we determine that two pieces of data are "similar"? What does it mean for two pieces of text, two images, or two objects in general, to be similar? - -This is a relatively simple idea that is actually incredibly interesting and intricate once we start to dive into the details. - -But for now, you should know that machine learning (ML) models are the key to this whole process. The ML models that power vector searches share similarities with those that generate text responses from prompts. Instead of generating new text, these (vectorizer) models capture the "meaning" of text or other media. We will cover this in more detail later on. - -### Indexing (vector) data - -Vector searches can be very intensive computationally. - -To overcome this problem, Weaviate uses a combination of indexes including an approximate nearest neighbor (ANN) index and an inverted index. The ANN index lets Weaviate perform extremely fast vector searches. The inverted index lets Weaviate filter data using Boolean criteria. - -We will get into this in more detail later - but for now, it's enough to know that Weaviate can perform fast vector searches as well as filtering. - -## Review - -In this section, you learned about what vectors are and how Weaviate utilizes them at a very high level. You have also been introduced to two of Weaviate's key capabilities that help it to enable vector search at speed. - -### Review exercise - -:::tip Self-reflection -Can you describe, in your own words, what vectors are? -::: - - - -### Key takeaways - -- A vector is a series of numbers that capture the meaning or essence of objects. -- Machine learning models help quantify similarity between different objects, which is essential for vector searches. -- Weaviate uses a combination of approximate nearest neighbor (ANN) index and an inverted index to perform fast vector searches with filtering. 
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - - -import Quiz from '/src/components/Academy/quiz.js' -export const howWeaviateWorks = [{ - questionText: 'Which of these statements are true?', - answerOptions: [ - { - answerText: 'Weaviate has no way of quantifying similarity between objects.', - isCorrect: false, - feedback: 'Weaviate performs vector searches, which is similarity-based.', - }, - { - answerText: 'The only type of index in Weaviate is the vector index.', - isCorrect: false, - feedback: 'In addition to the vector index, Weaviate uses an inverted index.', - }, - { - answerText: 'Weaviate is a machine learning model.', - isCorrect: false, - feedback: 'While Weaviate can be used with a variety of different models which help it determine object similarity, it is itself not a machine learning model. Weaviate is a vector database.', - }, - { - answerText: 'None of the above', - isCorrect: true, - feedback: 'All of these are false!', - }, - ] -}]; diff --git a/docs/academy/theory/101_hello_weaviate/20_examples_1.mdx b/docs/academy/theory/101_hello_weaviate/20_examples_1.mdx deleted file mode 100644 index 92a26378e..000000000 --- a/docs/academy/theory/101_hello_weaviate/20_examples_1.mdx +++ /dev/null @@ -1,96 +0,0 @@ ---- -title: Examples 1 - Queries -description: Hello Weaviate Theory - Examples Part 1 ---- - -## Vectors in action - -import ReactPlayer from 'react-player/lazy' - - -
- -Let's take a look at a few more examples of what you can do with Weaviate. - -First, we will try vector searches by searching through our demo database. You will learn how to use Weaviate to retrieve objects based on their similarity, using various query types such as an input text, vector, or object. - -You will also compare vector search with keyword search to compare and contrast the two techniques, before learning how to combine the two techniques through the use of filters. - -### Vector search demo - -For our first example, let's search our demo dataset. It contains a small sample of questions from the quiz show *Jeopardy!*. - -Imagine that you're running a quiz night, and you want to get some questions about "animals in movies". In a traditional database you could look for word matches, perhaps something like: - -```sql -SELECT question, answer -FROM jeopardy_questions -WHERE ( - question LIKE '%animal%' - OR question LIKE '%creature%' - OR question LIKE '%beast%' - ) -AND ( - question LIKE '%movie%' - OR question LIKE '%film%' - OR question LIKE '%picture%' - OR question LIKE '%cinema%' - ) - -``` - -This is a difficult query to write. Even worse, you would probably have to add the names of specific animals to the query as well. - -The Weaviate query is much more intuitive. See what happens when we run the following query: - -:::note We searched Weaviate for: -animals in movies -::: - -Weaviate retrieved these as the top answers: - -:::note Weaviate retrieved: -- **meerkats**: Group of mammals seen here like Timon in *The Lion King* -- **dogs**: Scooby-Doo, Goofy & Pluto are cartoon versions -- **The Call of the Wild Thornberrys**: Jack London story about the dog Buck who joins a Nick cartoon about Eliza, who can talk to animals -::: - -Note how relevant the results are, despite none of them including the word "animal" or the word "movie", let alone both! - -This is exactly why vector searches are so useful. 
They can identify related objects without the need to match exact text. - -### Vector similarities - -In vector searches, you can retrieve the actual quantified similarity between the query and the returned objects. This is called the `distance`. - -A `distance` is indicative of the degree of similarity between the returned object and the query. - -If you're wondering exactly what that means, and who decides how similar any two objects or concepts are, those are great questions! We will cover those in more detail later. - -For now, just keep in mind that smaller distances mean two objects are more similar to each other. - -## Review - - - - - -### Key takeaways - -- Vector searches can identify related objects without the need for exact text matches. -- In vector searches, distance values indicate the degree of similarity between the returned object and the query. -- Smaller distances indicate greater similarity. -- Vector searches can be combined with keyword searches and filtering techniques for more refined search results. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/101_hello_weaviate/25_examples_2.mdx b/docs/academy/theory/101_hello_weaviate/25_examples_2.mdx deleted file mode 100644 index f7439b5fe..000000000 --- a/docs/academy/theory/101_hello_weaviate/25_examples_2.mdx +++ /dev/null @@ -1,109 +0,0 @@ ---- -title: Examples 2 - More than search -description: Hello Weaviate Theory - Examples Part 2 -sidebar_position: 25 ---- - -## Beyond vector searches - -import ReactPlayer from 'react-player/lazy' - - -
- -You can do a lot more with Weaviate than simply retrieve static information. - -Let's take a look at a couple of examples where we do more than simply retrieve objects from the database. - -We will extract information from this Wikipedia entry. - -
- "The Sydney Opera House" Wikipedia summary - -The Sydney Opera House is a multi-venue performing arts centre in Sydney. Located on the foreshore of Sydney Harbour, it is widely regarded as one of the world's most famous and distinctive buildings and a masterpiece of 20th-century architecture. Designed by Danish architect Jørn Utzon, but completed by an Australian architectural team headed by Peter Hall, the building was formally opened by Queen Elizabeth II on 20 October 1973 after a gestation beginning with Utzon's 1957 selection as winner of an international design competition. The Government of New South Wales, led by the premier, Joseph Cahill, authorised work to begin in 1958 with Utzon directing construction. The government's decision to build Utzon's design is often overshadowed by circumstances that followed, including cost and scheduling overruns as well as the architect's ultimate resignation. The building and its surrounds occupy the whole of Bennelong Point on Sydney Harbour, between Sydney Cove and Farm Cove, adjacent to the Sydney central business district and the Royal Botanic Gardens, and near to the Sydney Harbour Bridge. - -The building comprises multiple performance venues, which together host well over 1,500 performances annually, attended by more than 1.2 million people. Performances are presented by numerous performing artists, including three resident companies: Opera Australia, the Sydney Theatre Company and the Sydney Symphony Orchestra. As one of the most popular visitor attractions in Australia, the site is visited by more than eight million people annually, and approximately 350,000 visitors take a guided tour of the building each year. The building is managed by the Sydney Opera House Trust, an agency of the New South Wales State Government. 
- -On 28 June 2007, the Sydney Opera House became a UNESCO World Heritage Site, having been listed on the (now defunct) Register of the National Estate since 1980, the National Trust of Australia register since 1983, the City of Sydney Heritage Inventory since 2000, the New South Wales State Heritage Register since 2003, and the Australian National Heritage List since 2005. The Opera House was also a finalist in the New7Wonders of the World campaign list. - -
- -Weaviate creates [data objects](/weaviate/concepts/data) when it processes the Wikipedia entry. The data objects are stored in classes. A class is roughly analogous to a table in a relational database. An object is similar to an entry in that table. - - -### Generative search - -Weaviate can do even more with these entries. You can ask Weaviate to grab an object from its data store and use that object to generate new text. For example, Weaviate can use the object that contains the entry for the Sydney Opera House to derive new text. - -Here is a `GraphQL` query example of a generative search. - -```graphql -{ - Get { - WikiArticle( - nearText: { - concepts: ["Sydney Opera House"] - } - limit: 1 - ) { - title - wiki_summary - _additional { - generate( - singleResult: { - prompt: """ - Write a fun tweet encouraging people to read about this: ## {title} - by summarizing highlights from: ## {wiki_summary} - """ - } - ) { - singleResult - error - } - } - } - } -} -``` - -The sample code generates a Tweet based on the Wikipedia entry! - -
- See response - -:::note Weaviate says: -Explore the world-famous Sydney Opera House and its incredible architecture! From the iconic design to the amazing performances, there's something for everyone to enjoy. #SydneyOperaHouse #Explore #Architecture #Performances #Experience -::: - -
- -This process is an example of `generative search`. In a generative search, Weaviate retrieves information, and then leverages a large language model (LLM) to re-shape it. This is a powerful feature that can transform how you deal with information. - -You can vary the prompt to generate different results. - -### What next? - -Tools like Q&A and generative search really start to bring your information to life. - -## Review - - - -### Key takeaways - -- Weaviate can extract knowledge from text using question-answering capabilities, identifying the most relevant object and the actual answer based on the provided text. -- Generative search allows you to retrieve information and reshape or repurpose the content, such as generating a tweet based on a Wikipedia entry. -- These advanced capabilities of Weaviate transform how you interact with and utilize information in your data. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/101_hello_weaviate/90_next_steps.mdx b/docs/academy/theory/101_hello_weaviate/90_next_steps.mdx deleted file mode 100644 index 3b0a234cf..000000000 --- a/docs/academy/theory/101_hello_weaviate/90_next_steps.mdx +++ /dev/null @@ -1,40 +0,0 @@ ---- -title: Next steps ---- - -After reading this theoretical overview of Weaviate, you may be interested in getting hands-on. If so, we recommend you to check out one of the Python or TypeScript/JavaScript courses in the Academy. 
- -We have the following beginner courses available for Python: -- [101T Work with: Text data](../../py/starter_text_data/index.md) -- [101V Work with: Your own vectors](../../py/starter_custom_vectors/index.md) -- [101T Work with: Multimodal data](../../py/starter_multimodal_data/index.md) - -And for TypeScript/JavaScript: -- [Introduction to Weaviate with TypeScript](/academy/js/starter_text_data) - -### Academy - -- [Named vectors](../../py/named_vectors/index.md): Learn how to use named vectors to flexibly represent data in Weaviate. -- [Which search is right for me?](../../py/standalone/which_search/index.mdx): Learn about the different types of searches in Weaviate and when to use them. -- [Chunking](../../py/standalone/chunking/index.mdx): Learn how to use chunking to optimize your search for longer documents. - -### Documentation - -- How-to guides - - The [How-to: Manage collections](/weaviate/manage-collections/index.mdx) and [How-to: Manage objects](/weaviate/manage-objects/index.mdx) guides show how to perform data operations (i.e. create, read, update, delete collections and objects within them).. - - [How-to: search](/weaviate/search/index.mdx): Code examples for all types of search operations. - - [How-to: configure Weaviate](/weaviate/configuration/index.mdx): Guides for configuring Weaviate, such as [PQ](/weaviate/configuration/compression/pq-compression.md) and [BQ](/weaviate/configuration/compression/bq-compression.md) compression, [backups](/deploy/configuration/backups.md) and [replication](/deploy/configuration/replication.md). -- [Concepts guides](/weaviate/concepts/index.md): Guides for understanding Weaviate's architecture and concepts. -- [API reference](/weaviate/api/index.mdx): Detailed information about Weaviate's APIs. - -import CTASocials from '../../py/_snippets/cta_socials.mdx'; - - - -See you soon! 
👋 - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/101_hello_weaviate/index.mdx b/docs/academy/theory/101_hello_weaviate/index.mdx deleted file mode 100644 index 8015e0a6f..000000000 --- a/docs/academy/theory/101_hello_weaviate/index.mdx +++ /dev/null @@ -1,42 +0,0 @@ ---- -title: 101 Hello, Weaviate -description: A basic introduction to Weaviate and its data management capabilities. -sidebar_position: 101 # Like a subject number (e.g. CS101) ---- - -## Unit overview - -import ReactPlayer from 'react-player/lazy' - - -
- - - -Welcome! - -This is the beginning of your journey through the world of vectors with Weaviate. This unit will provide you with an overview of the fundamentals of Weaviate. - -You'll first gain an understanding of what Weaviate is, and what it can do. You will then learn about what vector database and vector search are before going on to run Weaviate and perform vector searches yourself. - -By the end of this unit, you will have a strong foundation of knowledge that will help you to effectively navigate the rest of the course, and for using Weaviate in your own projects. - -### Prerequisites - -- None - -## Learning objectives - -import LearningGoalsExp from '/src/components/Academy/learningGoalsExp.mdx'; - - - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/150_search_types/_index.md b/docs/academy/theory/150_search_types/_index.md deleted file mode 100644 index 59fc51359..000000000 --- a/docs/academy/theory/150_search_types/_index.md +++ /dev/null @@ -1,27 +0,0 @@ ---- -title: 150 Search Types -description: An introduction to different search types in Weaviate -sidebar_position: 150 # Like a subject number (e.g. CS101) ---- - -## Unit overview - -### Prerequisites - -- None - -## Learning objectives - -import LearningGoalsExp from '/src/components/Academy/learningGoalsExp.mdx'; - - - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/150_search_types/_notes.md b/docs/academy/theory/150_search_types/_notes.md deleted file mode 100644 index 2528b659a..000000000 --- a/docs/academy/theory/150_search_types/_notes.md +++ /dev/null @@ -1,52 +0,0 @@ -### Vector embeddings - -Vector embeddings are numerical representations of objects in a high-dimensional space. 
These vectors are generated by a model trained to represent the objects in a way that captures their semantic meaning. - -A vector embedding may look like: - -```json -[0, 1, 0, 0, 1] -``` - -or like: - -```json -[0.158, 0.011, 0.840, 0.577, 0.897, ..., 0.144] -``` - -A vector embedding may be typically between 300 and 2048 dimensions, although the exact number varies depending on the model. The longer the vector, the more information it can capture about the object. On the other hand, they require more resources to store, index and search. - -Vector embeddings that are produced by the same model are said to be "compatible", or that they are "in the same vector space". - -:::tip Analog: Vector embeddings as language -An intuitive way to think of vector embeddings is as a language. Just as two people must speak the same language to communicate, two vectors must be in the same vector space to be compared. If two vectors are not in the same vector space, their values are meaningless when compared, even if they are the same length. -::: - -## Vector distance - -Vector distance indicates how close, or far apart, two vectors are in high-dimensional space. This is a measure of the object's "semantic" similarity to the query, based on their vector embeddings. - -In a simple example, consider colors "SkyBlue", "LightSteelBlue", and "DarkOrange". These colors can be represented as vectors in a 3D space, with the RGB values as the vector components. - -| Color | Vector (R,G,B) | -|----------------|-----------------| -| SkyBlue | (135, 206, 235) | -| LightSteelBlue | (176, 196, 222) | -| DarkOrange | (255, 140, 0) | - -The vectors for "SkyBlue" and "LightSteelBlue" are much closer to each other than either is to "DarkOrange", reflecting their similarity as light blue colors versus an orange color. - -If you search a vector database containing vectors for "SkyBlue" and "DarkOrange" with a query vector for "LightSteelBlue", the search would return "SkyBlue" as the closest match. 
- -Vector search for far more complex objects, such as text, images, or audio, is based on the same principle. The vectors are generated by a model trained to represent the objects in a high-dimensional space, where the distance between vectors reflects the similarity between the objects. - -### Distance and search quality - -All compatible vectors are similar to some degree search will have some "top" search results, even if the query is not similar to any objects in the dataset. - -If you search a vector database containing vectors for colors "Red", "Crimson" and "LightCoral" with a query vector for "SkyBlue", the search will still return a result (e.g. "Red"), even if it is not semantically similar to the query. The search is simply returning the closest match, even if it is not a good match in the absolute sense. - -To ensure that the search results are meaningful, consider the following strategies: - -- **Use a threshold**: Set a minimum similarity score for the results. This will exclude results that are not similar enough to the query. -- **Apply filters**: Use [filters](../filtering.md) to exclude results based on other criteria, such as metadata or properties. diff --git a/docs/academy/theory/180_embedding_model_selection/10_introduction.mdx b/docs/academy/theory/180_embedding_model_selection/10_introduction.mdx deleted file mode 100644 index 8fa4cf36e..000000000 --- a/docs/academy/theory/180_embedding_model_selection/10_introduction.mdx +++ /dev/null @@ -1,107 +0,0 @@ ---- -title: Overview -description: Why embedding model selection matters ---- - -import ThemedImage from '@theme/ThemedImage'; - -## Why embedding model selection matters - -Embedding models are AI models that capture “meanings” of objects.  This [earlier module on AI models](../010_ai_models_deep_dive/index.mdx) showed that embedding models can do this by turning text, images, audio and more into a sequence of numbers. 
- -import NNEmbeddingModels from '../010_ai_models_deep_dive/_img/nn_explained_50_embedding_models.png'; -import NNEmbeddingModelsDark from '../010_ai_models_deep_dive/_img/nn_explained_50_embedding_models_dark.png'; - - - -As you might imagine, this is not a trivial task. And there have been huge advancements in the field over the last decade or so. As an illustrative example, let’s take a look at the difference between the performance of two models at either end of that time scale. - -### An example evaluation - -Here is a screenshot from an [example demo application](https://github.com/databyjp/emb_eval_toybox) carrying out embedding evaluation. - -In this example, we look for documents that best match the query `“How do I make chocolate chip cookies from scratch”`, out of a candidate document set of 20 documents. - -Each of the 20 documents in the set has a `“score”` attribute here, where a more relevant object is indicated with a higher score. - -Candidate Documents - -Now, let’s see what happens when we try to retrieve the best matching objects using two different embedding models. We will use the following two models: - -- `FastText (fasttext-en-vectors)` (from 2015; [model card](https://huggingface.co/facebook/fasttext-en-vectors)) -- `snowflake-arctic-embed-l-v2.0` (from 2024; [model card](https://huggingface.co/Snowflake/snowflake-arctic-embed-l-v2.0)) - -Here is a summary of results from a search, using the `FastText` model from 2015: - -Search results from FastText - -The top result identified by the FastText is quite relevant, as it discusses how to correct some potential issues with cookie making. However, it’s less relevant than the idea result, which is a step-by-step recipe. - -The other two, however, are not relevant to the query. While they are recipes, they are not for baking cookies. - -It would be fair to say that there’s quite a bit of room for improvement. 
Here are the results from the `snowflake-arctic-embed-l-v2.0` model, from 2024:
- What is nDCG@k? - -`nDCG` is a metric used to evaluate the returned results in information retrieval. It rewards the model for returning the most relevant results at the top of the list. The `@k` indicates that only the top `k` results are considered. - -[Read more](https://weaviate.io/blog/retrieval-evaluation-metrics#normalized-discounted-cumulative-gain-ndcg) - -
- -The size of embeddings produced is another key factor. - -Embeddings can vary greatly in size, from around 300 dimensions to thousands. Imagine a service provider that hosts an AI bot that answers questions about legal cases. A vector database with 1 million documents*, one embedding model (`nv-embed-v2`) could require as much memory as 3.3 TB of memory, while another (`embed-english-light-v3.0` ) might only require 300 GB of memory. (The following chart takes some popular models, and compares how each one would affect memory requirements.) - -Estimated memory requirements for 1 million documents - -These simple examples illustrate some of the impact of embedding model selection. The choice of embedding models can make a huge difference in the quality of your search, your resource requirements, and many more factors. - -There have been huge advancements in the landscape of embedding models over the last 10 to 15 years. In fact, innovations in embedding models continue to occur today. You might have heard of some of these names: word2vec, FastText, GloVe, BERT, CLIP, OpenAI ada, Cohere multi-lingual, Snowflake Arctic, ColBERT, and ColPali. - -Each model (or architecture) brings with it some improvements. It may be in model architecture, training data, training methodology, modality, or efficiency, for instance. - -So in the next few sections, let’s begin to explore a workflow for embedding model selection. 
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/180_embedding_model_selection/20_workflow_selection.mdx b/docs/academy/theory/180_embedding_model_selection/20_workflow_selection.mdx deleted file mode 100644 index 6d0a17395..000000000 --- a/docs/academy/theory/180_embedding_model_selection/20_workflow_selection.mdx +++ /dev/null @@ -1,53 +0,0 @@ ---- -title: Workflow for model selection -description: Workflow for embedding model selection ---- - -import ThemedImage from '@theme/ThemedImage'; - -## Overview - -Selecting the right embedding model is a complex task. A big reason for this complexity is that each model will have some strengths and weaknesses that involve trade-offs. - -An obvious trade-off is that between model performance, size and cost. Take a look at the chart below, showing a general correlation between model size and retrieval performance for embedding models. - -Embedding model evaluation - -The chart shows a clear positive relationship between model size and higher performance. This also means that generally, models with better performance will be larger. They will require more memory and compute, which means higher costs and slower speeds. - -In other words, a larger model such as `nv-embed-v2` may perform better at retrieval than a smaller model such as `snowflake-arctic-embed-m-v1.5`, but may cost more to run and/or use. - -But there are many other dimensions to consider. For example: - -- A proprietary model such as a modern `gemini` model may show promising performance, but may not meet a user’s preference for local inference. -- While a model may perform well at a standard benchmark, it may not perform as well if given material from a specialized domain, such as legal, medical, or coding tasks. -- A local model may be cheaper to run, but the organization may lack the expertise and resources for long-term infrastructure maintenance. 
- -In the face of this complexity, a systematic approach can help you to make an informed decision based on your specific requirements. This is one such approach: - - - -This workflow is made up of four key stages as illustrated in the diagram above: - -1. **Identify your needs**: Clearly articulate a set of requirements or preferences to act as a set of guidelines for the future. -2. **Compile a list of candidate models**: Screen for a set of potentially suitable models based on your identified needs and available information. -3. **Perform detailed evaluation**: Run your own evaluations, for your use case, using your chosen data. -4. **Periodic re-evaluation**: Keep an eye for any changes to your requirements (data, application) or environment (new model, provider) - -Next, we will review each stage one by one. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/180_embedding_model_selection/22_identify_needs.mdx b/docs/academy/theory/180_embedding_model_selection/22_identify_needs.mdx deleted file mode 100644 index ead5d4059..000000000 --- a/docs/academy/theory/180_embedding_model_selection/22_identify_needs.mdx +++ /dev/null @@ -1,139 +0,0 @@ ---- -title: Identify needs & compile candidates -description: Get started with embedding model selection by identifying your needs and compiling a list of candidate models. ---- - -import ThemedImage from '@theme/ThemedImage'; - -## Identify needs - -A systematic approach to model selection starts with clearly identifying requirements. Organizing these requirements into categories can help ensure you consider all relevant factors when evaluating embedding models. - -Here are some of our key considerations: - -Identify your needs - -### Data Characteristics - -| Factor | Key Questions | Why It Matters | -| --- | --- | --- | -| **Modality** | Are you dealing with text, images, audio, or multimodal data? 
| Models are built for specific modality/modalities. | -| **Language** | Which languages must be supported? | Models are trained & optimized for specific language(s), leading to trade-offs in performance. | -| **Domain** | Is your data general or domain-specific (legal, medical, technical)? | Domain-specific models (e.g. [medical](https://huggingface.co/blog/abhinand/medembed-finetuned-embedding-models-for-medical-ir)) understand specialized vocabulary and concepts. | -| **Length** | What's the typical length of your documents and queries? | Input token context windows vary between models, from as small as `256` tokens to `8192` tokens for example. However, longer context windows typically require exponentially higher compute and latency. | -| **Asymmetry** | Will your queries differ significantly from your documents? | Some models are built for asymmetric query to document comparisons. So queries like `laptop won't turn on` can easily identify documents like `Troubleshooting Power Issues: If your device fails to boot...`. | - -### Performance Needs - -| Factor | Key Questions | Why It Matters | -| --- | --- | --- | -| **Accuracy** (recall) | How critical is it that all the top results are retrieved? | Higher accuracy requirements may justify more expensive or resource-intensive models. | -| **Latency** | How quickly must queries be processed? | Larger models with better performance often have slower inference times. For inference services, faster services will cost more. | -| **Throughput** | What query volume do you anticipate? Will there be traffic spikes? | Larger models with better performance often have lower processing capacity. For inference services, increased throughput will increase costs. | -| **Volume** | How many documents will you process? | Larger embedding dimensions increase memory requirements for your vector store. This will impact resource requirements and affect costs at scale. | -| **Task type** | Is retrieval the only use case? 
Or will it also involve others (e.g. clustering or classification) ? | Models have strengths and weaknesses; a model excellent at retrieval might not excel at clustering. This will drive your evaluation & selection criteria. | - -### Operational Factors - -| Factor | Key Questions | Why It Matters | -| --- | --- | --- | -| **Hardware limitations** | What computational resources are available for hosting & inference? | Hardware availability (costs, GPU/TPU availability) will significantly affect your range of choices. | -| **API rate limits** | If using a hosted model, what are the provider's limits? | Rate limits can bottleneck applications, or limit potential growth. | -| **Deployment & maintenance** | What technical expertise and resources are required? | Is self-hosting a model an option, or should you look at API-based hosted options? | - -### Business Requirements - -| Factor | Key Questions | Why It Matters | -| --- | --- | --- | -| **Hosting options** | Do you need self-hosting capabilities, or is a cloud API acceptable? | Self-hosting ➡️ more control at higher operational complexity; APIs ➡️ lower friction at higher dependencies. | -| **Licensing** | What are the licensing restrictions for commercial applications? | Some model licenses or restrictions may prohibit certain use cases. | -| **Long-term support** | What guarantees exist for the model's continued availability? | If a model or business is abandoned, downstream applications may need significant reworking. | -| **Budget** | What are your cost limits and expenditure preferences? | Embedding costs can add up over time, but self-hosting can incur high upfront costs. | -| **Privacy & Compliance** | Are there data privacy requirements or industry regulations to consider? | Some industries require specific models. And privacy requirements may impose hosting requirements. 
| - -Documenting these requirements creates a clear profile of your ideal embedding model, which will guide your selection process and help you make informed trade-offs. - -## Compile candidate models - -After identifying your needs, create a list of potential embedding models to evaluate. This process helps focus your detailed evaluation on the most promising candidates. - -There are hundreds of embedding models available today, with new ones being released regularly. For this many models, even a simple screening process would be too time-consuming. - -As a result, we suggest identifying an initial list of models with a simple set of heuristics, such as these: - -### Account for model modality - -This is a critical, first-step filter. A model can only support the modality/modalities that it is designed and trained for. - -Some models (e.g. Cohere `embed-english-v3.0`) are multimodal, while others (e.g. Snowflake’s `snowflake-arctic-embed-l-v2.0`) are unimodal. - -No matter how good a model is, a text-only model such as `snowflake-arctic-embed-l-v2.0` will not be able to perform image retrieval. Similarly, a `ColQwen` model cannot be used for plain text retrieval. - -### Favor models already available - -If your organization already uses embedding models for other applications, these are great starting points. They are likely to have been screened, evaluated and approved for use, and accounts/billing already configured. For local models, this would mean that the infrastructure is already available. - -This also extends to models available through your other service providers. - -You may be already using generative AI models through providers such as Cohere, Mistral or OpenAI. Or, perhaps your hyperscaler partners such as AWS, Microsoft Azure or Google Cloud provide embedding models. - -In many cases, these providers will also provide access to embedding models, which would be easier to adopt than those from a new organization. 
- -### Try well-known models - -Generally, well-known or popular models are popular for a reason. - -Industry leaders in AI such as Alibaba, Cohere, Google, NVIDIA and OpenAI all produce embedding models for different modalities, languages and sizes. Here are a few samples of their available model families: - -| Provider | Model families | -| --- | --- | -| Alibaba | `gte`, `Qwen` | -| Cohere | `embed-english`, `embed-multilingual` | -| Google | `gemini-embedding`, `text-embedding` | -| NVIDIA | `NV-embed` | -| OpenAI | `text-embedding`, `ada` | - -There are also other families of models that you can consider. - -For example, the `ColPali` family of models for image embeddings and `CLIP` / `SigLIP` family of models for multimodal (image and text) are well-known in their respective domains. Then, `nomic`, `snowflake-arctic`, `MiniLM` and `bge` models are some examples of well-known language retrieval models. - -These popular models tend to be well-documented, discussed and widely supported. - -As a result, they tend to be easier than the more obscure models to use, evaluate, troubleshoot and use. - -### Benchmark leaders - -Models that perform well on standard benchmarks may be worth considering. Resources like [MTEB Leaderboard](https://huggingface.co/spaces/mteb/leaderboard) can help identify high-performing models. - -As an example, the screenshot below shows models on MTEB at a size of fewer than 1 billion parameters, sorted by their `retrieval` performance. - -MTEB example - sorted by retrieval performance - -It shows some models that we’ve already discussed - such as the `showflake-arctic`, Alibaba’s `gte`, or BAAI’s `bge` models. - -But additionally, you can see already a number of high-performing models that we hadn’t discussed. Microsoft research's `intfloat/multilingual-e5-large-instruct` or JinaAI’s `jinaai/jina-embeddings-v3` model are both easily discoverable here. 
- -Note that as of 2025, the MTEB contains different benchmarks to assess different capabilities, such as the linguistic or modality needs. - -When viewing benchmarks, make sure to view the right set of benchmarks, and the appropriate columns. In the example below, note that the page shows results for MIEB (image retrieval), with results sorted by *Any to Any Retrieval*. - -MIEB example - sorted by any to any retrieval - -The MTEB is filterable and sortable by various metrics. So, you can arrange it to suit your preferences and add models to your list as you see fit. - -You should be able to compile a manageable list of models relatively quickly using these techniques. This list can then be manually reviewed for detailed screening. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/180_embedding_model_selection/24_initial_screening.mdx b/docs/academy/theory/180_embedding_model_selection/24_initial_screening.mdx deleted file mode 100644 index 6e2c3514b..000000000 --- a/docs/academy/theory/180_embedding_model_selection/24_initial_screening.mdx +++ /dev/null @@ -1,150 +0,0 @@ ---- -title: Perform initial screening -description: Use available information to screen the list of models ---- - -import ThemedImage from '@theme/ThemedImage'; - -## Overview - -Once you have a list of candidate models down to a reasonable size (say, 10-20 models maximum), you can start to manually review this list. - -This step can be a screen process comparing your compiled requirements against available model details. In most cases, publicly available models will also include summary information through model cards or other means such as documentation or even related academic papers. 
- -Model cards - -Some of the readily screenable factors, and how to screen models are shown below: - -## Screening factors - -### Context length - -Input context length is a critical factor to ensure that meaning from the whole document chunks taken into account. Maximum input context lengths vary widely between models, as shown in these examples: - -- `all-MiniLM-L6-v2`: 256 tokens -- Cohere `embed-english-v3.0`: 512 tokens -- `snowflake-arctic-embed-l-v2.0`: 8192 tokens - -Input text exceeding the context length will be ignored. On the other hand, higher allowable context lengths typically require exponentially higher compute and latency. As a result, this is an important tradeoff that includes an interplay with your text chunking strategy. - -:::tip - -Consider what a “chunk” of information to retrieve looks like for your use case. Typically, a model with 512 tokens or higher is sufficient for most use cases. - -::: - -### Model goals & training methodology - -Different embedding models are optimized for different use cases. This informs the model architecture, training data and training methodology. - -Reviewing the model provider’s descriptions and published training details can provide key insights into its suitability for your use case. - -- **Linguistic capabilities**: Some models (e.g. Snowflake’s `snowflake-arctic-embed-l-v2.0`) are multi-lingual, while others are primarily uni-lingual (e.g. Cohere’s `embed-english-v3.0`). These linguistic capabilities come largely from the training data and methodology selection. -- **Domain exposure**: Models trained on specialized domains (e.g., legal, medical, financial) typically perform better for domain-specific applications. -- **Primary tasks**: The provider may have been building a general-purpose embedding model, or one that is particularly focussed on particular tasks. 
Google’s `gemini-embedding` model appears to be designed with a goal of being a jack-of-all-trades type, state of the art model in all tasks and domains ([release blog](https://developers.googleblog.com/en/gemini-embedding-text-model-now-available-gemini-api/)). On the other hand, Snowflake’s `arctic-embed` 2.0 models appear to be focussed on retrieval tasks ([release blog](https://www.snowflake.com/en/engineering-blog/snowflake-arctic-embed-2-multilingual/)). -- **Base model**: In many cases, an embedding model is trained from an existing model. Any advantages, or shortcomings, of the base model will often carry over to the final model, especially if it is an architectural one such as its context window size or pooling strategy. -- **Training methods (advanced)**: If you have more experience with model training techniques, this is an area that you can use as heuristics as well. For example, models trained with contrastive learning often perform better for retrieval tasks. Additionally, hard negative mining is a technique that is valuable to enhance contrastive learning. - -:::tip - -Select a model whose capabilities align with your goals. For example, if your application requires retrieving paragraphs of text chunks in English, French, German, Mandarin Chinese and Japanese, check the model card and training information. Look for its retrieval performance, and whether these languages were included in the training corpus. - -::: - -### Dimensionality and optimization options - -The dimensionality of embeddings affects both performance and resource requirements. - -As a rule of thumb, your memory requirements for a vector database (any quantization notwithstanding) may be: `4 bytes` * `n dimensions` * `m objects` * `1.5` where `m` is the size of your database, and `n` is the vector dimensionality (`1.5` to account for overhead). 
- -This means that for, say, 10 million objects, the memory requirements for given models’ full outputs will be: - -- NVIDIA `NV-embed-v2`: `246 GB` -- OpenAI `text-embedding-3-large`: `184 GB` -- `snowflake-arctic-embed-l-v2.0`: `61 GB` -- `all-MiniLM-L6-v2`: `23 GB` - -As you might imagine, this can add significant costs to your infrastructure needs for the vector database. - -At the database end, there are quantization strategies which will reduce the footprint and therefore costs, which we will cover in another course. - -However, certain models can also help in this regard as well. [Matryoshka Representation Learning (MRL)](https://weaviate.io/blog/openais-matryoshka-embeddings-in-weaviate) models like `jina-embeddings-v2` or `snowflake-arctic-embed-l-v2.0` allow for flexible dimensionality reduction by simply truncating the vector. In the case of `snowflake-arctic-embed-l-v2.0`, it can be truncated to `256` dimensions from its original `1024` dimensions, reducing its size to a quarter without much loss in performance. - -:::tip - -Consider how big your dataset is likely to get to, then select your model accordingly, keeping the resulting system requirements in mind. If the requirements are too high and thus out-of-budget, it may set you back to square one when you need to scale up and go to production. - -::: - -### Model size and inference speed - -Model size directly impacts inference speed, which is critical for applications with latency requirements. Larger models generally offer better performance but at the cost of increased computational demands. 
- -When screening models, consider these aspects: - -| Factor | Implications | -| --- | --- | -| Parameter count | More parameters typically mean better quality but slower inference and higher memory usage | -| Architecture efficiency | Some models are optimized for faster inference despite their size | -| Hardware requirements | Larger models may require specialized hardware (GPUs/TPUs) | - -:::tip - -Given that the inference speed is a function of the model, inference hardware as well as the network latencies, review these factors as a system when screening models’ suitability. - -::: - -### Pricing, availability, and licensing - -The practical aspects of model adoption extend beyond technical considerations. - -Providers offer various pricing structures: - -- **API-based pricing**: Pay-per-token (OpenAI, Cohere) -- **Compute-based pricing**: Based on hardware utilization (Cloud providers) -- **Tiered licensing**: Different capabilities at different price points -- **Open-source**: Free to use, but self-hosting costs apply - -Choice of model and inference type will affect model availability: - -- **Geographic availability**: Some providers don't operate in all regions -- **SLA guarantees**: Uptime commitments and support levels -- **Rate limiting**: Constraints on throughput that may affect your application -- **Version stability**: How frequently models are deprecated or updated - -Additionally, licensing terms vary significantly: - -- **Commercial use restrictions**: Some open models prohibit commercial applications -- **Data usage policies**: How your data may be used by the provider -- **Export restrictions**: Compliance with regional regulations -- **Deployment flexibility**: Whether the model can be deployed on-premises or edge devices - -Always review the specific terms for each model. For example, while models like CLIP are openly available, they may have usage restrictions that affect your application. 
- -:::tip - -These practical considerations can sometimes outweigh performance benefits. A slightly less accurate model with favorable licensing terms and lower costs might be preferable for many production applications. - -::: - -### Creating your candidate shortlist - -After considering these factors, you can create a prioritized shortlist of models to evaluate in more detail. A good approach is to include a mix of: - -1. **Benchmark leaders**: High-performing models on standard metrics -2. **Resource-efficient options**: Models with smaller footprints or faster inference -3. **Specialized models**: Models that might be particularly well-suited to your domain -4. **Different architectures**: Including diverse approaches increases the chance of finding a good fit - -Aim for 3-5 models in your initial shortlist for detailed evaluation. Including too many models can make the evaluation process unwieldy and time-consuming. - -In the next section, we'll explore how to perform detailed evaluations of these candidate models to determine which one best meets your specific requirements. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/180_embedding_model_selection/26_detailed_eval.mdx b/docs/academy/theory/180_embedding_model_selection/26_detailed_eval.mdx deleted file mode 100644 index 4baa24274..000000000 --- a/docs/academy/theory/180_embedding_model_selection/26_detailed_eval.mdx +++ /dev/null @@ -1,173 +0,0 @@ ---- -title: Detailed evaluation -description: Once you have a shortlist of models, you can start to evaluate them in detail. ---- - -import ThemedImage from '@theme/ThemedImage'; - -## Introduction - -After identifying your needs and compiling a shortlist of candidate models, you can perform a thorough evaluation. The goal of this stage is to collect concrete evidence to support embedding model selection for your specific use case. 
- -When evaluating embedding models, it's important to distinguish between two types of evaluation: - -1. **Model evaluation**: Focuses on assessing the embedding model itself through direct metrics. This typically involves measuring performance on standard benchmarks or custom tasks designed to match your specific use case. -2. **Downstream evaluation**: Examines how the embedding model performs within the larger system or application, such as a RAG pipeline or recommendation engine. - -Both types of evaluation are important, but they serve different purposes. Model evaluation helps you understand the intrinsic capabilities of each embedding model. On the other hand, downstream evaluation shows how those capabilities translate to real-world application performance as a system. - -In this guide, let’s focus on model evaluation. This is to help you make initial selections while building your AI powered applications. We will discuss downstream, or system-level, evaluation at a later time. - -We will start this with model evaluation through standard benchmarks. - -## Standard benchmarks - - -Standard benchmarks can be a good place to begin detailed model evaluation. They are created by experts, and in many cases, their results are readily available for consumption with minimal effort. - -When reviewing benchmark results, pay attention to the specific tasks and metrics most relevant to your use case. - -As mentioned earlier when compiling candidate models, the Massive Text Embedding Benchmark (MTEB) is a great starting point that is almost used as an industry standard measure. - -Let's dig a little deeper into how to interpret these benchmark results for a detailed evaluation. The MTEB is composed of over a hundred individual tasks, which are combined to characterize a model’s performance by each task type. 
For text models, common tasks types include: - -- **Retrieval**: Finding relevant documents based on a query -- **Classification**: Categorizing text into predefined classes -- **Clustering**: Grouping similar texts together -- **Reranking**: Sorting results by relevance -- **Semantic textual similarity (STS)**: Measuring how similar two texts are semantically - -The most common task type for use in AI applications may be retrieval. In fact, we’ve already discussed using retrieval scores from MTEB. But there is a lot more to dig into than the overall score, as the MTEB makes individual task scores available. - -### Review MTEB scores in detail - -Each task type score in MTEB is composed of multiple scores from multiple tasks. The retrieval score, as of April 2025, is made up of scores from 17 different task benchmarks. - -Let’s compare a few models’ performances below through visualizations of scores. These figures compile results from MTEB’s multilingual task set, divided into two subsets for readability. - -This first image includes tasks that uses general domain data, such as the news and encyclopedic data. - -MTEB benchmarks by task - general - -The next image includes tasks that uses more specialized domain data. These benchmarks span a range of areas, including legal, medical, programming, and government data. - -MTEB benchmarks by task - specialized - -The chart shows that certain embedding models at the top of the table, such as `gemini-embedding-exp-03-07` perform quite well across the board compared to the others. But this doesn’t tell the whole story, as a given model often outperforms its average score in particular tasks. - -For example, the `snowflake-arctic-embed` models perform very well with the `LEMBPasskeyRetrieval` task, which is designed to test recall of specific text buried in a longer embedding. And Cohere’s `Cohere-embed-multilingual-v3.0` performs quite well in the MIRACL task, which is a highly multilingual task. 
- -In fact, it is interesting to note that even though we are looking at MTEB’s multilingual task set, it includes includes a number of tasks with an English-only (or majority) corpus. - -MTEB multilingual tasks example - -So, you may benefit from deriving your own metric that blends these task scores, based on how well each specific task corresponds to your needs. - -You might consider: - -1. **Task relevance**: Does the task match your use case? -2. **Data distribution**: Does the data represent your domain? -3. **Metric relevance**: Are the reported metrics aligned with your requirements? -4. **Recency**: Are the results recent enough to reflect current model capabilities? -5. **Fairness**: Were all models evaluated under comparable conditions? - -For example, if you know that your data definitely will include a blend of languages, you may weight the multi-lingual datasets more heavily than the mono-lingual datasets. And similarly with domain-specific data, such as legal, medical, programming and so on. - -The resulting score may be different from the official blended number, but may be more relevant to your particular use case. - -### Limitations of standard benchmarks - -These third-party benchmarks are very useful, but there are a few limitations that you should keep in mind. The main two limitations are data leakage, and correlation to your needs. - -**Data leakage** - -Because these benchmarks are publicly available, there is a risk that some of the benchmark data ends up in the training data used to build models. This can happen for a number of reasons, especially because there is simply so much data being used in the training process. - -This means that the benchmark result may not be a fair representation of the model’s capabilities, as the model is “remembering” the training data. - -**Correlation to your needs** - -Another limitation is that the standard benchmarks don’t accurately reflect your needs. 
As you saw, we can aim to find a benchmark that is as close as possible to your actual use case. But it is unlikely that the task, data distribution and metrics are fully aligned with your needs. - -**Mitigation** - -As a result, it is important to take these standard benchmarks with a grain of salt. And in terms of getting further signals, a good complementary exercise is to perform your own benchmarks, which we will look at in the next section. - -## Model evaluation: custom benchmarks - - -While standard benchmarks provide valuable reference points, creating your own custom evaluation can be a fantastic complementary tool to address their limitations. - -Running your own benchmark can sound quite intimidating, especially given how extensive benchmarks such as MTEB are. But it doesn’t need to be. You can do this by following these steps: - -### Set benchmark objectives - -By now, you should have an idea of any gaps in your knowledge set, as well as your given tasks. It might be something like: - -- Which model best retrieves the appropriate related customer reviews about coffee, written primarily in English, French, and Korean? -- Does any model work well across code retrieval in Python and Golang for back-end web code chunks, as well as related documentation snippets? - -The custom benchmark should be designed with an idea of addressing particular questions. - -### Determine metrics to use - -Once the overall goals are defined, the corresponding metrics can follow. - -For example, retrieval performance is commonly measured by one or more of precision, recall, MAP, MRR, and NDCG. - -Each of these measure slightly different aspects of retrieval performance. However, using NDCG is a good starting point. - -NDCG measures the system's ability to correctly sort items based on relevance. Given a query and a dataset ranked for this query, NDCG will reward results for having the higher ranked items higher in the search results. 
- -It is measured on a score of 0 to 1, where 0 means no ranked items were retrieved, and 1 means all top ranked items were retrieved, and ordered correctly. - -### Curate a benchmark dataset - -A suitable dataset is critical for a benchmark to be meaningful. While such a dataset may already exist, it is common to build or reshape a dataset to suit the benchmark goals and metrics. - -The dataset should aim to: - -- Reflect the retrieval task -- Reflect the task difficulty -- Capture the data distribution -- Include sufficient volume - -This may be the most time consuming part of the process. However, a pragmatic approach can help to make it manageable. A benchmark with as few as 20 objects and a handful of queries can produce meaningful results. - -### Run benchmark - -At this point, run the benchmark using your candidate models. - -As with many other scientific projects, reproducibility and consistency is key here. It is also worth keeping in mind that you may come back to this later on to assess new models, or with updated knowledge about your needs. - -In programming terms, you might modularize aspects, such as embedding creation, dataset loading, metric evaluation, and result presentation. - -### Evaluate the results - -Once the benchmarks are run, it is important to assess the results using quantitative (e.g. NDCG@k numbers) and qualitative (e.g. which objects were retrieved where) means. - -The quantitative results will produce a definitive ranking that you can use, for example to order the models. However, this is subject to many factors, such as dataset composition and metric being used. - -The qualitative results may provide more important insights, such as patterns of failure. For example, may see an embedding model: - -- Regularly fail to retrieve certain types of objects, such as shorter, but very relevant text, favoring longer ones -- Perform better with positively phrased text but not ones with negation in the sentence. 
-- Struggle with your domain-specific jargon. -- Work well with English and Mandarin Chinese, but not so well with Hungarian, which may be a key language for your data. - -To some extent, these insights may be only discoverable to those with domain familiarity, or those with context on the system being built. Accordingly, qualitative assessment is critically important. - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/180_embedding_model_selection/28_benchmark_example.mdx b/docs/academy/theory/180_embedding_model_selection/28_benchmark_example.mdx deleted file mode 100644 index 51912e7f4..000000000 --- a/docs/academy/theory/180_embedding_model_selection/28_benchmark_example.mdx +++ /dev/null @@ -1,266 +0,0 @@ ---- -title: "Custom benchmarks: an example" -description: An example of running your own benchmarks for embedding model evaluation ---- - -import ThemedImage from '@theme/ThemedImage'; - -## Example custom benchmark - -Here is how you might perform a custom benchmark. - -Imagine your end goal is to implement a [RAG (retrieval augmented generation)](https://weaviate.io/blog/introduction-to-rag) system over your company's technical documentation (e.g., product documentation, code examples, support forum logs). - -You've shortlisted two embedding models (Model A and Model B) to retrieve objects based on MTEB scores and practical considerations. Let’s go through the steps discussed earlier. - -### Set benchmark objectives - -Since your data comes from different sources, you may be concerned that the targets are very diverse, whether it be writing style (informal forum posts vs formal documentation), text lengths (comprehensive snippets vs short answers) or language (code vs English). - -So you may set the goal of testing *how each model deals with the style, length and language variability*. 
- -### Determine metrics to use - -This is a classic retrieval problem, where some results are more relevant than others. So, we can use an NDCG@k metric. NDCG@k can be calculated as follows: - -```python -def calculate_dcg(relevance_scores: list[int], k: Optional[int] = None) -> float: - """ - Args: - relevance_scores: List of relevance scores (0, 1, or 2) - k: Number of results to consider. If None, uses all results. - """ - if k is not None: - relevance_scores = relevance_scores[:k] - - gains = [2**score - 1 for score in relevance_scores] - dcg = 0 - for i, gain in enumerate(gains): - dcg += gain / np.log2(i + 2) if i > 0 else gain - - return dcg - -def calculate_ndcg( - actual_scores: list[int], ideal_scores: list[int], k: Optional[int] = None -) -> float: - """ - Args: - actual_scores: List of relevance scores in predicted order - ideal_scores: List of relevance scores in ideal order - k: Number of results to consider - """ - dcg = calculate_dcg(actual_scores, k) - idcg = calculate_dcg(ideal_scores, k) - return dcg / idcg if idcg > 0 else 0.0 -``` - -Note: some libraries such as [scikit-learn have built-in implementations](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ndcg_score.html) of NDCG. - -### Curate a benchmark dataset - -The benchmark dataset should be suitable for achieving the stated goal. Since we want to assess *how each model deals with the style, length and language variability*, the dataset might look something like this: - -```python -dataset = { - # Search query - "query": "How to set up a vector index with binary quantization", - # Candidate document set, with scores on a scale of 0-3 - "documents": [ - { - "id": "doc001", - # Highly relevant documentation text - "text": "Each collection can be configured to use BQ compression. BQ can enabled at collection creation time, before data is added to it. 
This can be done by setting the vector_index_config of the collection to enable BQ compression.", - "score": 3 - }, - { - "id": "doc002", - # Highly relevant, long code example - "text": "from weaviate.classes.config import Configure, Property, DataType, VectorDistances, VectorFilterStrateg\n\nclient.collections.create(\n 'Article',\n # Additional configuration not shown\n vector_index_config=Configure.VectorIndex.hnsw(\n quantizer=Configure.VectorIndex.Quantizer.bq(\n cache=True,\n rescore_limit=1000\n ),\n ef_construction=300,\n distance_metric=VectorDistances.COSINE,\n filter_strategy=VectorFilterStrategy.SWEEPING # or ACORN (Available from Weaviate v1.27.0)\n ),)", - "score": 3 - }, - { - "id": "doc003", - # Highly relevant, short code example - "text": "client.collections.create(\nname='Movie',\nvector_index_config=wc.Configure.VectorIndex.flat(\nquantizer=wc.Configure.VectorIndex.Quantizer.bq()\n))", - "score": 3 - }, - { - "id": "doc004", - # Less relevant forum post, even though the right words appear - "text": "No change in vector size after I set up Binary Quantization\nHello! I was curious to try out how binary quantization works. To embed data I use gtr-t5-large model, which creates 768-dimensional vectors. My database stores around 2k of vectors. My python code to turn PQ on is following: client.schema.update_config(\n 'Document',\n {\n 'vectorIndexConfig': {\n 'bq': {\n 'enabled': True, \n }\n }\n },\n)", - "score": 1 - }, - # And so on ... - { - "id": "doc030", - # Irrrelevant documentation text - "text": "Weaviate stores data objects in collections. Data objects are represented as JSON-documents. Objects normally include a vector that is derived from a machine learning model. The vector is also called an embedding or a vector embedding.", - "score": 0 - }, - ] -} -``` - -The example dataset here contains a mix of document with varying relevance scores. Equally importantly, it includes a mix of document types, lengths, and languages. 
Ideally, each variable would be sufficiently represented, so that any disparities in retrieval performance would show up. - -### Run benchmark - -Now, follow these steps for each embedding model: - -1. Create embeddings of each document and query -2. Perform retrieval for the top `k` results, using these embeddings -3. Calculate the quantitative metrics (e.g. NDCG@k) -4. Collate results (top k results vs true top k labels) for qualitative analysis - -In pseudocode form, it might look something like this: - -```python -import numpy as np -from typing import List, Dict, Any - -class Document: - """Document with text and relevance score""" - def __init__(self, id, text, relevance_score): - self.id = id - self.text = text - self.relevance_score = relevance_score - -class EmbeddingModel: - """Abstract embedding model interface""" - def __init__(self, name): - self.name = name - - def embed(self, text): - """Generate embedding for text""" - return embedding - -class BenchmarkRunner: - """Runs embedding model benchmarks""" - def __init__(self, queries, documents, models): - self.queries = queries - self.documents = documents - self.models = models - - def run(self, k=10): - """Run benchmark for all models - - Returns: Dict mapping model names to metrics - """ - results = {} - - for model in self.models: - # Get embeddings for all texts - query_embeddings = {q: model.embed(q) for q in self.queries} - doc_embeddings = {doc.id: model.embed(doc.text) for doc in self.documents} - - # Calculate metrics for each query - ndcg_scores = [] - for query, query_emb in query_embeddings.items(): - # Get top k documents by similarity - top_docs = self._retrieve_top_k(query_emb, doc_embeddings, k) - - # Calculate NDCG - ndcg = self._calculate_ndcg(top_docs, query, k) - ndcg_scores.append(ndcg) - - # Store results - results[model.name] = { - 'avg_ndcg': np.mean(ndcg_scores), - 'all_scores': ndcg_scores - } - - return results - - def _retrieve_top_k(self, query_emb, doc_embeddings, k): - 
"""Retrieve top k docs by similarity""" - # Implementation: calculate similarities and return top k - pass - - def _calculate_ndcg(self, retrieved_docs, query, k): - """Calculate NDCG@k for retrieved documents""" - # Implementation: calculate DCG and IDCG - pass - -# Example usage -def run_benchmark_example(): - # 1. Initialize data - queries = ["How to set up binary quantization"] - documents = [ - Document("doc1", "BQ can be enabled at collection creation...", 3), - # other documents ... - Document("doc2", "Weaviate stores data objects in collections...", 0) - ] - - # 2. Initialize models - models = [ - # Model implementations... - ] - - # 3. Run benchmark - runner = BenchmarkRunner(queries, documents, models) - results = runner.run(k=5) - - # 4. Print results - for model_name, metrics in results.items(): - print(f"{model_name}: NDCG@5 = {metrics['avg_ndcg']:.4f}") -``` - -### Evaluate the results - -Once the benchmarks are run, you'll have a set of results to analyze. Combine both quantitative metrics and qualitative observations to get a complete picture of model performance. - -#### Quantitative analysis - -Start by comparing the overall metrics for each model: - -```python -# Example benchmark results -results = { - 'Model A': {'avg_ndcg': 0.87, 'all_scores': [0.92, 0.85, 0.84]}, - 'Model B': {'avg_ndcg': 0.79, 'all_scores': [0.95, 0.72, 0.70]} -} - -# Print summary -for model_name, metrics in results.items(): - print(f"{model_name}: NDCG@10 = {metrics['avg_ndcg']:.4f}") -``` - -Look beyond the averages to understand: - -- **Score distribution**: Does a model perform consistently, or excel in some areas while failing in others? -- **Performance by query type**: Group scores by query characteristics (length, complexity, domain) -- **Statistical significance**: For larger benchmark sets, determine if differences are statistically significant - -#### Qualitative analysis - -Examining actual retrieval results often reveals more actionable insights: - -1. 
**Identify patterns in successes and failures** - - Does a model struggle with certain document types? (code, long-form text) - - Are there consistent mismatches between queries and retrieved documents? -2. **Compare results across models** - - Do models prioritize different aspects of relevance? - - Where do models disagree most significantly? -3. **Domain-specific considerations** - - Are technical terms and jargon handled appropriately? - - How well do models interpret domain context? - -### Making the final decision - -With both quantitative and qualitative insights, you can make an informed decision that balances: - -- **Raw performance**: Which model achieves the best metrics? -- **Specific strengths**: Does a model excel in areas most critical to your application? -- **Practical considerations**: Remember factors like cost, latency, and deployment requirements - -And remember to take results of the standard benchmark in this evaluation process as well. - -The ideal model isn't necessarily the one with the highest average score. It's the one that best addresses your specific requirements and performs well on the queries and document types that matter most to your application. - -Note that this evaluation process isn't just about selecting a model—it's also about understanding its strengths and limitations. This knowledge will help you design more effective systems around the embedding model and set appropriate expectations for its performance. 
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/180_embedding_model_selection/40_periodic_re_evaluation.mdx b/docs/academy/theory/180_embedding_model_selection/40_periodic_re_evaluation.mdx deleted file mode 100644 index 86a0093ee..000000000 --- a/docs/academy/theory/180_embedding_model_selection/40_periodic_re_evaluation.mdx +++ /dev/null @@ -1,28 +0,0 @@ ---- -title: Periodic re-evaluation -description: Perform periodic re-evaluation of embedding models ---- - -import ThemedImage from '@theme/ThemedImage'; - -Selecting the best embedding model is a major milestone, but it is worth noting that this is not a one-time activity. - -The field evolves rapidly, with new models being released regularly that may offer significant improvements. Furthermore, when integrated at system level, the embedding model may behave differently to expectations. - -Therefore, consider a periodic, or monitoring for a need to, re-evaluate your embedding model choices: - -- **Monitor benchmark leaderboards**: Check resources like MTEB to identify promising new models -- **Track performance metrics**: If you notice performance degradation in your application, it may trigger a review. -- **Review changing requirements**: As your data distribution, languages, or domains change, your original model selection criteria may need updating - -When significant changes occur in either your requirements or available models, simply repeat the selection and evaluation process described in this module. - -This can ensure consistent, repeatable principles are applied to your embedding model selection process as both your application and the technology landscape evolve. - -By treating model selection as an ongoing process rather than a fixed decision, you'll maintain the quality and effectiveness of your AI applications over time. 
- -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/180_embedding_model_selection/_img/candidate_documents.png b/docs/academy/theory/180_embedding_model_selection/_img/candidate_documents.png deleted file mode 100644 index 5ddcee7b8..000000000 Binary files a/docs/academy/theory/180_embedding_model_selection/_img/candidate_documents.png and /dev/null differ diff --git a/docs/academy/theory/180_embedding_model_selection/_img/embedding_eval_example_1_fasttext.png b/docs/academy/theory/180_embedding_model_selection/_img/embedding_eval_example_1_fasttext.png deleted file mode 100644 index d0b36975f..000000000 Binary files a/docs/academy/theory/180_embedding_model_selection/_img/embedding_eval_example_1_fasttext.png and /dev/null differ diff --git a/docs/academy/theory/180_embedding_model_selection/_img/embedding_eval_example_2_arctic2.png b/docs/academy/theory/180_embedding_model_selection/_img/embedding_eval_example_2_arctic2.png deleted file mode 100644 index dd727ebbd..000000000 Binary files a/docs/academy/theory/180_embedding_model_selection/_img/embedding_eval_example_2_arctic2.png and /dev/null differ diff --git a/docs/academy/theory/180_embedding_model_selection/_img/embedding_model_evaluation.png b/docs/academy/theory/180_embedding_model_selection/_img/embedding_model_evaluation.png deleted file mode 100644 index 9c8bd240d..000000000 Binary files a/docs/academy/theory/180_embedding_model_selection/_img/embedding_model_evaluation.png and /dev/null differ diff --git a/docs/academy/theory/180_embedding_model_selection/_img/embedding_model_selection_workflow.png b/docs/academy/theory/180_embedding_model_selection/_img/embedding_model_selection_workflow.png deleted file mode 100644 index fcf8ef739..000000000 Binary files a/docs/academy/theory/180_embedding_model_selection/_img/embedding_model_selection_workflow.png and /dev/null differ diff --git 
a/docs/academy/theory/180_embedding_model_selection/_img/embedding_model_selection_workflow_dark.png b/docs/academy/theory/180_embedding_model_selection/_img/embedding_model_selection_workflow_dark.png deleted file mode 100644 index b845474c4..000000000 Binary files a/docs/academy/theory/180_embedding_model_selection/_img/embedding_model_selection_workflow_dark.png and /dev/null differ diff --git a/docs/academy/theory/180_embedding_model_selection/_img/identify_needs_overview.png b/docs/academy/theory/180_embedding_model_selection/_img/identify_needs_overview.png deleted file mode 100644 index fbc8b725a..000000000 Binary files a/docs/academy/theory/180_embedding_model_selection/_img/identify_needs_overview.png and /dev/null differ diff --git a/docs/academy/theory/180_embedding_model_selection/_img/memory-reqs-1m-docs.png b/docs/academy/theory/180_embedding_model_selection/_img/memory-reqs-1m-docs.png deleted file mode 100644 index c9752c43c..000000000 Binary files a/docs/academy/theory/180_embedding_model_selection/_img/memory-reqs-1m-docs.png and /dev/null differ diff --git a/docs/academy/theory/180_embedding_model_selection/_img/mieb_by_any_to_any.png b/docs/academy/theory/180_embedding_model_selection/_img/mieb_by_any_to_any.png deleted file mode 100644 index 4d0011f7a..000000000 Binary files a/docs/academy/theory/180_embedding_model_selection/_img/mieb_by_any_to_any.png and /dev/null differ diff --git a/docs/academy/theory/180_embedding_model_selection/_img/model_cards.png b/docs/academy/theory/180_embedding_model_selection/_img/model_cards.png deleted file mode 100644 index 45ecf3d96..000000000 Binary files a/docs/academy/theory/180_embedding_model_selection/_img/model_cards.png and /dev/null differ diff --git a/docs/academy/theory/180_embedding_model_selection/_img/mteb-benchmarks-by-task-general.png b/docs/academy/theory/180_embedding_model_selection/_img/mteb-benchmarks-by-task-general.png deleted file mode 100644 index ccabab9ed..000000000 Binary files 
a/docs/academy/theory/180_embedding_model_selection/_img/mteb-benchmarks-by-task-general.png and /dev/null differ diff --git a/docs/academy/theory/180_embedding_model_selection/_img/mteb-benchmarks-by-task-specialized.png b/docs/academy/theory/180_embedding_model_selection/_img/mteb-benchmarks-by-task-specialized.png deleted file mode 100644 index 70f115388..000000000 Binary files a/docs/academy/theory/180_embedding_model_selection/_img/mteb-benchmarks-by-task-specialized.png and /dev/null differ diff --git a/docs/academy/theory/180_embedding_model_selection/_img/mteb-tasks-example.png b/docs/academy/theory/180_embedding_model_selection/_img/mteb-tasks-example.png deleted file mode 100644 index 0520cb51d..000000000 Binary files a/docs/academy/theory/180_embedding_model_selection/_img/mteb-tasks-example.png and /dev/null differ diff --git a/docs/academy/theory/180_embedding_model_selection/_img/mteb_by_retrieval.png b/docs/academy/theory/180_embedding_model_selection/_img/mteb_by_retrieval.png deleted file mode 100644 index 28d6bbd79..000000000 Binary files a/docs/academy/theory/180_embedding_model_selection/_img/mteb_by_retrieval.png and /dev/null differ diff --git a/docs/academy/theory/180_embedding_model_selection/index.mdx b/docs/academy/theory/180_embedding_model_selection/index.mdx deleted file mode 100644 index 46732ee9f..000000000 --- a/docs/academy/theory/180_embedding_model_selection/index.mdx +++ /dev/null @@ -1,33 +0,0 @@ ---- -title: "180 Embedding model evaluation & selection" -description: "Learn how to evaluate and select embedding models for your use case." -sidebar_position: 180 # Like a subject number (e.g. CS101) ---- - -## Unit overview - - - -Embedding models form a cornerstone of modern retrieval systems. Recent developments and subsequent proliferation of embedding models have greatly improved their capabilities. But this also makes model selection a very challenging task with a vast set of ever-expanding options. 
- -This module will tackle how to navigate this landscape, and teach skills to screen, evaluate and select models. - -### Prerequisites - -- None - -## Learning objectives - -import LearningGoalsExp from '/src/components/Academy/learningGoalsExp.mdx'; - - - -import LearningGoals from '/src/components/Academy/learningGoals.jsx'; - - - -## Questions and feedback - -import DocsFeedback from '/_includes/docs-feedback.mdx'; - - diff --git a/docs/academy/theory/_category_.json b/docs/academy/theory/_category_.json deleted file mode 100644 index e1eb208b3..000000000 --- a/docs/academy/theory/_category_.json +++ /dev/null @@ -1,4 +0,0 @@ -{ - "label": "Theory", - "position": 90 -} \ No newline at end of file diff --git a/docs/cloud/index.mdx b/docs/cloud/index.mdx index 14e460bb1..5f4fcf46e 100644 --- a/docs/cloud/index.mdx +++ b/docs/cloud/index.mdx @@ -39,7 +39,3 @@ import WCDLandingGetStarted from '/_includes/wcs/wcs-landing-get-started.mdx' import SupportAndTrouble from '/_includes/wcs/support-and-troubleshoot.mdx'; - -import CustomScriptLoader from '/src/components/scriptSwitch'; - - diff --git a/docs/cloud/manage-clusters/connect.mdx b/docs/cloud/manage-clusters/connect.mdx index dc32e1652..b5d4a6c7e 100644 --- a/docs/cloud/manage-clusters/connect.mdx +++ b/docs/cloud/manage-clusters/connect.mdx @@ -205,7 +205,3 @@ To authenticate with a Weaviate client library, see the following: import SupportAndTrouble from '/_includes/wcs/support-and-troubleshoot.mdx'; - -import CustomScriptLoader from '/src/components/scriptSwitch'; - - diff --git a/docs/cloud/platform/multi-factor-auth.mdx b/docs/cloud/platform/multi-factor-auth.mdx index 76d2f3aa7..b5d87e717 100644 --- a/docs/cloud/platform/multi-factor-auth.mdx +++ b/docs/cloud/platform/multi-factor-auth.mdx @@ -93,7 +93,3 @@ Use API keys to connect browser based client applications to Weaviate Cloud. 
import SupportAndTrouble from '/_includes/wcs/support-and-troubleshoot.mdx'; - -import CustomScriptLoader from '/src/components/scriptSwitch'; - - diff --git a/docs/cloud/platform/support-levels.mdx b/docs/cloud/platform/support-levels.mdx index 6641152dc..e16959199 100644 --- a/docs/cloud/platform/support-levels.mdx +++ b/docs/cloud/platform/support-levels.mdx @@ -161,7 +161,3 @@ To estimate costs for different combinations of usage levels and support plans, import SupportAndTrouble from '/_includes/wcs/support-and-troubleshoot.mdx'; - -import CustomScriptLoader from '/src/components/scriptSwitch'; - - diff --git a/docs/cloud/platform/users-and-organizations.mdx b/docs/cloud/platform/users-and-organizations.mdx index 333614fbe..d8fcce64e 100644 --- a/docs/cloud/platform/users-and-organizations.mdx +++ b/docs/cloud/platform/users-and-organizations.mdx @@ -203,7 +203,3 @@ The account gets an invoice for each cluster. Clusters are billed on the 1st day import SupportAndTrouble from '/_includes/wcs/support-and-troubleshoot.mdx'; - -import CustomScriptLoader from '/src/components/scriptSwitch'; - - diff --git a/docs/contributor-guide/weaviate-docs/style-guide.mdx b/docs/contributor-guide/weaviate-docs/style-guide.mdx index 1b20c7c80..4380f7bd5 100644 --- a/docs/contributor-guide/weaviate-docs/style-guide.mdx +++ b/docs/contributor-guide/weaviate-docs/style-guide.mdx @@ -52,8 +52,6 @@ The documentation is organized into key sections. When contributing, understand - **Weaviate Cloud - `/docs/cloud`:** Documentation for Weaviate's managed cloud service. Contains account setup procedures, Weaviate embeddings service documentation, billing and subscription management, service limitations, and cloud-specific configuration options. -- **Academy - `/docs/academy`:** Educational content and learning materials for Weaviate concepts and implementations. Provides structured learning paths and tutorials for different skill levels. 
- - **Integrations - `/docs/integrations`:** Documentation for third-party tools and frameworks that work with Weaviate. Covers client libraries, data import tools, visualization platforms, and other ecosystem integrations. - **Contributor guide - `/docs/contributor-guide`:** Documentation for contributing to Weaviate's open source projects. Includes development setup instructions, coding standards, testing procedures, and contribution workflows for the database, modules, client libraries, and contextionary components. diff --git a/docs/deploy/configuration/env-vars/index.md b/docs/deploy/configuration/env-vars/index.md index 287fd5f47..62956f988 100644 --- a/docs/deploy/configuration/env-vars/index.md +++ b/docs/deploy/configuration/env-vars/index.md @@ -38,7 +38,7 @@ import APITable from '@site/src/components/APITable'; | `DISABLE_TELEMETRY` | Disable [telemetry](/deploy/configuration/telemetry.md) data collection | boolean | `false` | | `DISK_USE_READONLY_PERCENTAGE` | If disk usage is higher than the given percentage all shards on the affected node will be marked as `READONLY`, meaning all future write requests will fail. See [Disk Pressure Warnings and Limits for details](/deploy/configuration/persistence.md#disk-pressure-warnings-and-limits). | `string - number` | `90` | | `DISK_USE_WARNING_PERCENTAGE` | If disk usage is higher than the given percentage a warning will be logged by all shards on the affected node's disk. See [Disk Pressure Warnings and Limits for details](/deploy/configuration/persistence.md#disk-pressure-warnings-and-limits). | `string - number` | `80` | -| `ENABLE_API_BASED_MODULES` | Enable all API-based modules. (Experimental as of `v1.26.0`) | `boolean` | `true` | +| `ENABLE_API_BASED_MODULES` | Enable all API-based modules. 
| `boolean` | `true` | | `ENABLE_MODULES` | Specify Weaviate modules to enable | `string - comma separated names` | `text2vec-openai,generative-openai` | | `ENABLE_TOKENIZER_GSE` | Enable the [`GSE` tokenizer](/weaviate/config-refs/collections.mdx) for use | `boolean` | `true` | | `ENABLE_TOKENIZER_KAGOME_JA` | Enable the [`Kagome` tokenizer for Japanese](/weaviate/config-refs/collections.mdx) for use (Experimental as of `v1.28.0`) | `boolean` | `true` | diff --git a/docs/weaviate/client-libraries/python/async.md b/docs/weaviate/client-libraries/python/async.md index 2abc729cc..ebf39caa4 100644 --- a/docs/weaviate/client-libraries/python/async.md +++ b/docs/weaviate/client-libraries/python/async.md @@ -22,6 +22,12 @@ For asynchronous operations, use the `WeaviateAsyncClient` async client, availab The `WeaviateAsyncClient` async client largely supports the same functions and methods as the `WeaviateClient` [synchronous client](./index.mdx), with the key difference that the async client is designed to be used in an `async` function running in an [`asyncio` event loop](https://docs.python.org/3/library/asyncio-eventloop.html#asyncio-event-loop). +import AcademyAdmonition from '@site/src/components/AcademyAdmonition'; + + + ## Installation The async client is already included in the `weaviate-client` package. Follow the installation instructions in the [Python client library documentation](./index.mdx#installation). 
diff --git a/docs/weaviate/client-libraries/python/index.mdx b/docs/weaviate/client-libraries/python/index.mdx index b57c6514e..562ad05cd 100644 --- a/docs/weaviate/client-libraries/python/index.mdx +++ b/docs/weaviate/client-libraries/python/index.mdx @@ -389,6 +389,12 @@ import CodeExamples from "/_includes/clients/code-examples.mdx"; +import AcademyAdmonition from '@site/src/components/AcademyAdmonition'; + + + ## Questions and feedback import DocsFeedback from "/_includes/docs-feedback.mdx"; diff --git a/docs/weaviate/concepts/index.md b/docs/weaviate/concepts/index.md index 09e1792f4..59a5f5ba7 100644 --- a/docs/weaviate/concepts/index.md +++ b/docs/weaviate/concepts/index.md @@ -6,22 +6,20 @@ image: og/docs/concepts.jpg # tags: ['getting started'] --- +The **Concepts** section explains various aspects related to Weaviate and its architecture to help you get the most out of it. You can read these sections in any order. - +:::info Quickstart -The **Concepts** section explains various aspects related to Weaviate and its architecture to help you get the most out of it. You can read these sections in any order. +If you are after a practical guide with code examples, check out the [quickstart tutorial](/weaviate/quickstart/index.md). -:::info -If you are after a practical guide, try the [quickstart tutorial](/weaviate/quickstart/index.md). 
::: +import AcademyAdmonition from '@site/src/components/AcademyAdmonition'; + + + ## Core concepts **[Data structure](./data.md)** diff --git a/docs/weaviate/concepts/vector-quantization.md b/docs/weaviate/concepts/vector-quantization.md index e125f8d84..6e9ba661f 100644 --- a/docs/weaviate/concepts/vector-quantization.md +++ b/docs/weaviate/concepts/vector-quantization.md @@ -223,8 +223,6 @@ In some cases, rescoring also includes over-fetching, whereby additional candida - [How to configure: Product quantization (compression)](../configuration/compression/pq-compression.md) - [How to configure: Scalar quantization (compression)](../configuration/compression/sq-compression.md) - [How to configure: Rotational quantization (compression)](../configuration/compression/rq-compression.md) -- [Weaviate Academy: 250 Vector Compression](../../academy/py/compression/index.md) - ::: ## Questions and feedback diff --git a/docs/weaviate/configuration/modules.md b/docs/weaviate/configuration/modules.md index 4fd6f9070..e5bd87953 100644 --- a/docs/weaviate/configuration/modules.md +++ b/docs/weaviate/configuration/modules.md @@ -45,10 +45,6 @@ services: ### Enable all API-based modules -:::caution Experimental feature -Available starting in `v1.26.0`. This is an experimental feature. Use with caution. -::: - You can enable all API-based modules by setting the `ENABLE_API_BASED_MODULES` variable to `true`. This will enable all API-based [model integrations](../model-providers/index.md), such as those for Anthropic, Cohere, OpenAI and so on by enabling the relevant modules. These modules are lightweight, so enabling them all will not significantly increase resource usage. 
```yaml diff --git a/docs/weaviate/index.mdx b/docs/weaviate/index.mdx index 07a8d07c1..07c2f0af2 100644 --- a/docs/weaviate/index.mdx +++ b/docs/weaviate/index.mdx @@ -24,7 +24,19 @@ export const welcomeCardsData = [ ), link: "/weaviate/quickstart", - icon: "fas fa-star", // use Font Awesome CSS class + icon: "fas fa-star", + }, + { + id: "academy", + title: "Weaviate Academy", + description: ( + <> + Check out Weaviate Academy – a + learning platform entered around AI-native development. + + ), + link: "https://academy.weaviate.io/", + icon: "fa-solid fa-graduation-cap", }, ]; @@ -42,13 +54,14 @@ The Weaviate documentation is structured into multiple units based on the servic {/* Filter out items where isSmall is true */} {(() => { - const regularItems = Object.fromEntries( - Object.entries(secondaryNavbarItems).filter(([, value]) => !value.isSmall) - ); - -
- return ; -
+const regularItems = Object.fromEntries( +Object.entries(secondaryNavbarItems).filter(([, value]) => !value.isSmall) +); + +{" "} +
+return ; +
})()} ## What is Weaviate? @@ -64,6 +77,12 @@ Weaviate is an **open-source vector database** designed to store and index both - **[Agent-driven workflows](../agents/index.md)** Its flexible API and integration with modern AI models make Weaviate suitable for powering applications that rely on intelligent agents. These agents can leverage semantic insights to make decisions or trigger actions based on the data stored in Weaviate. +import AcademyAdmonition from '@site/src/components/AcademyAdmonition'; + + + ## The Weaviate Ecosystem The Weaviate ecosystem consists of multiple tools and services centered around building cloud-native AI-powered applications. diff --git a/docs/weaviate/manage-collections/tenant-states.mdx b/docs/weaviate/manage-collections/tenant-states.mdx index af3e93ae6..c465e7a76 100644 --- a/docs/weaviate/manage-collections/tenant-states.mdx +++ b/docs/weaviate/manage-collections/tenant-states.mdx @@ -29,9 +29,9 @@ In multi-tenant collections, you can change tenant states (`Active`, `Inactive`, The vector index type affects its default resource type. -* [`HNSW` index (default)](/academy/py/vector_index/hnsw) - uses the vector index in RAM, a **Hot** resource. -* [`Flat` index](/academy/py/vector_index/flat) - uses the vector index on disk, a **Warm** resource. -* [`Dynamic` index](/academy/py/vector_index/dynamic) - starts as a flat index (using a **Warm** resource), then switches to an HNSW index (a **Hot** resource) at a predetermined threshold. +* [`HNSW` index (default)](/weaviate/config-refs/indexing/vector-index.mdx#hnsw-index) - uses the vector index in RAM, a **Hot** resource. +* [`Flat` index](/weaviate/config-refs/indexing/vector-index.mdx#flat-index) - uses the vector index on disk, a **Warm** resource. +* [`Dynamic` index](/weaviate/config-refs/indexing/vector-index.mdx#dynamic-index) - starts as a flat index (using a **Warm** resource), then switches to an HNSW index (a **Hot** resource) at a predetermined threshold. 
diff --git a/docs/weaviate/model-providers/index.md b/docs/weaviate/model-providers/index.md index f81647f82..2e407d46e 100644 --- a/docs/weaviate/model-providers/index.md +++ b/docs/weaviate/model-providers/index.md @@ -8,6 +8,7 @@ image: og/docs/model-provider-integrations.jpg Weaviate integrates with a variety of [self-hosted](#locally-hosted) and [API-based](#api-based) models from a range of providers. This enables an enhanced developed experience, such as the ability to: + - Import objects directly into Weaviate without having to manually specify embeddings, and - Build an integrated retrieval augmented generation (RAG) pipeline with generative AI models. @@ -15,31 +16,25 @@ This enables an enhanced developed experience, such as the ability to: ### API-based -| Model provider | Embeddings | Generative AI | Others | -| --- | --- | --- | --- | -| [Anthropic](./anthropic/index.md) | - | [Text](./anthropic/generative.md) | - | -| [Anyscale](./anyscale/index.md) | - | [Text](./anyscale/generative.md) | - | -| [AWS](./aws/index.md) | [Text](./aws/embeddings.md) | [Text](./aws/generative.md) | -| [Cohere](./cohere/index.md) | [Text](./cohere/embeddings.md), [Multimodal](./cohere/embeddings-multimodal.md) | [Text](./cohere/generative.md) | [Reranker](./cohere/reranker.md) | -| [Databricks](./databricks/index.md) | [Text](./databricks/embeddings.md) | [Text](./databricks/generative.md) | - | -| [FriendliAI](./friendliai/index.md) | - | [Text](./friendliai/generative.md) | - | -| [Google](./google/index.md) | [Text](./google/embeddings.md), [Multimodal](./google/embeddings-multimodal.md) | [Text](./google/generative.md) | - | -| [Hugging Face](./huggingface/index.md) | [Text](./huggingface/embeddings.md) | - | - | -| [Jina AI](./jinaai/index.md) | [Text](./jinaai/embeddings.md), [Multimodal](./jinaai/embeddings-multimodal.md) | - | [Reranker](./jinaai/reranker.md) | -| [Mistral](./mistral/index.md) | [Text](./mistral/embeddings.md) | [Text](./mistral/generative.md) | - 
| -| [NVIDIA](./nvidia/index.md) | [Text](./nvidia/embeddings.md), [Multimodal](./nvidia/embeddings-multimodal.md) | [Text](./nvidia/generative.md) | [Reranker](./nvidia/reranker.md) | -| [OctoAI (Deprecated)](./octoai/index.md) | [Text](./octoai/embeddings.md) | [Text](./octoai/generative.md) | - | -| [OpenAI](./openai/index.md) | [Text](./openai/embeddings.md) | [Text](./openai/generative.md) | - | -| [Azure OpenAI](./openai-azure/index.md) | [Text](./openai-azure/embeddings.md) | [Text](./openai-azure/generative.md) | - | -| [Voyage AI](./voyageai/index.md) | [Text](./voyageai/embeddings.md), [Multimodal](./voyageai/embeddings-multimodal.md) | - | [Reranker](./voyageai/reranker.md) | -| [Weaviate](./weaviate/index.md) | [Text](./weaviate/embeddings.md) | - | - | -| [xAI](./xai/index.md) | - | [Text](./xai/generative.md) | - | - -#### Enable all API-based modules - -:::caution Experimental feature -Available starting in `v1.26.0`. This is an experimental feature. Use with caution. -::: +| Model provider | Embeddings | Generative AI | Others | +| ---------------------------------------- | ----------------------------------------------------------------------------------- | ------------------------------------ | ---------------------------------- | +| [Anthropic](./anthropic/index.md) | - | [Text](./anthropic/generative.md) | - | +| [Anyscale](./anyscale/index.md) | - | [Text](./anyscale/generative.md) | - | +| [AWS](./aws/index.md) | [Text](./aws/embeddings.md) | [Text](./aws/generative.md) | +| [Cohere](./cohere/index.md) | [Text](./cohere/embeddings.md), [Multimodal](./cohere/embeddings-multimodal.md) | [Text](./cohere/generative.md) | [Reranker](./cohere/reranker.md) | +| [Databricks](./databricks/index.md) | [Text](./databricks/embeddings.md) | [Text](./databricks/generative.md) | - | +| [FriendliAI](./friendliai/index.md) | - | [Text](./friendliai/generative.md) | - | +| [Google](./google/index.md) | [Text](./google/embeddings.md), 
[Multimodal](./google/embeddings-multimodal.md) | [Text](./google/generative.md) | - | +| [Hugging Face](./huggingface/index.md) | [Text](./huggingface/embeddings.md) | - | - | +| [Jina AI](./jinaai/index.md) | [Text](./jinaai/embeddings.md), [Multimodal](./jinaai/embeddings-multimodal.md) | - | [Reranker](./jinaai/reranker.md) | +| [Mistral](./mistral/index.md) | [Text](./mistral/embeddings.md) | [Text](./mistral/generative.md) | - | +| [NVIDIA](./nvidia/index.md) | [Text](./nvidia/embeddings.md), [Multimodal](./nvidia/embeddings-multimodal.md) | [Text](./nvidia/generative.md) | [Reranker](./nvidia/reranker.md) | +| [OctoAI (Deprecated)](./octoai/index.md) | [Text](./octoai/embeddings.md) | [Text](./octoai/generative.md) | - | +| [OpenAI](./openai/index.md) | [Text](./openai/embeddings.md) | [Text](./openai/generative.md) | - | +| [Azure OpenAI](./openai-azure/index.md) | [Text](./openai-azure/embeddings.md) | [Text](./openai-azure/generative.md) | - | +| [Voyage AI](./voyageai/index.md) | [Text](./voyageai/embeddings.md), [Multimodal](./voyageai/embeddings-multimodal.md) | - | [Reranker](./voyageai/reranker.md) | +| [Weaviate](./weaviate/index.md) | [Text](./weaviate/embeddings.md) | - | - | +| [xAI](./xai/index.md) | - | [Text](./xai/generative.md) | - | You can enable all API-based integrations at once by [by setting the `ENABLE_API_BASED_MODULES` environment variable to `true`](../configuration/modules.md#enable-all-api-based-modules). 
@@ -49,14 +44,20 @@ Read more about [enabling all API-based modules](../configuration/modules.md#ena ### Locally hosted -| Model provider | Embeddings | Generative AI | Others | -| --- | --- | --- | --- | -| [GPT4All (Deprecated)](./gpt4all/index.md) | [Text (Deprecated)](./gpt4all/embeddings.md) | - | - | -| [Hugging Face](./transformers/index.md) | [Text](./transformers/embeddings.md), [Multimodal (CLIP)](./transformers/embeddings-multimodal.md) | - | [Reranker](./transformers/reranker.md) | -| [KubeAI](./kubeai/index.md) | [Text](./kubeai/embeddings.md) | - | - | -| [Model2vec](./model2vec/index.md) | [Text](./model2vec/embeddings.md) | - | - | -| [Meta ImageBind](./imagebind/index.md) | [Multimodal](./imagebind/embeddings-multimodal.md) | - | - | -| [Ollama](./ollama/index.md) | [Text](./ollama/embeddings.md) | [Text](./ollama/generative.md) | - | +| Model provider | Embeddings | Generative AI | Others | +| ------------------------------------------ | -------------------------------------------------------------------------------------------------- | ------------------------------ | -------------------------------------- | +| [GPT4All (Deprecated)](./gpt4all/index.md) | [Text (Deprecated)](./gpt4all/embeddings.md) | - | - | +| [Hugging Face](./transformers/index.md) | [Text](./transformers/embeddings.md), [Multimodal (CLIP)](./transformers/embeddings-multimodal.md) | - | [Reranker](./transformers/reranker.md) | +| [KubeAI](./kubeai/index.md) | [Text](./kubeai/embeddings.md) | - | - | +| [Model2vec](./model2vec/index.md) | [Text](./model2vec/embeddings.md) | - | - | +| [Meta ImageBind](./imagebind/index.md) | [Multimodal](./imagebind/embeddings-multimodal.md) | - | - | +| [Ollama](./ollama/index.md) | [Text](./ollama/embeddings.md) | [Text](./ollama/generative.md) | - | + +import AcademyAdmonition from '@site/src/components/AcademyAdmonition'; + + ## How does Weaviate generate embeddings? 
@@ -129,6 +130,6 @@ The text is always lowercased for the `text2vec-contextionary` integration. ## Questions and feedback -import DocsFeedback from '/_includes/docs-feedback.mdx'; +import DocsFeedback from '/\_includes/docs-feedback.mdx'; diff --git a/docs/weaviate/more-resources/faq.md b/docs/weaviate/more-resources/faq.md index e4e976530..a57357e04 100644 --- a/docs/weaviate/more-resources/faq.md +++ b/docs/weaviate/more-resources/faq.md @@ -621,6 +621,21 @@ Keep in mind that we don't offer native Windows support at this time and deploym +#### Q: What is Weaviate Academy? + +
+ Answer + +Weaviate Academy is a full-fledged learning platform available at [academy.weaviate.io](https://academy.weaviate.io). + +:::note + +If you need resources from the previous version of Weaviate Academy, check out the [documentation archive](https://archive.docs.weaviate.io/academy) + +::: + +
+ ## Questions and feedback import DocsFeedback from '/_includes/docs-feedback.mdx'; diff --git a/docs/weaviate/quickstart/index.md b/docs/weaviate/quickstart/index.md index 9dcba7d30..b4237344d 100644 --- a/docs/weaviate/quickstart/index.md +++ b/docs/weaviate/quickstart/index.md @@ -483,7 +483,7 @@ Try these additional resources to learn more about Weaviate:

- Check out the Starter guide: retrieval augmented generation, and the Weaviate Academy unit on chunking. + Check out the Starter guide: retrieval augmented generation.

diff --git a/docs/weaviate/quickstart/local.md b/docs/weaviate/quickstart/local.md index 6c3b355d7..9167b1116 100644 --- a/docs/weaviate/quickstart/local.md +++ b/docs/weaviate/quickstart/local.md @@ -432,7 +432,7 @@ Try these additional resources to learn more about Weaviate:

- Check out the Starter guide: retrieval augmented generation, and the Weaviate Academy unit on chunking. + Check out the Starter guide: retrieval augmented generation.

diff --git a/docs/weaviate/search/bm25.md b/docs/weaviate/search/bm25.md index 43d2e44d5..b6a109af6 100644 --- a/docs/weaviate/search/bm25.md +++ b/docs/weaviate/search/bm25.md @@ -662,7 +662,6 @@ Set the tokenization method to `trigram` at the property level when creating you - [Connect to Weaviate](../connections/index.mdx) - [API References: Search operators # BM25](../api/graphql/search-operators.md#bm25) - [Reference: Tokenization options](../config-refs/collections.mdx#tokenization) -- [Weaviate Academy: Tokenization](../../academy/py/tokenization/index.md) ## Questions and feedback diff --git a/docs/weaviate/search/hybrid.md b/docs/weaviate/search/hybrid.md index c6911d485..59ac4094a 100644 --- a/docs/weaviate/search/hybrid.md +++ b/docs/weaviate/search/hybrid.md @@ -833,7 +833,6 @@ import TokenizationNote from '/_includes/tokenization.mdx' - [Connect to Weaviate](/weaviate/connections/index.mdx) - [API References: Search operators # Hybrid](../api/graphql/search-operators.md#hybrid) -- [Weaviate Academy: Tokenization](../../academy/py/tokenization/index.md) - About [hybrid fusion algorithms](https://weaviate.io/blog/hybrid-search-fusion-algorithms). - For tutorials, see [Queries](/weaviate/tutorials/query.md) - For search using the GraphQL API, see [GraphQL API](../api/graphql/get.md). 
diff --git a/docs/weaviate/starter-guides/managing-resources/indexing.mdx b/docs/weaviate/starter-guides/managing-resources/indexing.mdx index 0efb097f7..4dded2e7a 100644 --- a/docs/weaviate/starter-guides/managing-resources/indexing.mdx +++ b/docs/weaviate/starter-guides/managing-resources/indexing.mdx @@ -145,12 +145,6 @@ For more documentation details, see: - [Vector indexing](/weaviate/concepts/vector-index) - [Inverted indexes](/weaviate/concepts/indexing) -### Weaviate academy - -For a short course on vector indexes, see: - -- [Vector indexing](/academy/py/vector_index) - ## Questions and feedback import DocsFeedback from '/_includes/docs-feedback.mdx'; diff --git a/docusaurus.config.js b/docusaurus.config.js index 6d86cdd8e..b429ab226 100644 --- a/docusaurus.config.js +++ b/docusaurus.config.js @@ -64,7 +64,7 @@ const config = { "Comprehensive guides and references for Weaviate, the open-source vector database.", depth: 3, content: { - //excludeRoutes: ["/academy/**", "/contributor-guide/**"], // Throwing an error in GitHub Actions + //excludeRoutes: ["/contributor-guide/**"], // Throwing an error in GitHub Actions enableMarkdownFiles: false, }, //logLevel: 3, // Uncomment to enable debug logging @@ -116,7 +116,7 @@ const config = { image: "og/default.jpg", announcementBar: { id: "announcement-bar-september-2025", - content: `Product update: The Weaviate Query Agent has been released!`, + content: `The new Weaviate Academy learning platform is here!`, backgroundColor: "#1C1468", textColor: "#F5F5F5", isCloseable: true, @@ -135,9 +135,15 @@ const config = { to: "https://github.com/weaviate/weaviate", position: "right", }, + { + label: "Weaviate Academy", + className: "academy-button", + to: "https://academy.weaviate.io", + position: "right", + }, { label: "Weaviate Cloud", - className: "modern-button", + className: "cloud-button", to: "https://console.weaviate.cloud", position: "right", }, diff --git a/netlify.toml b/netlify.toml index 68e589175..29c47779d 
100644 --- a/netlify.toml +++ b/netlify.toml @@ -763,3 +763,45 @@ status = 301 from = "/contributor-guide/contextionary/classification-benchmarks" to = "/contributor-guide" status = 301 + +## Academy redirects + +[[redirects]] +from = "/academy/theory/ai_models_deep_dive/*" +to = "https://academy.weaviate.io/courses/wa150-py" +status = 301 + +[[redirects]] +from = "/academy/theory/hello_weaviate/*" +to = "https://academy.weaviate.io/courses/wa050-py" +status = 301 + +[[redirects]] +from = "/academy/theory/embedding_model_selection/*" +to = "https://academy.weaviate.io/courses/wa150-py" +status = 301 + +[[redirects]] +from = "/academy/deployment/k8s/*" +to = "/deploy/installation-guides/k8s-installation" +status = 301 + +[[redirects]] +from = "/academy/js/set_up_typescript" +to = "/weaviate/client-libraries/typescript" +status = 301 + +[[redirects]] +from = "/academy/js/*" +to = "/weaviate/guides" +status = 301 + +[[redirects]] +from = "/academy/py/*" +to = "https://academy.weaviate.io/courses/wa101t-py" +status = 301 + +[[redirects]] +from = "/academy/*" +to = "https://academy.weaviate.io/" +status = 301 diff --git a/secondaryNavbar.js b/secondaryNavbar.js index 16d26f468..dc1ec031d 100644 --- a/secondaryNavbar.js +++ b/secondaryNavbar.js @@ -109,21 +109,6 @@ const secondaryNavbarItems = { }, ], }, - academy: { - title: "Academy", - icon: "fa fa-graduation-cap", - isSmall: true, - description: - "Learn about vector search and Weaviate through structured courses", - link: "/academy", - links: [ - { - label: "Get Started", - link: "/academy", - sidebar: "academySidebar", - }, - ], - }, integrations: { title: "Integrations", icon: "fa fa-puzzle-piece", @@ -165,6 +150,14 @@ const secondaryNavbarItems = { href: "https://weaviate.io/community/events", links: [], }, + academy: { + title: "Weaviate Academy", + icon: "fa-solid fa-graduation-cap", + isSmall: true, + description: "", + href: "https://academy.weaviate.io/", + links: [], + }, }; export default 
secondaryNavbarItems; diff --git a/sidebars.js b/sidebars.js index 446e42443..6e3c4341a 100644 --- a/sidebars.js +++ b/sidebars.js @@ -1137,12 +1137,6 @@ const sidebars = { ], }, ], - academySidebar: [ - { - type: "autogenerated", - dirName: "academy", - }, - ], contributorSidebar: [ { type: "category", diff --git a/src/components/Academy/academy.css b/src/components/Academy/academy.css deleted file mode 100644 index 124f3ea5f..000000000 --- a/src/components/Academy/academy.css +++ /dev/null @@ -1,112 +0,0 @@ -.__academyhero { - background-color: #00152b; - border-radius: 10px; - border-style: solid; - border-color: gray; - padding: 0; -} - -.__academycard { - background-color: transparent; - border-color: #808080; - border-style: solid; - margin-bottom: 15px; -} - -.__academy_cardgroup { - margin-bottom: 10px; -} - -.learning_goal_head { - padding-bottom: 15px; -} - -.quiz { - width: 90%; - min-height: 200px; - height: min-content; - border-radius: 15px; - padding: 20px; - margin-bottom: 20px; - display: flex; - border: 2px solid #6b9fd6; - justify-content: space-evenly; -} - -.feedback-section { - width: 100%; - position: relative; - align-items: center; -} - -/* QUESTION/TIMER/LEFT SECTION */ -.question-section { - width: 100%; - position: relative; -} - -.question-count { - margin-bottom: 20px; -} - -.question-count span { - font-size: 28px; -} - -.question-count { - margin-bottom: 20px; - font-size: 1.4rem; - font-weight: bold; -} - -.question-text { - margin-bottom: 12px; - margin-right: 10px; - white-space: pre-line; - font-size: 1.1rem; - line-height: 1.4; -} - -/* ANSWERS/RIGHT SECTION */ -.answer-section { - width: 100%; - display: flex; - flex-direction: column; - justify-content: space-between; -} - -.quiz-button { - width: 100%; - font-size: 16px; - border-radius: 5px; - display: flex; - padding: 5px; - margin: 5px; - justify-content: center; - align-items: center; - border: 2px solid #6b9fd6; - cursor: pointer; - transition: background-color 0.3s, 
color 0.3s; -} - -.code_answer { - text-align: left; - white-space: pre-wrap; -} - -.correct { - background-color: #2f922f; -} - -.incorrect { - background-color: #ff3333; -} - -.quiz-button:hover { - background-color: #2c2c2c; - color: #ffffff; -} - -.quiz-button:focus { - outline: none; -} \ No newline at end of file diff --git a/src/components/Academy/card.jsx b/src/components/Academy/card.jsx deleted file mode 100644 index 05ad676af..000000000 --- a/src/components/Academy/card.jsx +++ /dev/null @@ -1,71 +0,0 @@ -import React from 'react'; -import Link from '@docusaurus/Link'; -import './academy.css' - -function AcademyCard(props) { - - let badgeClass = "" - let badgeTxt = "" - let btnClass = "" - let btnTxt = "" - let btnURL = "" - let btnComp = "" - let note = "" - - if (props.badgeType == "theory") { - badgeClass = "badge badge--success"; - badgeTxt = "Theory"; - } else if (props.badgeType == "practical") { - badgeClass = "badge badge--info"; - badgeTxt = "Practical"; - } else if (props.badgeType == "course") { - badgeClass = "badge badge--primary"; - badgeTxt = "Course"; - } else if (props.badgeType == "mixed") { - badgeClass = "badge badge--warning"; - badgeTxt = "Mixed"; - } else { - badgeClass = "badge badge--secondary"; - badgeTxt = "Other"; - }; - - if (props.buttonType == "Notify") { - btnClass = "button button--outline button--secondary button--block"; - btnTxt = "Notify me when ready"; - } else if (props.buttonType == "TBD") { - btnClass = "button button--outline button--secondary button--block"; - btnTxt = "Coming soon"; - } else { - btnClass = "button button--primary button--block"; - btnTxt = props.buttonType; - }; - - if (props.buttonURL == null) { - btnComp = - } else { - btnURL = props.buttonURL - btnComp = {btnTxt} - } - - // const badgeClass = "badge badge--note"; - // const badgeTxt = props.badgeType; - - return ( -
-
-

{props.title}

-
-
- {badgeTxt} {props.note != null? {props.note}: null } -

- {props.body} -

-
- -
- ) -}; - -export default AcademyCard; \ No newline at end of file diff --git a/src/components/Academy/courseData.js b/src/components/Academy/courseData.js deleted file mode 100644 index 102790212..000000000 --- a/src/components/Academy/courseData.js +++ /dev/null @@ -1,366 +0,0 @@ -export const courseData = { - "starter_text_data": { - title: "Text data with Weaviate", - courseId: "PY_101T", - body: "Project-based learning where you'll learn how to build with Weaviate and any text data. Weaviate generates the vectors for you.", - buttonType: "Click here", - buttonURL: "/academy/py/starter_text_data", - badgeType: "course", - isCourse: true, - units: [ - "text_setup_weaviate", "text_collections", "text_searches", "text_rag" - ], - learningGoals: [ - "How to create a Weaviate instance, add data to it to enable semantic searching, and use AI through retrieval augmented generation." - ], - learningOutcomes: [ - "Create an instance of Weaviate for you to use", - "Produce, store and index semantic (vector) data from source text", - "Perform semantic, keyword and hybrid searches", - "Use AI (large language models) to augment and transform retrieved data", - ], - note: "Python client (v4); project-based" - }, - "starter_custom_vectors": { - title: "Your own vectors with Weaviate", - courseId: "PY_101V", - body: "Project-based learning where you'll learn how to build with Weaviate and your own data and vectors. This version is for those who prefer to use your own vectors built outside of Weaviate.", - buttonType: "Click here", - buttonURL: "/academy/py/starter_custom_vectors", - badgeType: "course", - isCourse: true, - units: [ - "byov_setup_weaviate", "byov_collections", "byov_searches", "byov_rag" - ], - learningGoals: [ - "How to create a cloud Weaviate instance, add data to it to enable semantic searching, and use AI through retrieval augmented generation." 
- ], - learningOutcomes: [ - "Create a instance of Weaviate for you to use", - "Produce, store and index data with your own vectors", - "Perform vector, keyword and hybrid searches", - "Use AI (large language models) to augment and transform retrieved data", - ], - note: "Python client (v4); project-based" - }, - "starter_multimodal": { - title: "Multimodal data with Weaviate", - courseId: "PY_101M", - body: "Project-based learning where you'll learn how to build with Weaviate and multi-modal data. Weaviate generates the vectors for you.", - buttonType: "Click here", - buttonURL: "/academy/py/starter_multimodal_data", - badgeType: "course", - isCourse: true, - units: [ - "docker_mm_basics", "mm_collections", "mm_searches", "mm_rag" - ], - learningGoals: [ - "How to create a local Weaviate instance, add data to it to enable multi-modal searching, and use AI through retrieval augmented generation." - ], - learningOutcomes: [ - "Create a local instance of Weaviate with a multimodal vectorizer module", - "Produce, store and index multimodal data", - "Perform multimodal searches", - "Use AI (large language models) to augment and transform retrieved data", - ], - note: "Python client (v4); project-based" - }, - "named_vectors": { - title: "Flexible data representation: Named vectors", - courseId: "PY_220", - body: "Learn how named vectors can provide a flexible way to represent your data in Weaviate.", - buttonType: "Click here", - buttonURL: "/academy/py/named_vectors", - badgeType: "course", - isCourse: true, - units: [ - - ], - learningGoals: [ - "What named vectors can be used for, and how to add them to your data collection." 
- ], - learningOutcomes: [ - "Describe use cases for named vectors", - "Create a collection with named vectors", - "Add objects with multiple vector embeddings per object", - "Perform searches on named vectors", - ], - note: "Python client (v4); project-based" - }, - "vector_index": { - title: "Vector indexes", - courseId: "PY_230", - body: "Vector indexes are the key components for vector search. Learn what they are, and how to use them effectively to suit your needs.", - buttonType: "Click here", - buttonURL: "/academy/py/vector_index", - badgeType: "course", - isCourse: true, - units: [ - "vindex_overview", "vindex_hnsw", "vindex_flat", "vindex_dynamic" - ], - learningGoals: [ - "What vector index types are available, when to select each one and how to configure them." - ], - learningOutcomes: [ - "Name available vector index types in Weaviate.", - "Select an appropriate index type for a given use case.", - "Recite relationships between HNSW parameters and search performance.", - "Describe how quantization affects each index type.", - "Create collections with your chosen vector index type and preferred parameters.", - ], - note: "Python client (v4)" - }, - "compression": { - title: "Vector compression for improved efficiency", - courseId: "PY_250", - body: "Vectors can be compressed to reduce memory requirements or improve retrieval speeds. Find out how to get the most out of this feature.", - buttonType: "Click here", - buttonURL: "/academy/py/compression", - badgeType: "course", - isCourse: true, - units: [ - "compression_pq", "compression_bq", "compression_strategy" - ], - learningGoals: [ - "What vector compression algorithms are available, how to use them and when to use them." 
- ], - learningOutcomes: [ - "Name available vector compression algorithms in Weaviate.", - "Create collections with vector compression enabled.", - "Configure vector compression parameters.", - "Select a compression algorithm for a given use case.", - ], - note: "Python client (v4)" - }, - "tokenization": { - title: "Text tokenization", - courseId: "PY_275", - body: "What happens when text is indexed, and searched, or converted into a vector? They are 'tokenized'. Learn what this is, and how you can make it work for you.", - buttonType: "Click here", - buttonURL: "/academy/py/tokenization", - badgeType: "course", - isCourse: true, - units: [ - "tokenization_basics", "tokenization_options", "tokenization_filters", "tokenization_searches" - ], - learningGoals: [ - "What tokenization is, and why it is required." - ], - learningOutcomes: [ - "Identify tokenized text from raw text.", - "Name different tokenization options in Weaviate.", - "Select an appropriate tokenization option for a given use case.", - "Name languages for which specific tokenization options are available.", - ], - note: "Python client (v4)" - }, - "setup_weaviate_typescript": { - title: "Set up TypeScript (or Javascript) for Weaviate", - courseId: "TS_100", - body: "A quick run through of how to set up and install the Weaviate TypeScript client.", - buttonType: "Click here", - buttonURL: "/academy/js/set_up_typescript", - badgeType: "course", - isCourse: true, - units: [ - "setup_weaviate_typescript" - ], - learningGoals: [ - "Setup Weaviate to get started building TypeScript (or JavaScript) apps." - ], - learningOutcomes: [ - "Install Node.js", - "(Optionally) Install and set up TypeScript", - "Install the Weaviate client", - ], - note: "TS clients" - }, - "starter_multimodal_typescript": { - title: "Multimodal data with Weaviate", - courseId: "TS_101M", - body: "Project-based learning where you'll learn how to build with Weaviate and multi-modal data. 
Weaviate generates the vectors for you.", - buttonType: "Click here", - buttonURL: "/academy/js/starter_multimodal_data", - badgeType: "course", - isCourse: true, - units: [ - "docker_mm_basics_ts", "mm_collections_ts", "mm_searches_ts", "mm_rag_ts" - ], - learningGoals: [ - "How to create a Weaviate instance, add data to it to enable multi-modal searching, and use AI through retrieval augmented generation." - ], - learningOutcomes: [ - "Create an instance of Weaviate with a multimodal vectorizer module", - "Produce, store and index multimodal data", - "Perform multimodal searches", - "Use AI (large language models) to augment and transform retrieved data", - ], - note: "TS clients; project-based" - }, - "multi-tenancy": { - title: "Multi-tenancy", - courseId: "PY_280", - body: "Learn how to implement and manage multi-tenancy in Weaviate for efficient data isolation and resource management.", - buttonType: "Click here", - buttonURL: "/academy/py/multitenancy", - badgeType: "course", - isCourse: true, - units: [ - "mt_overview", "mt_setup", "mt_tenant_data", "mt_manage_tenants" - ], - learningGoals: [ - "Understand multi-tenancy concepts and their application in Weaviate", - "Learn how to set up and manage multi-tenant collections for scalable applications", - "Master techniques for efficient resource management in multi-tenant environments" - ], - learningOutcomes: [ - "Explain the concept of multi-tenancy and its benefits in Weaviate", - "Set up a Weaviate instance and configure collections for multi-tenancy", - "Create, manage, and remove tenants in a multi-tenant collection", - "Perform data operations and queries specific to individual tenants", - "Implement efficient resource management using tenant activity statuses", - "Utilize advanced features like auto-tenant creation, activation, and offloading", - "Apply multi-tenancy concepts to real-world scenarios for improved scalability and data isolation" - ], - note: "Python client (v4)" - }, - 
"starter_text_data_typescript": { - title: "Text data with Weaviate", - courseId: "TS_101T", - body: "Project-based learning where you'll learn how to build with Weaviate and any text data. Weaviate generates the vectors for you.", - buttonType: "Click here", - buttonURL: "/academy/js/starter_text_data", - badgeType: "course", - isCourse: true, - units: [ - "text_setup_weaviate_ts", "text_collections_ts", "text_searches_ts", "text_rag_ts" - ], - learningGoals: [ - "How to create a Weaviate instance, add data to it to enable semantic searching, and use AI through retrieval augmented generation." - ], - learningOutcomes: [ - "Create an instance of Weaviate for you to use", - "Produce, store and index semantic (vector) data from source text", - "Perform semantic, keyword and hybrid searches", - "Use AI (large language models) to augment and transform retrieved data", - ], - note: "TS clients; project-based" - }, - // "building_with_weaviate": { - // title: "Additional topics", - // courseId: "PY_200", - // body: "Expand on the `Getting to MVP` course for deeper dives into key topics vectorizer selection, multi-modal models, and best practices.", - // buttonType: "Click here", - // buttonURL: "/academy/py/building_with_weaviate", - // badgeType: "course", - // isCourse: true, - // units: [ - // "which_search", - // "schema_design", - // "vectorizer_selection", - // "indexing", - // ], - // learningGoals: [ - // "In-depth material and best practices to help you build with Weaviate, such as vectorization options, which searches to perform and how to work with your indexes." 
- // ], - // learningOutcomes: [ - // "Select a suitable vectorizer for your given goals and situation.", - // "Understand practical differences between search methods and suggest a suitable technique for a given situation.", - // "Compare types of indexes used by Weaviate, and modify parameters to balance speed and recall.", - // ] - // }, - // "configuring_weaviate_instance": { - // title: "Customization using modules", - // courseId: "2", - // body: "", - // buttonType: "Notify", - // badgeType: "course", - // isCourse: true, - // units: [ - // "t2v_under_hood", "vectorizer_selection_2", "custom_models", "module_building" - // ], - // learningGoals: [ - // "TBC" - // ], - // learningOutcomes: [ - // "TBC" - // ] - // }, - // "to_production": { - // title: "Getting to Production", - // courseId: "3", - // body: "Speed to production with authentication & authorization, backups, monitoring and replication.", - // buttonType: "Notify", - // badgeType: "course", - // isCourse: true, - // units: [ - // "backups", "auth", "scaling", "replication", "migration", "kubernetes" - // ], - // learningGoals: [ - // "TBC" - // ], - // learningOutcomes: [ - // "TBC" - // ] - // }, - "kubernetes_intro": { - title: "Run Weaviate on Kubernetes", - courseId: "D100", - body: "Learn how to run Weaviate on a local kubernetes cluster with Minikube.", - buttonType: "Click here", - buttonURL: "/academy/deployment/k8s", - badgeType: "course", - isCourse: false, - units: [ - "kubernetes_intro" - ] - }, - "standalone": { - title: "Standalone units", - courseId: "0", - body: "Bite-sized, standalone units that can be reviewed by themselves.", - buttonType: "Notify", - badgeType: "course", - isCourse: false, - units: [ - "which_search", - "chunking" - ] - }, - "standalone_js": { - title: "Standalone units (JS/TS)", - courseId: "0", - body: "Bite-sized, standalone units that can be reviewed by themselves.", - buttonType: "Notify", - badgeType: "course", - isCourse: false, - units: [ - 
"which_search", - "client_server", - "using_ml_models", - ] - }, - "zero_to_mvp": { - title: "Zero to MVP: The basics", - courseId: "P3_1", - body: "Start here: Get started with all the core knowledge and essential skills for building with Weaviate. Learn how to build a Weaviate database and effectively perform queries to find the right data.", - buttonType: "Click here", - buttonURL: "/academy/py/zero_to_mvp", - badgeType: "course", - isCourse: true, - units: [ - "hello_weaviate", "queries_1", "schema_and_import", "queries_2" - ], - learningGoals: [ - "How to build a Weaviate instance and populate it with vectorized data, as well as how to construct queries to efficiently retrieve relevant data." - ], - learningOutcomes: [ - "Use Weaviate Cloud to create an instance of Weaviate", - "Use appropriate query types and syntax to retrieve desired objects", - "Outline what vector search is and how it works", - "Demonstrate how to efficiently populate a Weaviate instance with data", - "Differentiate BM25 and hybrid search techniques from vector search techniques", - ], - note: "Python client (v3)" - } -}; diff --git a/src/components/Academy/courseUnits.jsx b/src/components/Academy/courseUnits.jsx deleted file mode 100644 index 40366705d..000000000 --- a/src/components/Academy/courseUnits.jsx +++ /dev/null @@ -1,29 +0,0 @@ -import React from "react"; -import UnitCardSet from "./unitcards"; -import { unitData } from '/src/components/Academy/unitData.js' -import './academy.css' - -function Units(props) { - - let cardData = props.courseData; - let courseName = props.courseName; - let cardItems = []; - - for (let k in cardData) { - if (k == courseName || courseName == null) { - let units = cardData[k].units.map(d => unitData[d]) - - cardItems.push( -
-
- -
-
- ) - } - }; - - return (
{cardItems}
) -} - -export default Units; \ No newline at end of file diff --git a/src/components/Academy/coursecards.jsx b/src/components/Academy/coursecards.jsx deleted file mode 100644 index 8b2803f88..000000000 --- a/src/components/Academy/coursecards.jsx +++ /dev/null @@ -1,30 +0,0 @@ -import React from 'react'; -import AcademyCard from './card'; -import './academy.css' - -function CourseCardSet(props) { - - let cardData = props.cardData; - let cardItems = []; - - for (let k in cardData) { - if (cardData[k].isCourse) { - cardItems.push( -
- -
- ) - } - }; - - return (
{cardItems}
) -} - -export default CourseCardSet; diff --git a/src/components/Academy/courses.jsx b/src/components/Academy/courses.jsx deleted file mode 100644 index 698fc775e..000000000 --- a/src/components/Academy/courses.jsx +++ /dev/null @@ -1,14 +0,0 @@ -import React from 'react'; -import CourseCardSet from './coursecards'; -import './academy.css' - -function Courses(props) { - - return ( -
- -
- ) -}; - -export default Courses; \ No newline at end of file diff --git a/src/components/Academy/learningGoals.jsx b/src/components/Academy/learningGoals.jsx deleted file mode 100644 index 95e733bdc..000000000 --- a/src/components/Academy/learningGoals.jsx +++ /dev/null @@ -1,53 +0,0 @@ -import React from 'react'; -import { unitData } from '/src/components/Academy/unitData.js'; -import { courseData } from '/src/components/Academy/courseData.js'; -import './academy.css' - -function LearningGoals(props) { - - let data = "" - if (props.unitName !== undefined) { - data = unitData[props.unitName] - } else if (props.courseName !== undefined) { - data = courseData[props.courseName] - } - - let goalsArray = []; - let outcomesArray = []; - - for (let i = 0; i < data.learningGoals.length; i++) { - goalsArray.push( -
  • - {data.learningGoals[i]} -
  • - ); - }; - for (let i = 0; i < data.learningOutcomes.length; i++) { - outcomesArray.push( -
  • - {data.learningOutcomes[i]} -
  • - ) - }; - - return ( -
    -
    -
    -

      Here, we will cover:

    - Learning Goals -
    - -
      {goalsArray}
    -
    -
    -

      By the time you are finished, you will be able to:

    - Learning Outcomes -
    -
      {outcomesArray}
    -
    -
    - ) -}; - -export default LearningGoals; diff --git a/src/components/Academy/learningGoalsExp.mdx b/src/components/Academy/learningGoalsExp.mdx deleted file mode 100644 index 1fec4e800..000000000 --- a/src/components/Academy/learningGoalsExp.mdx +++ /dev/null @@ -1,8 +0,0 @@ -
    -   What are these? -   Each unit includes a set of Learning Goals and Learning Outcomes which form the unit's guiding principles. -
      -
    • Learning Goals describe the unit's key topics and ideas.
    • -
    • Learning Outcomes on the other hand describe tangible skills that the learner should be able to demonstrate
    • -
    -
    diff --git a/src/components/Academy/quiz.js b/src/components/Academy/quiz.js deleted file mode 100644 index e3bb4d8c4..000000000 --- a/src/components/Academy/quiz.js +++ /dev/null @@ -1,70 +0,0 @@ -import React, { useState } from 'react'; -import './academy.css' - -export default function Quiz(props) { - - const questions = props.questions - - const [currentQuestion, setCurrentQuestion] = useState(0); - const [showFeedback, setShowFeedback] = useState(false); - const [feedback, setFeedback] = useState(""); - const [score, setScore] = useState(0); - - const handleAnswerOptionClick = (isCorrect, feedback) => { - if (isCorrect) { - setScore(score + 1); - setFeedback("That's right!\n\n" + feedback); - } else { - setFeedback("That's not right.\n\n" + feedback); - } - - const nextQuestion = currentQuestion + 1; - if (nextQuestion < questions.length) { - setCurrentQuestion(nextQuestion); - } else { - setShowFeedback(true); - } - }; - - const handleRetryClick = () => { - setScore(0) - setShowFeedback(false); - }; - - let btnClass = "quiz-button" - if (props.isCode == true) { - btnClass = "quiz-button code_answer" - } - - return ( -
    - {showFeedback ? ( -
    -
    -

      Feedback

    -
    {feedback}
    -
    -
    - -
    -
    - ) : ( - <> -
    -
    -   Question -
    -
    {questions[currentQuestion].questionText}
    -
    -
    - {questions[currentQuestion].answerOptions.map((answerOption) => ( -
    - -
    - ))} -
    - - )} -
    - ); -} \ No newline at end of file diff --git a/src/components/Academy/unitData.js b/src/components/Academy/unitData.js deleted file mode 100644 index 7bdf10b85..000000000 --- a/src/components/Academy/unitData.js +++ /dev/null @@ -1,1189 +0,0 @@ -export const unitData = { - "text_setup_weaviate": { - title: "Set up Weaviate", - body: "Set up a Weaviate instance and connect to it.", - buttonType: "Click here", - buttonURL: "/academy/py/starter_text_data/setup_weaviate", - badgeType: "practical", - learningGoals: [ - "Set up a Weaviate instance and connect to it with the Python client." - ], - learningOutcomes: [ - "Install the latest Weaviate Python client.", - "Create a Weaviate instance.", - "Connect to the Weaviate instance using the Python client and communicate with it." - ] - }, - "text_setup_weaviate_ts": { - title: "Set up Weaviate", - body: "Set up a Weaviate instance and connect to it.", - buttonType: "Click here", - buttonURL: "/academy/js/starter_text_data/setup_weaviate", - badgeType: "practical", - learningGoals: [ - "Set up a Weaviate instance and connect to it with the JavaScript/Typescript client." - ], - learningOutcomes: [ - "Install the latest Weaviate JavaScript/Typescript client.", - "Create a Weaviate instance.", - "Connect to the Weaviate instance using the JavaScript/Typescript client and communicate with it." - ] - }, - "text_collections": { - title: "Populate the database", - body: "Create a collection and import data, and have Weaviate create vectors for you.", - buttonType: "Click here", - buttonURL: "/academy/py/starter_text_data/text_collections", - badgeType: "practical", - learningGoals: [ - "Configure and create a collection then import text data using batch imports." - ], - learningOutcomes: [ - "Configure a collection with typical settings and vectorizer set.", - "Create a collection and work with a collection object.", - "Import data using batch imports." 
- ] - }, - "text_collections_ts": { - title: "Populate the database", - body: "Create a collection and import data, and have Weaviate create vectors for you.", - buttonType: "Click here", - buttonURL: "/academy/js/starter_text_data/text_collections", - badgeType: "practical", - learningGoals: [ - "Configure and create a collection then import text data using batch imports." - ], - learningOutcomes: [ - "Configure a collection with typical settings and vectorizer set.", - "Create a collection and work with a collection object.", - "Import data using batch imports." - ] - }, - "text_searches": { - title: "Perform searches", - body: "Learn how to use search functions in Weaviate.", - buttonType: "Click here", - buttonURL: "/academy/py/starter_text_data/text_searches", - badgeType: "practical", - learningGoals: [ - "Perform semantic, keyword and hybrid searches." - ], - learningOutcomes: [ - "Describe differences between semantic, keyword and hybrid searches at a high level.", - "Perform a semantic search with near text functions.", - "Perform a keyword search.", - "Perform a hybrid search.", - ] - }, - "text_searches_ts": { - title: "Perform searches", - body: "Learn how to use search functions in Weaviate.", - buttonType: "Click here", - buttonURL: "/academy/js/starter_text_data/text_searches", - badgeType: "practical", - learningGoals: [ - "Perform semantic, keyword and hybrid searches." - ], - learningOutcomes: [ - "Describe differences between semantic, keyword and hybrid searches at a high level.", - "Perform a semantic search with near text functions.", - "Perform a keyword search.", - "Perform a hybrid search.", - ] - }, - "text_rag": { - title: "LLMs and Weaviate (RAG)", - body: "Use large language models to augment and transform retrieved data.", - buttonType: "Click here", - buttonURL: "/academy/py/starter_text_data/text_rag", - badgeType: "practical", - learningGoals: [ - "Use AI (large language models) to augment and transform retrieved data." 
- ], - learningOutcomes: [ - "Describe what RAG is and how it works at a high level.", - "Perform RAG on individual objects.", - "Perform RAG on the entire set of returned objects.", - ] - }, - "text_rag_ts": { - title: "LLMs and Weaviate (RAG)", - body: "Use large language models to augment and transform retrieved data.", - buttonType: "Click here", - buttonURL: "/academy/js/starter_text_data/text_rag", - badgeType: "practical", - learningGoals: [ - "Use AI (large language models) to augment and transform retrieved data." - ], - learningOutcomes: [ - "Describe what RAG is and how it works at a high level.", - "Perform RAG on individual objects.", - "Perform RAG on the entire set of returned objects.", - ] - }, - "byov_setup_weaviate": { - title: "Set up Weaviate", - body: "Set up a Weaviate instance and connect to it.", - buttonType: "Click here", - buttonURL: "/academy/py/starter_custom_vectors/setup_weaviate", - badgeType: "practical", - learningGoals: [ - "Set up a Weaviate instance and connect to it with the Python client." - ], - learningOutcomes: [ - "Install the latest Weaviate Python client.", - "Create a Weaviate instance.", - "Connect to the Weaviate instance using the Python client and communicate with it." - ] - }, - "byov_collections": { - title: "Populate the database", - body: "Create a collection and import data with your own vectors.", - buttonType: "Click here", - buttonURL: "/academy/py/starter_custom_vectors/object_collections", - badgeType: "practical", - learningGoals: [ - "Configure and create a collection then import text data with your own vectors using batch imports." - ], - learningOutcomes: [ - "Configure a collection with typical settings and vectorizer set as none.", - "Create a collection and work with a collection object.", - "Import data using batch imports." 
- ] - }, - "byov_searches": { - title: "Perform searches", - body: "Learn how to use search functions in Weaviate.", - buttonType: "Click here", - buttonURL: "/academy/py/starter_custom_vectors/object_searches", - badgeType: "practical", - learningGoals: [ - "Perform semantic, keyword and hybrid searches." - ], - learningOutcomes: [ - "Describe differences between vector, keyword and hybrid searches at a high level.", - "Perform a vector search with near vector functions.", - "Perform a keyword search.", - "Perform a hybrid search.", - ] - }, - "byov_rag": { - title: "LLMs and Weaviate (RAG)", - body: "Use large language models to augment and transform retrieved data.", - buttonType: "Click here", - buttonURL: "/academy/py/starter_custom_vectors/object_rag", - badgeType: "practical", - learningGoals: [ - "Use AI (large language models) to augment and transform retrieved data." - ], - learningOutcomes: [ - "Describe what RAG is and how it works at a high level.", - "Perform RAG on individual objects.", - "Perform RAG on the entire set of returned objects.", - ] - }, - "docker_mm_basics": { - title: "Weaviate for multimodal data", - body: "Create a local Weaviate instance for multimodal data.", - buttonType: "Click here", - buttonURL: "/academy/py/starter_multimodal_data/setup_weaviate", - badgeType: "practical", - learningGoals: [ - "Set up a local Weaviate instance with a multimodal module." - ], - learningOutcomes: [ - "Create a local Weaviate instance with a multimodal module using Docker.", - "Install the latest Weaviate Python client.", - "Connect to the Weaviate instance using the Python client." - ] - }, - "docker_mm_basics_ts": { - title: "Weaviate for multimodal data", - body: "Create a Weaviate instance for multimodal data.", - buttonType: "Click here", - buttonURL: "/academy/js/starter_multimodal_data/setup_weaviate", - badgeType: "practical", - learningGoals: [ - "Set up a Weaviate instance with a multimodal module." 
- ], - learningOutcomes: [ - "Create a Weaviate instance with a multimodal module using Docker.", - "Install the latest Weaviate TypeScript client.", - "Connect to the Weaviate instance using the Typescript client." - ] - }, - "mm_collections": { - title: "Populate the database", - body: "Create a collection and import multimodal data, and have Weaviate create vectors for you.", - buttonType: "Click here", - buttonURL: "/academy/py/starter_multimodal_data/mm_collections", - badgeType: "practical", - learningGoals: [ - "Configure and create a collection then import multimodal data using batch imports." - ], - learningOutcomes: [ - "Configure a collection with typical settings and vectorizer set.", - "Create a collection and work with a collection object.", - "Import data using batch imports." - ] - }, - "mm_collections_ts": { - title: "Populate the database", - body: "Create a collection and import multimodal data, and have Weaviate create vectors for you.", - buttonType: "Click here", - buttonURL: "/academy/js/starter_multimodal_data/mm_collections", - badgeType: "practical", - learningGoals: [ - "Configure and create a collection then import multimodal data using batch imports." - ], - learningOutcomes: [ - "Configure a collection with typical settings and vectorizer set.", - "Create a collection and work with a collection object.", - "Import data using batch imports." - ] - }, - "mm_searches": { - title: "Perform searches", - body: "Learn how to use search functions in Weaviate.", - buttonType: "Click here", - buttonURL: "/academy/py/starter_multimodal_data/mm_searches", - badgeType: "practical", - learningGoals: [ - "Perform multimodal searches." 
- ], - learningOutcomes: [ - "Describe how multimodal search works at a high level.", - "Perform searches through multimodal data in different ways.", - ] - }, - "mm_searches_ts": { - title: "Perform searches", - body: "Learn how to use search functions in Weaviate.", - buttonType: "Click here", - buttonURL: "/academy/js/starter_multimodal_data/mm_searches", - badgeType: "practical", - learningGoals: [ - "Perform multimodal searches." - ], - learningOutcomes: [ - "Describe how multimodal search works at a high level.", - "Perform searches through multimodal data in different ways.", - ] - }, - "mm_rag": { - title: "LLMs and Weaviate (RAG)", - body: "Use large language models to augment and transform retrieved data.", - buttonType: "Click here", - buttonURL: "/academy/py/starter_multimodal_data/mm_rag", - badgeType: "practical", - learningGoals: [ - "Use AI (large language models) to augment and transform retrieved data." - ], - learningOutcomes: [ - "Describe what RAG is and how it works at a high level.", - "Perform RAG on individual objects.", - "Perform RAG on the entire set of returned objects.", - ] - }, - "mm_rag_ts": { - title: "LLMs and Weaviate (RAG)", - body: "Use large language models to augment and transform retrieved data.", - buttonType: "Click here", - buttonURL: "/academy/js/starter_multimodal_data/mm_rag", - badgeType: "practical", - learningGoals: [ - "Use AI (large language models) to augment and transform retrieved data." 
- ], - learningOutcomes: [ - "Describe what RAG is and how it works at a high level.", - "Perform RAG on individual objects.", - "Perform RAG on the entire set of returned objects.", - ] - }, - "nv_collections": { - title: "Populate the database", - body: "Create a collection with multiple named vectors and import data objects.", - buttonType: "Click here", - buttonURL: "/academy/py/named_vectors/nv_collections", - badgeType: "practical", - learningGoals: [ - "Configure and create a collection with multiple named vectors then import data using batch imports." - ], - learningOutcomes: [ - "Configure a collection with named vectors.", - "Import data using batch imports to create objects with multiple named vectors." - ] - }, - "nv_queries": { - title: "Perform queries", - body: "Learn how to use query functions in Weaviate with named vectors.", - buttonType: "Click here", - buttonURL: "/academy/py/named_vectors/nv_queries", - badgeType: "practical", - learningGoals: [ - "How queries change for collections with named vectors." - ], - learningOutcomes: [ - "Describe what query types are affected by named vectors.", - "Perform vector/similarity and hybrid searches in collections with named vectors.", - "Perform RAG on collections with named vectors.", - ] - }, - "compression_pq": { - title: "Product quantization", - body: "What is product quantization (PQ), and how do you use it?", - buttonType: "Click here", - buttonURL: "/academy/py/compression/pq", - badgeType: "practical", - learningGoals: [ - "Learn what PQ is and how to use it." - ], - learningOutcomes: [ - "Describe how PQ works and its key parameters.", - "Configure a collection with PQ.", - ] - }, - "compression_bq": { - title: "Binary quantization", - body: "What is binary quantization (BQ), and how do you use it?", - buttonType: "Click here", - buttonURL: "/academy/py/compression/bq", - badgeType: "practical", - learningGoals: [ - "Learn what BQ is and how to use it." 
- ], - learningOutcomes: [ - "Describe how BQ works and its key parameters.", - "Configure a collection with BQ.", - ] - }, - "compression_strategy": { - title: "Compression strategy", - body: "What compression algorithm and settings are right for me?", - buttonType: "Click here", - buttonURL: "/academy/py/compression/strategy", - badgeType: "theory", - learningGoals: [ - "Learn what compression algorithm might work best for your use case." - ], - learningOutcomes: [ - "Describe the pros and cons of using PQ, BQ, or uncompressed vectors?", - "Make an informed choice on which compression algorithm to use for a given use case.", - ] - }, - "vindex_overview": { - title: "Vector index: Overview", - body: "What is a vector index, and why is it important?", - buttonType: "Click here", - buttonURL: "/academy/py/vector_index/overview", - badgeType: "practical", - learningGoals: [ - "Learn what a vector index is and how it affects your vector searches." - ], - learningOutcomes: [ - "Describe what a vector index is and what it does.", - "Recite different types of vector indexes in Weaviate.", - ] - }, - "vindex_hnsw": { - title: "HNSW index in depth", - body: "Learn about the HNSW index type, and how to tune it for your use case.", - buttonType: "Click here", - buttonURL: "/academy/py/vector_index/hnsw", - badgeType: "practical", - learningGoals: [ - "How the HNSW index works and how to tune it for your use case." - ], - learningOutcomes: [ - "Describe how an HNSW index works and its key attributes.", - "Configure Weaviate with an HNSW index, and describe its parameters.", - ] - }, - "vindex_flat": { - title: "Flat index in depth", - body: "Learn about the flat index type, and how to tune it for your use case.", - buttonType: "Click here", - buttonURL: "/academy/py/vector_index/flat", - badgeType: "practical", - learningGoals: [ - "How the flat index works and how to tune it for your use case." 
- ], - learningOutcomes: [ - "Describe how a flat index works and its key attributes.", - "Configure Weaviate with a flat index, and describe its parameters.", - ] - }, - "vindex_dynamic": { - title: "Dynamic index in depth", - body: "Learn about the dynamic index type, and how to tune it for your use case.", - buttonType: "Click here", - buttonURL: "/academy/py/vector_index/dynamic", - badgeType: "practical", - learningGoals: [ - "How the dynamic index works and how to tune it for your use case." - ], - learningOutcomes: [ - "Describe how a dynamic index works and its key attributes.", - "Configure Weaviate with a dynamic index, and describe its parameters.", - ] - }, - "tokenization_basics": { - title: "Overview of tokenization", - body: "What is tokenization, and why is it important?", - buttonType: "Click here", - buttonURL: "/academy/py/tokenization/basics", - badgeType: "theory", - learningGoals: [ - "What tokenization is and its impact." - ], - learningOutcomes: [ - "Describe what tokenization is.", - "Differentiate the tokenizer's role in vectorization and keyword-based operations.", - ] - }, - "tokenization_options": { - title: "Available tokenization options", - body: "What tokenization options are available in Weaviate?", - buttonType: "Click here", - buttonURL: "/academy/py/tokenization/options", - badgeType: "theory", - learningGoals: [ - "Review the available tokenization options in Weaviate." - ], - learningOutcomes: [ - "Describe different tokenization options available in Weaviate.", - "Identify the best tokenization option for a given use case.", - "Identify languages for which specific tokenization options are available.", - ] - }, - "tokenization_filters": { - title: "Tokenization and filters", - body: "See how tokenization impacts filters.", - buttonType: "Click here", - buttonURL: "/academy/py/tokenization/filters", - badgeType: "practical", - learningGoals: [ - "The impact of tokenization on filters." 
- ], - learningOutcomes: [ - "Describe the ways in which tokenization can affect filters.", - "Implement an appropriate tokenization option in Weaviate for a given use case.", - ] - }, - "tokenization_searches": { - title: "Tokenization and searches", - body: "See how tokenization impacts searches.", - buttonType: "Click here", - buttonURL: "/academy/py/tokenization/searches", - badgeType: "practical", - learningGoals: [ - "The impact of tokenization on query results." - ], - learningOutcomes: [ - "Describe the ways in which tokenization can affect keyword search results.", - "Implement an appropriate tokenization option in Weaviate for a given use case.", - ] - }, - mt_overview: { - "title": "An overview of multi-tenancy", - "body": "Learn what multi-tenancy is and how it helps to build scalable production applications.", - "buttonType": "Click here", - "buttonURL": "/academy/py/multitenancy/overview", - "badgeType": "theory", - "learningGoals": [ - "Understand multi-tenancy in Weaviate and its benefits", - "Identify use cases for multi-tenancy in production applications" - ], - "learningOutcomes": [ - "Describe what multi-tenancy is in the context of Weaviate", - "Explain the benefits of multi-tenancy for resource management and scalability", - "Identify scenarios where multi-tenancy is advantageous", - "Understand the concept of tenant isolation within a shared collection" - ], - owner: "jp", - reviewer: "jp" - }, - mt_setup: { - "title": "Multi-tenant setup", - "body": "Set up a Weaviate instance and configure collections for multi-tenant usage.", - "buttonType": "Click here", - "buttonURL": "/academy/py/multitenancy/setup", - "badgeType": "practical", - "learningGoals": [ - "Learn how to set up a Weaviate instance and configure collections for multi-tenant usage." 
- ], - "learningOutcomes": [ - "Configure a Weaviate instance for multi-tenancy", - "Create a multi-tenant collection with appropriate settings", - "Understand and configure auto-tenant creation and activation", - "Set up dynamic indexing for efficient resource usage across tenants" - ], - owner: "jp", - reviewer: "jp" - }, - mt_tenant_data: { - "title": "Work with tenants and data", - "body": "Learn how to manage tenants and their data in a multi-tenant collection.", - "buttonType": "Click here", - "buttonURL": "/academy/py/multitenancy/tenant_data", - "badgeType": "practical", - "learningGoals": [ - "Understand how to create and manage tenants as well their data in a multi-tenant environment" - ], - "learningOutcomes": [ - "Create single and multiple tenants", - "Insert and batch import data for specific tenants", - "Perform queries on tenant-specific data", - "Understand and utilize auto-tenant creation" - ], - "owner": "jp", - "reviewer": "jp" - }, - mt_manage_tenants: { - "title": "Manage tenant states and resources", - "body": "Learn how to efficiently manage tenant states and optimize resource usage.", - "buttonType": "Click here", - "buttonURL": "/academy/py/multitenancy/manage_tenants", - "badgeType": "practical", - "learningGoals": [ - "Understand tenant activity statuses and their implications, and learn how to manage tenant states for optimal resource usage" - ], - "learningOutcomes": [ - "Describe the different tenant activity statuses (ACTIVE, INACTIVE, OFFLOADED)", - "Update tenant states individually and in bulk", - "Implement tenant offloading to cold storage", - "Utilize auto-activation for efficient resource management", - "Remove tenants and their associated data" - ], - "owner": "jp", - "reviewer": "jp" - }, - "ai_models_deep_dive": { - title: "AI models: A gentle deep dive", - body: "A practical introduction to AI models for software engineers or AI builders.", - buttonType: "Click here", - buttonURL: "/academy/theory/ai_models_deep_dive", - 
badgeType: "theory", - learningGoals: [ - "What AI models are and what they do", - "Overview of model architecture & training", - "Common types of models (generative and embedding)", - "Examples of models and use cases", - "How to access AI models (with code example)", - ], - learningOutcomes: [ - "Explain the core concepts of AI model architecture and training", - "Describe multiple model types and their appropriate use cases", - "Implement basic code for accessing models", - "Interpret key details of model cards or specifications", - ], - owner: "jp", - reviewer: "jp" - }, - "embedding_model_selection": { - title: "Embedding model evaluation & selection", - body: "A practical guide to choosing embedding models for your use case.", - buttonType: "Click here", - buttonURL: "/academy/theory/embedding_model_selection", - badgeType: "theory", - learningGoals: [ - "The practical impact of embedding model selection on AI application performance", - "A systematic, evidence-based approach to embedding model selection", - "Skills to evaluate, implement, and maintain embedding models in production systems" - ], - learningOutcomes: [ - "Evaluate embedding models based on quality, performance, cost, and resource requirements", - "Apply selection framework to identify, screen, and select embedding models", - "Design and implement effective evaluation strategies", - "Articulate key monitoring and maintenance needs for embedding models", - "Optimize embedding model selection for domain-specific applications" - ], - owner: "jp", - reviewer: "jp" - }, - hello_weaviate: { - title: "Hello, Weaviate", - body: "Start here: Learn what Weaviate is, and about its key capabilities and features, as well as about vectors that power Weaviate.", - buttonType: "Click here", - buttonURL: "/academy/py/zero_to_mvp/hello_weaviate", - badgeType: "mixed", - learningGoals: [ - "What Weaviate is, and what it does.", - "How to create your own Weaviate instance on WCD.", - "Weaviate clients and how to 
install them.", - "Hands-on experience with Weaviate.", - ], - learningOutcomes: [ - "Broadly describe what Weaviate is.", - "Outline what vector search is.", - "Create a Weaviate instance on WCD.", - "Install your preferred Weaviate client.", - "Describe some of Weaviate's capabilities.", - ], - owner: "jp", - reviewer: "jp" - }, - queries_1: { - title: "Queries 1", - body: "Learn how queries work in Weaviate, how to use similarity searches and use filters, as well as how search works under the hood.", - buttonType: "Click here", - badgeType: "practical", - buttonURL: "/academy/py/zero_to_mvp/queries_1", - learningGoals: [ - "How to retrieve objects and properties.", - "The structure of returned responses from Weaviate.", - "The difference between `nearVector`, `nearObject` and `nearText`.", - "How to aggregate meta information about objects.", - "How to add filters to vector searches.", - "Weaviate's internal vector search process.", - ], - learningOutcomes: [ - "Construct 'Get' queries to retrieve relevant objects and desired properties.", - "Parse a response from Weaviate.", - "Explain the differences between `nearVector`, `nearObject` and `nearText`.", - "Construct 'Aggregate' queries to retrieve aggregated properties about relevant objects.", - "Add filters to queries to exclude certain results.", - "Describe how `nearObject` and `nearText` queries are converted to vector searches, and what pre-filtering is.", - ], - owner: "jp", - reviewer: "jp" - }, - schema_and_import: { // Separate the more difficult topics into their own units (schema 2 / import 2) - title: "Schema and imports", - body: "Learn what role the schema plays in Weaviate, and how to define it, before learning how to effectively populate Weaviate with data.", - buttonType: "Click here", - badgeType: "mixed", - buttonURL: "/academy/py/zero_to_mvp/schema_and_imports", - learningGoals: [ - "How Weaviate organizes and stores data.", - "An overview of indexes used in Weaviate.", - "What a schema is, 
and how to define it.", - "How to define classes and properties, including appropriate data types.", - "How to populate Weaviate with data.", - "Some best practices such as batch imports and additional properties.", - ], - learningOutcomes: [ - "Describe how the schema relates to organization and storage of data in Weaviate.", - "Broadly describe the role of indexes in Weaviate.", - "Understand how classes and properties represent your data.", - "Create a schema to suit your data.", - "Populate Weaviate with data, using batch imports.", - ], - owner: "jp", - reviewer: "jp" - }, - queries_2: { - title: "Queries 2", - body: "Learn about even more query types, from hybrid searches that combine keyword and vector searches to generative searches that transform your data at retrieval.", - buttonType: "Click here", - badgeType: "practical", - buttonURL: "/academy/py/zero_to_mvp/queries_2", - learningGoals: [ - "How to formulate and perform keyword or BM25 searches.", - "What Hybrid searches are, how they are ranked and how to use them.", - "How Generative searches utilize language models to transform data before delivery.", - "How you can extract the exact answers from data with the Question & Answer (QnA) module.", - ], - learningOutcomes: [ - "Perform BM25 and hybrid searches.", - "Differentiate between vector, BM25 and hybrid searches.", - "Transform data before delivery with generative searches.", - "Extract answers from data with QnA searches.", - ], - owner: "jp", - reviewer: "jp" - }, - - which_search: { - title: "Which search is right for me?", - body: "Weaviate offers many search types (vector, keyword and hybrid), and options. 
Let's discuss some good, baseline search strategies.", - buttonType: "Click here", - badgeType: "mixed", - buttonURL: "/academy/py/standalone/which_search", - learningGoals: [ - "Impact of search type on search quality.", - "Impact of search type on search performance.", - "How the dataset and chunking affect search", - "Key considerations for selecting a search type.", - "Strategies to apply to improve search quality.", - ], - learningOutcomes: [ - "Broadly recite pros and cons of each search type (vector, keyword and hybrid).", - "Suggest a suitable search type given a description of the dataset and aim.", - "Suggest alternative or additional search strategies to improve search quality.", - "Outline broad methods to evaluate search quality." - ], - owner: "jp", - reviewer: "jp" - }, - schema_design: { - title: "Collection schema design", - body: "How to design your collection data structure. Whether to use classes or multi-tenancy, or cross-references.", - buttonType: "TBD", - badgeType: "theory", - // buttonURL: "/academy/py/building_with_weaviate/schema_design", - learningGoals: [ - "Starting suggestions for selecting appropriate data structures to have Weaviate work for your needs.", - ], - learningOutcomes: [ - "Outline what collection schema is for.", - "Describe when to use single or multi-tenancy collections.", - "Explain the impact of cross-references on data, and queries.", - "Make informed choices on whether to use cross-references.", - ], - owner: "jp", - reviewer: "jp" - }, - vectorizer_selection: { - title: "Vectorizer selection", - body: "The basics on how to select a good baseline vectorizer for given data and task types.", - buttonType: "TBD", - badgeType: "theory", - // buttonURL: "/academy/py/building_with_weaviate/vectorizer_selection", - learningGoals: [ - "Theory and heuristics for selecting appropriate, robust vectorizers for the data type and task at hand and how to set the vectorizer appropriately in Weaviate.", - ], - learningOutcomes: [ 
- "Describe key considerations in vectorizer selection.", - "List types of vectorizer modules available with Weaviate.", - "Identify key differences between using an inference service and a local model.", - "Select an appropriate vectorizer model for a given data and task type.", - "Set the vectorizer for the data collection.", - ], - owner: "jp", - reviewer: "jp" - }, - indexing: { - title: "Indexing (Advanced)", - buttonType: "TBD", - body: "Learn how Weaviate indexes data, and how to balance search quality with speed.", - badgeType: "theory", - learningGoals: [ - "How Weaviate indexes data.", - "Weaviate's inverted and vector indexes.", - "The available vector index algorithms.", - "Tunable vector index parameters to balance search quality and performance." - ], - learningOutcomes: [ - "Describe how data indexing works within Weaviate.", - "Recognize and describe the different indexes.", - "Understand available .", - "Understand and use vector indexing parameters to balance search quality and performance.", - "Strategies for troubleshooting low search quality or performance.", - ], - owner: "jp", - reviewer: "jp" - }, - - - // vectorization_essentials: { - // title: "Vectorization Essentials", - // body: "Learn about vectors including how to create and use them.", - // buttonType: "Click here", - // badgeType: "mixed", - // buttonURL: "/academy/py/units/vectorization_essentials", - // learningGoals: [ - // "Various options for providing vector embeddings.", - // "Important class and property parameters, including cross-references.", - // "The relationship between the data structure and searches.", - // "How vectors and deep learning models enable semantic search.", - // "How Weaviate converts data into vectors.", - // "Weaviate's text vectorization process.", - // "Basics of vectorizer selection for Weaviate.", - // ], - // learningOutcomes: [ - // "Different options for providing vector embeddings in Weaviate", - // "Understand how the data structure affects 
search capabilities and results.", - // "Describe the use of vectors and deep learning models in semantic search.", - // "Broadly explain the process by which Weaviate converts data into vectors.", - // "Understand the fundamental concepts of text vectorization.", - // "Outline why vectorizer selection is important for effective search.", - // "Broadly outline available vectorizer types in Weaviate.", - // ], - // owner: "jp", - // reviewer: "jp" - // }, - // imports_in_detail: { - // title: "Data import", - // body: "How to efficiently import data into Weaviate.", - // buttonType: "Click here", - // badgeType: "practical", - // buttonURL: "/academy/py/units/data_import", - // learningGoals: [ - // "The ability to populate Weaviate with data, including an understanding of suggested best practices such as batching, error diagnoses and handling.", - // ], - // learningOutcomes: [ - // "Use upload, retrieve and change operations to a data schema in Weaviate.", - // "Describe available batching parameters and their purpose.", - // "Distinguish between object-level and batch-level errors occurring during import.", - // "Identify object-level import errors and implement error handling through the Weaviate client.", - // ], - // owner: "jp", - // reviewer: "jp" - // }, - // modules: { - // title: "Modules", - // body: "What roles modules play, and how to enable and use them, including examples.", - // buttonType: "TBD", - // buttonURL: "/academy/py/units/modules", - // badgeType: "mixed", - // learningGoals: [ - // "Learn that Weaviate is fully modularized", - // "Learn what modules are, how to choose, enable and use them.", - // "Learn the difference between vectorization modules and reader/generator modules" - // ], - // learningOutcomes: [ - // "Explain the different types of modules that you can use with Weaviate.", - // "Select the right modules for your use case, and enable them for your Weaviate instance." 
- // ], - // owner: "jp", - // reviewer: "jp" - // }, - - - - - - // schema_2: { - // title: "Schema 2 (Advanced)", - // body: "Implement cross-references, and modify indexing options through the schema.", - // buttonType: "TBD", - // badgeType: "practical", - // learningGoals: [ - // "Learn how to implement cross-references between data objects.", - // "Learn about indexing options and how to set this in a schema.", - // "Learn about module-specific schema settings and how to modify this." - // ], - // learningOutcomes: [ - // "Create cross-references between data classes in the schema.", - // "Use different indexing settings for data objects in the schema.", - // "Use module-specific settings for data objects in the schema." - // ], - // owner: "jp", - // reviewer: "jp" - // }, - // t2v_under_hood: { - // title: "Text2vec under the hood", - // body: "Find out exactly how Weaviate vectorizes text, and how to modify its behavior.", - // buttonType: "TBD", - // badgeType: "theory", - // learningGoals: [ - // "Understand Weaviate's default object vectorization behavior in terms of data types and order of text concatenation, and how to modify this" - // ], - // learningOutcomes: [ - // "Explain the processes Weaviate employs to pre-process text before vectorization.", - // "Demonstrate where to locate the exact references for data types and concatenation order in the documentation.", - // "Implement manual vectorization (e.g. via OpenAI API) to reproduce a vector produced by Weaviate.", - // "Formulate a vectorization strategy and defend its reasoning given a data object and a goal." 
- // ], - // owner: "jp", - // reviewer: "jp" - // }, - // vectorizer_selection_2: { - // title: "Vectorizer selection 2", - // body: "Get into the weeds in model selection: how to look for a model that best suits your use case.", - // buttonType: "TBD", - // badgeType: "theory", - // learningGoals: [ - // "" - // ], - // learningOutcomes: [ - // "" - // ] - // }, - // custom_models: { - // title: "Custom models with Weaviate", - // body: "How you can combine your custom vectorizer model with Weaviate.", - // buttonType: "TBD", - // badgeType: "mixed", - // learningGoals: [ - // "" - // ], - // learningOutcomes: [ - // "" - // ] - // }, - // module_building: { - // title: "Module building", - // body: "You can extend Weaviate's capabilities with custom modules. Learn how to build one to fit your needs.", - // buttonType: "TBD", - // badgeType: "practical", - // learningGoals: [ - // "" - // ], - // learningOutcomes: [ - // "" - // ] - // }, - // backups: { - // title: "Backups", - // body: "How to back up and restore data in Weaviate: try partial and full backups to local or cloud storage.", - // buttonType: "TBD", - // badgeType: "practical", - // learningGoals: [ - // "Learn how to back up and restore data in Weaviate, both partial and full backups to local and cloud storage. " - // ], - // learningOutcomes: [ - // "Create partial and full backups of a Weaviate instance.", - // "Restore backups of a Weaviate instance into another instance." - // ], - // owner: "jp", - // reviewer: "jp" - // }, - // auth: { - // title: "Authentication & Authorization", - // body: "Identify users and control access with OpenID Connect (OIDC).", - // buttonType: "TBD", - // badgeType: "practical", - // learningGoals: [ - // "An overview of authentication and authorization, as well as how to implement token-based authentication and authorization in Weaviate." 
- // ], - // learningOutcomes: [ - // "Describe the principles behind token-based security such as OIDC / OAuth.", - // "Differentiate between an ID token and access token", - // "Implement OIDC-based authentication with Weaviate using Weaviate Cloud as the identity provider.", - // "Implement authorization with Weaviate based on OIDC authentication." - // ], - // owner: "jp", - // reviewer: "jp" - // }, - // scaling: { - // title: "Scaling", - // body: "What to expect and consider when scaling Weaviate to production.", - // buttonType: "TBD", - // badgeType: "theory", - // learningGoals: [ - // "" - // ], - // learningOutcomes: [ - // "" - // ] - // }, - // replication: { - // title: "Replication", - // body: "What to consider when adding replication, and how to implement it.", - // buttonType: "TBD", - // badgeType: "mixed", - // learningGoals: [ - // "Learn what replication is, what to consider and how to implement it for your Weaviate instance." - // ], - // learningOutcomes: [ - // "Describe what replication is, when to use it and how replication is designed in Weaviate.", - // "Select the correct replication factor and consistency levels for read and write operations for various use cases.", - // "Create a replicated Weaviate setup through settings in the schema.", - // "Create queries to retrieve data with various consistency levels." - // ], - // owner: "jp", - // reviewer: "jp" - // }, - // clients: { - // title: "Weaviate Clients", - // body: "An overview: what's available, where to find them, and their capabilities.", - // buttonType: "TBD", - // badgeType: "mixed", - // learningGoals: [ - // "Learn what Weaviate client libraries offer, which client languages are available and how to use them." - // ], - // learningOutcomes: [ - // "Understand where to find which Weaviate clients are available.", - // "Understand the capabilities of the Weaviate clients.", - // "Use a client to interact with Weaviate's API endpoints." 
- // ], - // owner: "jp", - // reviewer: "jp" - // }, - // docker: { - // title: "Weaviate with Docker", - // body: "How to run Weaviate on Docker, and best practice tips.", - // buttonType: "TBD", - // badgeType: "practical", - // learningGoals: [ - // "" - // ], - // learningOutcomes: [ - // "" - // ] - // }, - // reader_generator: { - // title: "Reader and Generator modules", - // body: "Overview of question-answering, summarization and named entity recognition modules.", - // buttonType: "TBD", - // badgeType: "mixed", - // learningGoals: [ - // "Learn about reader and generator modules like question answering, summerization and NER in a Weaviate pipeline, and how to use them." - // ], - // learningOutcomes: [ - // "Describe what reader and generator modules are.", - // "Choose fitting reader and/or generator modules for various use cases.", - // "Use third party (HuggingFace, OpenAI) reader and/or generator models in a Weaviate setup.", - // "Use reader and generator modules in GraphQL queries." - // ], - // owner: "jp", - // reviewer: "jp" - // }, - - chunking: { - title: "Document chunking - why and how?", - body: "Chunking is essential for working with longer texts in vector databases. 
This unit covers how to use it as well as tips and best practices.", - buttonType: "Click here", - buttonURL: "/academy/py/standalone/chunking", - badgeType: "practical", - learningGoals: [ - "What chunking is", - "Its role in vector search and generative search", - "Various chunking methods", - "Key considerations and suggested starting points", - ], - learningOutcomes: [ - "Describe chunking at a high level", - "Explain the impact of chunking in vector search and retrieval augmented generation", - "Implement various chunking methods and know where to explore others", - "Evaluate chunking strategies based on your needs", - ], - owner: "jp", - reviewer: "jp" - }, - kubernetes_intro: { - title: "Weaviate with Kubernetes - An introduction", - body: "What is Kubernetes, and how do you configure and run Weaviate on it?.", - buttonType: "Click here", - badgeType: "practical", - learningGoals: [ - "What Kubernetes is, and why it is used.", - "How to set up a local Kubernetes cluster.", - "Run a Weaviate instance on the local Kubernetes cluster.", - "How to configure Weaviate running on Kubernetes." - ], - learningOutcomes: [ - "Describe what Kubernetes is at a high level", - "Set up minikube on your local device and run a multi-node cluster.", - "Deploy Weaviate with replication on the multi-node cluster.", - "Modify configurations and perform a rolling update on the Weaviate instance.", - ] - }, - // vectorizer_text_overview: { - // title: "Text vectorizers: An overview", - // body: "An overview of models - from bag-of-words to word2vec and all the way to transformers.", - // buttonType: "TBD", - // badgeType: "mixed", - // learningGoals: [ - // "A brief history of text vectorization in modern natural language processing to provide context for their development including pros and cons." 
- // ], - // learningOutcomes: [ - // "Describe each of bag-of-words, word-based, and transformer models as well as RNN and LSTM models at a high level.", - // "Categorize well-known models and methods such as TF-IDF, BM25, text2vec, GloVe, FastText, BERT, GPT and CLIP to a model type.", - // "Distinguish key differences between each model types as well as key limitations or challenges for each model type.", - // ], - // owner: "jp", - // reviewer: "jp" - // }, - // { - // title: "Placeholder", - // body: "Something something dark side", - // buttonType: "TBD", - // badgeType: "mixed", - // learningGoals: [ - // "TBC" - // ], - // learningOutcomes: [ - // "TBC" - // ] - // }, - - intro_weaviate_typescript: { - title: "Introduction to Weaviate with TS (or JS)", - body: "A practical course where you can learn how to add Weaviate to a TypeScript (or JavaScript) app.", - buttonType: "Click here", - buttonURL: "/academy/js/intro_weaviate_typescript", - badgeType: "practical", - learningGoals: [ - "The basics of Weaviate, and how to integrate it to a TypeScript (or JavaScript) app." 
- ], - learningOutcomes: [ - "Create a cloud (WCD) instance fo Weaviate.", - "Gain an understanding of what a vector database is.", - "Define a schema (collection definition) and import data.", - "Perform queries on your data.", - "Integrate Weaviate into your TypeScript/JavaScript app.", - ], - owner: "jp", - reviewer: "jp" - }, - client_server: { - title: "Building client-server applications", - body: "Learn how to build fullstack applications with the Weaviate Typescript client that use the client-server approach with modern web tools.", - buttonType: "Click here", - buttonURL: "/academy/js/standalone/client-server", - badgeType: "practical", - learningGoals: [ - "What the client-server approach is and why we use it.", - "Implementing Semantic search in Fullstack Web frameworks with Weaviate using the client-server approach.", - "Implementing Semantic search in Backend Web frameworks with Weaviate using the client-server approach.", - ], - learningOutcomes: [ - "Explain where in the client-server architecture of your Web Applications Weaviate can be used.", - "Integrate semantic search functionality into new or existing Backend and Fullstack Web Applications.", - "Initialize and use Weaviate in your Fullstack or Backend Web Frameworks." 
- ], - owner: "daniel", - reviewer: "daniel" - }, - using_ml_models: { - title: "Using Machine Learning Models", - body: "Understand the fundamental concepts of generative and embedding models, their configuration and application in Weaviate.", - buttonType: "Click here", - buttonURL: "/academy/js/standalone/using-ml-models", - badgeType: "mixed", - learningGoals: [ - "A high level understanding of embedding and generative models.", - "Distinguish between text and multimodal embedding types.", - "Configuring Weaviate to use embedding and generative models.", - "Making semantic and generative searches in Weaviate using JavaScript.", - ], - learningOutcomes: [ - "Differentiate between embedding and generative machine learning models.", - "Configure Weaviate to use text and multimodal embedding models for semantic search.", - "Configure Weaviate to use supported generative models for generative search." - ], - owner: "daniel", - reviewer: "daniel" - }, - want_stack: { - title: "Understanding the WANT stack?", - body: "Chunking is essential for working with longer texts in vector databases. This unit covers how to use it as well as tips and best practices.", - buttonType: "Click here", - buttonURL: "/academy/js/standalone/want-stack", - badgeType: "practical", - learningGoals: [ - "What chunking is", - ], - learningOutcomes: [ - "Describe chunking at a high level", - "Explain the impact of chunking in vector search and retrieval augmented generation", - ], - owner: "daniel", - reviewer: "daniel" - }, - which_search_js: { - title: "Which search is right for me?", - body: "Weaviate offers many search types (vector, keyword and hybrid), and options. 
Let's discuss some good, baseline search strategies.", - buttonType: "Click here", - badgeType: "mixed", - buttonURL: "/academy/js/standalone/which-search", - learningGoals: [ - "Impact of search type on search quality.", - "Impact of search type on search performance.", - "How the dataset and chunking affect search", - "Key considerations for selecting a search type.", - "Strategies to apply to improve search quality.", - ], - learningOutcomes: [ - "Broadly recite pros and cons of each search type (vector, keyword and hybrid).", - "Suggest a suitable search type given a description of the dataset and aim.", - "Suggest alternative or additional search strategies to improve search quality.", - "Outline broad methods to evaluate search quality." - ], - owner: "daniel", - reviewer: "daniel" - } -}; diff --git a/src/components/Academy/unitcards.jsx b/src/components/Academy/unitcards.jsx deleted file mode 100644 index 2e88079b6..000000000 --- a/src/components/Academy/unitcards.jsx +++ /dev/null @@ -1,27 +0,0 @@ -import React from 'react'; -import AcademyCard from './card'; -import './academy.css' - -function CardSet(props) { - - let cardData = props.cardData; - let cardItems = []; - - for (let i = 0; i < cardData.length; i++) { - cardItems.push( -
    - -
    - ) - }; - - return (
    {cardItems}
    ) -} - -export default CardSet; diff --git a/src/components/Academy/units.jsx b/src/components/Academy/units.jsx deleted file mode 100644 index 69b0246b2..000000000 --- a/src/components/Academy/units.jsx +++ /dev/null @@ -1,31 +0,0 @@ -import React from "react"; -import UnitCardSet from "./unitcards"; -import { unitData } from '/src/components/Academy/unitData.js' -import './academy.css' - -function Units(props) { - - let cardData = props.courseData; - let courseName = props.courseName; - let cardItems = []; - - for (let k in cardData) { - let units = cardData[k].units.map(d => unitData[d]) - - cardItems.push( -
    -
    -

    {cardData[k].courseId}. {cardData[k].title}

    -

    {cardData[k].body}

    -
    -
    - -
    -
    - ) - }; - - return (
    {cardItems}
    ) -} - -export default Units; \ No newline at end of file diff --git a/src/components/AcademyAdmonition/courses.json b/src/components/AcademyAdmonition/courses.json new file mode 100644 index 000000000..f9d7aabaa --- /dev/null +++ b/src/components/AcademyAdmonition/courses.json @@ -0,0 +1,44 @@ +[ + { + "id": "quick-tour-of-weaviate", + "title": "A Quick Tour of Weaviate", + "description": "Become familiar with Weaviate's architecture, core concepts, and key capabilities. Understand how its features and integrations map to AI builders' needs.", + "url": "https://academy.weaviate.io/courses/wa050-py" + }, + { + "id": "weaviate-with-python", + "title": "Hands-on Weaviate with Python", + "description": "Build your first Weaviate project to store and search data, and perform retrieval-augmented generation (RAG) with generative AI models.", + "url": "https://academy.weaviate.io/courses/wa101t-py" + }, + { + "id": "ai-models-deep-dive", + "title": "A Gentle Deep Dive into AI Models", + "description": "A detailed overview of AI models from the perspective of application builders and developers. Learn how to distinguish between models and why model selection matters.", + "url": "https://academy.weaviate.io/courses/wa150-py" + }, + { + "id": "fast-api-weaviate-rag", + "title": "Your First AI App (Search and RAG)", + "description": "A hands-on course where you will build a movie recommendation API with Weaviate and FastAPI.", + "url": "https://academy.weaviate.io/courses/wa180-py" + }, + { + "id": "architectural-decisions-guide", + "title": "Architectural Decisions Guide", + "description": "Learn what the most important architectural decisions are when building applications with Weaviate, and how to make the right choices for your use case.", + "url": "https://academy.weaviate.io/courses/wa210-py" + }, + { + "id": "embedding-model-evaluation", + "title": "Embedding Model Evaluation & Selection", + "description": "Embedding models are the heart of vector search. 
Learn how to evaluate and select appropriate embedding models for your use case.", + "url": "https://academy.weaviate.io/courses/wa150-py" + }, + { + "id": "async-python-client-usage", + "title": "Async Python Client Usage", + "description": "Learn how to use Weaviate's async capabilities to handle concurrent operations and large-scale data processing efficiently.", + "url": "https://academy.weaviate.io/courses/wa280-py" + } +] diff --git a/src/components/AcademyAdmonition/index.jsx b/src/components/AcademyAdmonition/index.jsx new file mode 100644 index 000000000..fdb350a7a --- /dev/null +++ b/src/components/AcademyAdmonition/index.jsx @@ -0,0 +1,133 @@ +// src/components/AcademyAdmonition/index.jsx +import React from "react"; +import styles from "./styles.module.scss"; +import useBaseUrl from "@docusaurus/useBaseUrl"; +import { useColorMode } from "@docusaurus/theme-common"; +import courses from "./courses.json"; + +const AcademyAdmonition = ({ + courseId, + buttonText = "Open Academy Course", + // Allow optional overrides + customTitle, + customDescription, + customUrl, +}) => { + const { colorMode } = useColorMode(); + const isDarkMode = colorMode === "dark"; + + // Find course by ID + const course = courses.find((c) => c.id === courseId); + + // If course not found, show error in development + if (!course && !customUrl) { + if (process.env.NODE_ENV === "development") { + return ( +
    +

    + ⚠️ Academy course not found: {courseId} +

    +

    + Available IDs: {courses.map((c) => c.id).join(", ")} +

    +
    + ); + } + // In production, return null to not render anything + return null; + } + + // Use course data or custom overrides + const title = customTitle || course?.title; + const description = customDescription || course?.description; + const url = customUrl || course?.url; + + // If still no URL, don't render + if (!url) return null; + + // Switch logo based on theme + const logoPath = isDarkMode + ? "/img/docs/weaviate-academy-white.png" + : "/img/docs/weaviate-academy-purple.png"; + + const academyLogoUrl = useBaseUrl(logoPath); + + return ( +
    +
    + Weaviate Academy +

    Course: {title}

    +
    + +
    +

    {description}

    + + + {buttonText} + + + + +
    +
    + ); +}; + +export default AcademyAdmonition; + +// ============================================ +// Example usage in MDX files: +/* +import AcademyAdmonition from '@site/src/components/AcademyAdmonition'; + +// Simple usage with courseId from courses.json + + +// With custom button text + + +// Override with custom content (for courses not in JSON) + +*/ diff --git a/src/components/AcademyAdmonition/styles.module.scss b/src/components/AcademyAdmonition/styles.module.scss new file mode 100644 index 000000000..c17237589 --- /dev/null +++ b/src/components/AcademyAdmonition/styles.module.scss @@ -0,0 +1,211 @@ +/* src/components/AcademyAdmonition/styles.module.scss */ + +.academyAdmonition { + background: linear-gradient( + 135deg, + rgba(107, 70, 193, 0.08) 0%, + rgba(76, 29, 149, 0.12) 100% + ); + border: 1px solid rgba(107, 70, 193, 0.2); + border-radius: 12px; + padding: 1.5rem; + margin: 2rem 0; + box-shadow: 0 4px 20px rgba(107, 70, 193, 0.08); + position: relative; + overflow: hidden; +} + +/* Decorative gradient overlay */ +.academyAdmonition::before { + content: ""; + position: absolute; + top: -50%; + right: -10%; + width: 40%; + height: 200%; + background: radial-gradient( + circle, + rgba(107, 70, 193, 0.05) 0%, + transparent 70% + ); + pointer-events: none; +} + +/* Header row with logo and title */ +.academyHeader { + display: flex; + align-items: center; + gap: 1rem; + margin-bottom: 1rem; + position: relative; + z-index: 1; +} + +.academyLogo { + width: 150px; + height: auto; + object-fit: contain; + flex-shrink: 0; +} + +.courseTitle { + color: #6b46c1; + font-size: 1.25rem; + font-weight: 700; + margin: 0; + line-height: 1.3; + flex: 1; +} + +/* Body row with description and button */ +.academyBody { + display: flex; + align-items: flex-start; + justify-content: space-between; + gap: 1.5rem; + position: relative; + z-index: 1; +} + +.courseDescription { + color: #4a5568; + font-size: 0.95rem; + line-height: 1.6; + margin: 0; + flex: 1; +} + +/* 
CTA Button */ +.academyButton { + display: inline-flex; + align-items: center; + gap: 0.5rem; + background: linear-gradient(135deg, #6b46c1 0%, #5b21b6 100%); + color: white; + padding: 0.625rem 1.25rem; + border-radius: 8px; + font-weight: 600; + font-size: 0.95rem; + text-decoration: none; + transition: all 0.3s ease; + box-shadow: 0 4px 14px rgba(107, 70, 193, 0.25); + flex-shrink: 0; + white-space: nowrap; +} + +.academyButton:hover { + background: linear-gradient(135deg, #7c3aed 0%, #6b46c1 100%); + transform: translateY(-2px); + box-shadow: 0 6px 20px rgba(107, 70, 193, 0.35); + text-decoration: none; + color: white; +} + +.buttonIcon { + transition: transform 0.3s ease; +} + +.academyButton:hover .buttonIcon { + transform: translateX(3px); +} + +/* Dark theme support - optimized for white logo */ +[data-theme="dark"] .academyAdmonition { + background: linear-gradient( + 135deg, + rgba(88, 44, 173, 0.2) 0%, + rgba(67, 24, 133, 0.25) 100% + ); + border: 1px solid rgba(124, 58, 237, 0.35); + box-shadow: 0 4px 20px rgba(88, 44, 173, 0.2); +} + +[data-theme="dark"] .academyAdmonition::before { + background: radial-gradient( + circle, + rgba(124, 58, 237, 0.1) 0%, + transparent 70% + ); +} + +[data-theme="dark"] .courseTitle { + color: #c4b5fd; +} + +[data-theme="dark"] .courseDescription { + color: #cbd5e1; +} + +[data-theme="dark"] .academyButton { + background: linear-gradient(135deg, #8b5cf6 0%, #7c3aed 100%); + color: white; + box-shadow: 0 4px 14px rgba(139, 92, 246, 0.35); +} + +[data-theme="dark"] .academyButton:hover { + background: linear-gradient(135deg, #a78bfa 0%, #8b5cf6 100%); + box-shadow: 0 6px 20px rgba(139, 92, 246, 0.45); +} + +/* Mobile responsive design */ +@media (max-width: 768px) { + .academyAdmonition { + padding: 1.25rem; + } + + .academyHeader { + gap: 0.75rem; + } + + .academyLogo { + width: 120px; + } + + .courseTitle { + font-size: 1.125rem; + } + + .academyBody { + flex-direction: column; + gap: 1rem; + } + + .courseDescription { + 
font-size: 0.875rem; + } + + .academyButton { + font-size: 0.875rem; + padding: 0.5rem 1rem; + align-self: flex-end; + } +} + +/* Small mobile responsive design */ +@media (max-width: 480px) { + .academyHeader { + flex-direction: column; + align-items: flex-start; + gap: 0.5rem; + } + + .academyLogo { + width: 120px; + } + + .academyButton { + align-self: stretch; + justify-content: center; + } +} + +/* Tablet responsive design */ +@media (max-width: 1024px) { + .academyAdmonition { + margin: 1.5rem 0; + } + + .academyLogo { + width: 120px; + } +} diff --git a/src/components/scriptSwitch/index.jsx b/src/components/scriptSwitch/index.jsx deleted file mode 100644 index 85a706bc0..000000000 --- a/src/components/scriptSwitch/index.jsx +++ /dev/null @@ -1,46 +0,0 @@ -import React, { useEffect } from 'react'; - -const CustomScriptLoader = () => { - useEffect(() => { - const loadScript = (src) => { - document - .querySelectorAll('img[referrerPolicy="no-referrer-when-downgrade"]') - .forEach((el) => el.remove()); - - const script = document.createElement('img'); - script.src = src; - script.referrerPolicy = 'no-referrer-when-downgrade'; - script.style = 'display: none;'; - document.body.appendChild(script); - }; - - const getScriptForPath = (pathname) => { - const scriptsMap = { - '/docs/': - 'https://static.scarf.sh/a.png?x-pxid=2758e82f-6546-4356-a8bd-5b5c16368efb', - '/pricing': - 'https://static.scarf.sh/a.png?x-pxid=5c79460c-47af-4477-a1d9-3624dcce35d3', - }; - - const defaultScript = - 'https://static.scarf.sh/a.png?x-pxid=a41b0758-a3a9-4874-a880-8b5d5a363d40'; - - // Check for a direct match or base path match (for handling subpages) - for (const [basePath, script] of Object.entries(scriptsMap)) { - if (pathname === basePath || pathname.startsWith(basePath)) { - return script; - } - } - - return defaultScript; - }; - - const currentPath = window.location.pathname; - const scriptToLoad = getScriptForPath(currentPath); - loadScript(scriptToLoad); - }, []); - - return 
null; -}; - -export default CustomScriptLoader; diff --git a/src/css/blog-and-docs.scss b/src/css/blog-and-docs.scss index 213790c8a..efbdf7636 100644 --- a/src/css/blog-and-docs.scss +++ b/src/css/blog-and-docs.scss @@ -202,27 +202,6 @@ a.navbar__item.navbar__link[href='/weaviate/api/rest'] { } } -.academy-img-lg { - display: block; - margin-left: auto; - margin-right: auto; - width: 95%; -} - -.academy-img-md { - display: block; - margin-left: auto; - margin-right: auto; - width: 60%; -} - -.academy-img-sm { - display: block; - margin-left: auto; - margin-right: auto; - width: 30%; -} - .callout { display: inline-flex; align-items: center; diff --git a/src/css/custom.scss b/src/css/custom.scss index 68bc06841..5246c678e 100644 --- a/src/css/custom.scss +++ b/src/css/custom.scss @@ -305,32 +305,92 @@ padding: 0px; } -.modern-button { - display: inline-flex; /* Flex for better content alignment */ - align-items: center; /* Vertically center content */ - justify-content: center; /* Horizontally center content */ - gap: 8px; /* Space between icon and text */ - padding: 6px 10px; /* Balanced padding */ - color: white; - font-weight: 600; /* Medium weight looks more refined */ - letter-spacing: 0.01em; /* Subtle letter spacing for clarity */ - border-radius: 8px; /* More rounded corners */ - background-color: var( - --site-primary - ); /* Original green background on hover */ - cursor: pointer; /* Pointer on hover */ - transition: all 0.2s ease; /* Smooth transition for all properties */ - box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05); /* Very subtle initial shadow */ +.cloud-button { + display: inline-flex; + align-items: center; + justify-content: center; + gap: 8px; + padding: 6px 12px; + color: var(--site-primary); // Use primary color for text instead of white + font-weight: 600; // Slightly lighter weight + letter-spacing: 0.01em; + border-radius: 8px; + background-color: rgba(0, 140, 58, 0.08); // Very light tinted background + border: 1px solid rgba(0, 161, 66, 
0.15); // Subtle border + cursor: pointer; + transition: all 0.2s ease; + box-shadow: none; // Remove initial shadow for cleaner look + + @media screen and (max-width: 1150px) { + display: none; + } +} + +.cloud-button:hover { + color: var(--site-primary) !important; + background-color: rgba(var(--site-primary-rgb), 0.15); // Slightly darker on hover + border-color: rgba(var(--site-primary-rgb), 0.25); + transform: translateY(-0.5px); + box-shadow: 0 2px 6px rgba(0, 0, 0, 0.08); +} + +.academy-button { + display: inline-flex; + align-items: center; + justify-content: center; + gap: 8px; + padding: 6px 12px; + margin-right: 5px; + color: #6b46c1; // Purple text instead of white + font-weight: 500; // Slightly lighter weight + letter-spacing: 0.01em; + border-radius: 8px; + background: rgba(107, 70, 193, 0.08); // Light purple tint + border: 1px solid rgba(107, 70, 193, 0.15); // Subtle purple border + cursor: pointer; + transition: all 0.2s ease; + box-shadow: none; // Remove initial shadow @media screen and (max-width: 1150px) { display: none; } } -.modern-button:hover { - color: white !important; - transform: translateY(-0.5px); /* Subtle lift effect */ - box-shadow: 0 3px 8px rgba(0, 0, 0, 0.1); /* Subtle shadow on hover */ +.academy-button:hover { + color: #6b46c1 !important; + background: rgba(107, 70, 193, 0.15); // Slightly darker purple on hover + border-color: rgba(107, 70, 193, 0.25); + transform: translateY(-0.5px); + box-shadow: 0 2px 6px rgba(107, 70, 193, 0.15); +} + +// Dark mode adjustments +[data-theme='dark'] { + .cloud-button { + color: #86efac; // Light green for dark mode + background-color: rgba(134, 239, 172, 0.1); + border-color: rgba(134, 239, 172, 0.2); + + &:hover { + color: #86efac !important; + background-color: rgba(134, 239, 172, 0.15); + border-color: rgba(134, 239, 172, 0.3); + box-shadow: 0 2px 6px rgba(134, 239, 172, 0.1); + } + } + + .academy-button { + color: #c4b5fd; // Light purple for dark mode + background: rgba(196, 181, 
253, 0.1); +    border-color: rgba(196, 181, 253, 0.2); + +    &:hover { +      color: #c4b5fd !important; +      background: rgba(196, 181, 253, 0.15); +      border-color: rgba(196, 181, 253, 0.3); +      box-shadow: 0 2px 6px rgba(196, 181, 253, 0.1); +    } +  } } [data-theme="light"] { diff --git a/src/css/variables.scss b/src/css/variables.scss index d6cfe1a09..2045dd52e 100644 --- a/src/css/variables.scss +++ b/src/css/variables.scss @@ -9,6 +9,7 @@ --site-base-color-shade: #2c3854; --site-primary: #00a142; +  --site-primary-rgb: 0, 161, 66; --site-primary-dark: #00c342; --site-accent: #fc3988; --site-accent-dark: #fb0b6d; diff --git a/src/theme/Root.js b/src/theme/Root.js index 2ae93acc2..a1b99a482 100644 --- a/src/theme/Root.js +++ b/src/theme/Root.js @@ -51,7 +51,7 @@ export default function Root({ children }) { script.setAttribute("data-font-size-xl", "1.35rem"); script.setAttribute( "data-modal-disclaimer", - "This is a custom LLM for Weaviate with access to all developer docs, Cloud docs, academy lessons, contributor guides, GitHub issues, and forum questions." + "This is a custom LLM for Weaviate with access to all developer docs, Cloud docs, contributor guides, GitHub issues, and forum questions."
); script.setAttribute("data-modal-disclaimer-bg-color", "white"); script.setAttribute("data-modal-disclaimer-text-color", "black"); diff --git a/static/img/docs/weaviate-academy-purple.png b/static/img/docs/weaviate-academy-purple.png new file mode 100644 index 000000000..d61ab19ad Binary files /dev/null and b/static/img/docs/weaviate-academy-purple.png differ diff --git a/static/img/docs/weaviate-academy-white.png b/static/img/docs/weaviate-academy-white.png new file mode 100644 index 000000000..b1d7a508f Binary files /dev/null and b/static/img/docs/weaviate-academy-white.png differ diff --git a/tests/test_academy.py b/tests/test_academy.py deleted file mode 100644 index 694960bc8..000000000 --- a/tests/test_academy.py +++ /dev/null @@ -1,80 +0,0 @@ -import pytest -import utils -from pathlib import Path - - -@pytest.mark.pyv4 -@pytest.mark.parametrize( - "script_loc", - [ - "./docs/academy/py/starter_text_data/_snippets/101_connect.py", - "./docs/academy/py/starter_text_data/_snippets/102_collection.py", - "./docs/academy/py/starter_text_data/_snippets/103_searches.py", - "./docs/academy/py/starter_text_data/_snippets/104_rag.py", - "./docs/academy/py/starter_custom_vectors/_snippets/101_connect.py", - "./docs/academy/py/starter_custom_vectors/_snippets/102_collection.py", - "./docs/academy/py/starter_custom_vectors/_snippets/103_10_vector.py", - "./docs/academy/py/starter_custom_vectors/_snippets/103_20_searches.py", - "./docs/academy/py/starter_custom_vectors/_snippets/104_rag.py", - "./docs/academy/py/starter_multimodal_data/_snippets/101_connect.py", - "./docs/academy/py/starter_multimodal_data/_snippets/102_collection.py", - "./docs/academy/py/starter_multimodal_data/_snippets/103_searches.py", - "./docs/academy/py/starter_multimodal_data/_snippets/104_rag.py", - "./docs/academy/py/named_vectors/_snippets/101_connect.py", - "./docs/academy/py/named_vectors/_snippets/102_collection.py", - "./docs/academy/py/named_vectors/_snippets/103_searches.py", - 
"./docs/academy/py/named_vectors/_snippets/104_usecase.py", - "./docs/academy/py/compression/_snippets/100_pq.py", - "./docs/academy/py/compression/_snippets/200_bq.py", - "./docs/academy/py/tokenization/_snippets/310_create_collection.py", - "./docs/academy/py/tokenization/_snippets/315_add_objects.py", - "./docs/academy/py/tokenization/_snippets/320_filters.py", - "./docs/academy/py/tokenization/_snippets/400_searches.py", - "./docs/academy/py/vector_index/_snippets/100_config.py", - ], -) -def test_on_blank_instance_pyv4(empty_weaviates, script_loc): - # proc_script = utils.load_and_prep_script(script_loc) - # exec(proc_script) - temp_proc_script_loc = utils.load_and_prep_temp_file( - script_loc, - lang="py", - custom_replace_pairs=utils.edu_readonly_replacements - ) - utils.execute_py_script_as_module(temp_proc_script_loc.read_text(), Path(script_loc).stem) - - -# Deprecated tests for deprecated modules (pyv3; also directories have moved) -# @pytest.mark.pyv3 -# @pytest.mark.parametrize( -# "script_loc", -# [ -# "./docs/academy/zero_to_mvp/_snippets/setup.py", -# "./docs/academy/zero_to_mvp/103_schema_and_imports/_snippets/05_create_instance.py", -# "./docs/academy/zero_to_mvp/103_schema_and_imports/_snippets/20_schema.py", -# "./docs/academy/zero_to_mvp/103_schema_and_imports/_snippets/30_import.py", -# "./docs/academy/zero_to_mvp/103_schema_and_imports/_snippets/40_import_example_1.py", -# ], -# ) -# def test_on_blank_instance(empty_weaviates, script_loc): -# proc_script = utils.load_and_prep_script(script_loc) -# exec(proc_script) - - -# @pytest.mark.pyv3 -# @pytest.mark.parametrize( -# "script_loc", -# [ -# "./docs/academy/zero_to_mvp/104_queries_2/_snippets/10_bm25.py", -# "./docs/academy/zero_to_mvp/104_queries_2/_snippets/20_hybrid.py", -# "./docs/academy/zero_to_mvp/104_queries_2/_snippets/30_generative.py", -# "./docs/academy/zero_to_mvp/104_queries_2/_snippets/40_qna.py", -# ], -# ) -# def test_against_edu_demo_pyv3(empty_weaviates, script_loc): -# 
temp_proc_script_loc = utils.load_and_prep_temp_file( -# script_loc, -# lang="py", -# custom_replace_pairs=utils.edu_readonly_replacements -# ) -# exec(temp_proc_script_loc.read_text())