RAG USING WEAVIATE #131

236 changes: 236 additions & 0 deletions vector_store_integration/RAG_USING_WEAVIATE.ipynb
@@ -0,0 +1,236 @@
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"source": [
"# **End-to-End RAG Tutorial Using Salesforce, Airbyte Cloud, Weaviate, and LangChain**\n",
"This notebook illustrates the complete setup of a Retrieval-Augmented Generation (RAG) pipeline.<br>\n",
"We extract data from a GitHub repository using PyAirbyte, store the data in a Chroma vector store, and use LangChain to perform RAG on the stored data.<br>\n",
"## **Prerequisites**\n",
"**1) OpenAI API Key**:\n",
" - **Create an OpenAI Account**: Sign up for an account on [OpenAI](https://www.openai.com/).\n",
" - **Generate an API Key**: Go to the API section and generate a new API key. For detailed instructions, refer to the [OpenAI documentation](https://beta.openai.com/docs/quickstart).\n",
"\n",
"**2) Weaviate Cluster's Public URL and API Key**:\n",
"\n",
" - **Get your URL and Key**: Cick on your clusters drop down button. Visit [this](https://console.weaviate.cloud/).\n"
],
"metadata": {
"id": "sW7Fx5ilH2jK"
}
},
{
"cell_type": "markdown",
"source": [
"# **Installing Dependencies**\n"
],
"metadata": {
"id": "oFw9gVa9JKDH"
}
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "zAztxTpIBiMF"
},
"outputs": [],
"source": [
"# Add virtual environment support if needed\n",
"!apt-get install -qq python3.10-venv\n",
"\n",
"# Install required packages\n",
"%pip install --quiet openai langchain-openai tiktoken pandas weaviate-client langchain-weaviate langchain-community\n"
]
},
{
"cell_type": "markdown",
"source": [
"# **Set Up Environment Variables**"
],
"metadata": {
"id": "wH5CoP0WJfjd"
}
},
{
"cell_type": "code",
"source": [
"import os\n",
"\n",
"os.environ[\"WEAVIATE_URL\"] = \"YOUR_WEAVIATE_URL\"\n",
"os.environ[\"WEAVIATE_API_KEY\"] = \"YOUR_WEAVIATE_API_KEY\"\n",
"os.environ[\"OPENAI_API_KEY\"] = \"YOUR_OPENAI_API_KEY\""
],
"metadata": {
"id": "JDqAdjZTCbeB"
},
"execution_count": null,
"outputs": []
},
{
"cell_type": "markdown",
"source": [
"# **Initialize Weaviate Vector Store**"
],
"metadata": {
"id": "jGEqM1-RJlTE"
}
},
{
"cell_type": "code",
"source": [
"import weaviate\n",
"from weaviate.auth import AuthApiKey\n",
"from langchain_weaviate.vectorstores import WeaviateVectorStore\n",
"from langchain_community.document_loaders import TextLoader\n",
"from langchain_openai import OpenAIEmbeddings\n",
"from langchain_text_splitters import CharacterTextSplitter\n",
"import pandas as pd\n",
"\n",
"# Connect to Weaviate with API key\n",
"auth_config = AuthApiKey(api_key=os.getenv(\"WEAVIATE_API_KEY\"))\n",
"\n",
"try:\n",
" weaviate_client = weaviate.Client(\n",
" url=os.getenv(\"WEAVIATE_URL\"),\n",
" auth_client_secret=auth_config,\n",
" )\n",
" print(\"Successfully connected to Weaviate\", flush=True)\n",
"except Exception as e:\n",
" print(f\"Error connecting to Weaviate: {e}\", flush=True)\n",
"\n"
],
"metadata": {
"id": "XnzOsbFaCepg"
},
"execution_count": null,
"outputs": []
},
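{
"cell_type": "markdown",
"source": [
"### **Verify the connection (optional)**\n",
"A quick sanity check before querying. This is a minimal sketch assuming the v3 `weaviate-client` API used above, where `is_ready()` and `schema.get()` are standard client calls; the collection names printed here depend on what already exists in your cluster.\n"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Confirm the cluster is reachable and list the collections it exposes\n",
"if weaviate_client.is_ready():\n",
"    schema = weaviate_client.schema.get()\n",
"    classes = [c[\"class\"] for c in schema.get(\"classes\", [])]\n",
"    print(\"Cluster is ready. Collections:\", classes, flush=True)\n",
"else:\n",
"    print(\"Cluster is not ready yet.\", flush=True)\n"
],
"metadata": {},
"execution_count": null,
"outputs": []
},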
{
"cell_type": "markdown",
"source": [
"# **Embedding and similarity search with Weaviate**\n",
"Here we will convert the user's query into embeddings using OpenAI and retrieve similar chunks from Weaviate based on the query. <br>\n",
"### Note: Change collection and property according to your own requirement!"
],
"metadata": {
"id": "EPEEwKcgJtI_"
}
},
{
"cell_type": "code",
"source": [
"from langchain_openai import OpenAIEmbeddings\n",
"from typing import List\n",
"\n",
"# Initialize OpenAI client for embeddings\n",
"openai_embeddings = OpenAIEmbeddings(openai_api_key=os.getenv(\"OPENAI_API_KEY\"))\n",
"\n",
"# Convert the user's query into a vector to prepare for similarity search\n",
"def get_embedding_from_openai(query: str) -> List[float]:\n",
"    return openai_embeddings.embed_query(query)\n",
"\n",
"# Use Weaviate to find matching chunks (avoid shadowing the built-in `property`)\n",
"collection_name = \"Lead\"\n",
"property_name = \"name\"\n",
"\n",
"def get_similar_chunks_from_weaviate(query: str) -> List[str]:\n",
"    try:\n",
"        embedding = get_embedding_from_openai(query)\n",
"        near_vector = {\"vector\": embedding}\n",
"        result = (\n",
"            weaviate_client.query.get(collection_name, [property_name])\n",
"            .with_near_vector(near_vector)\n",
"            .do()\n",
"        )\n",
"\n",
"        if \"data\" in result and \"Get\" in result[\"data\"] and collection_name in result[\"data\"][\"Get\"]:\n",
"            return [res[property_name] for res in result[\"data\"][\"Get\"][collection_name]]\n",
"        print(\"Unexpected result format:\", result, flush=True)\n",
"        return []\n",
"    except Exception as e:\n",
"        print(f\"Error during Weaviate query: {e}\", flush=True)\n",
"        return []\n"
],
"metadata": {
"id": "8SpJVDx4D23z"
},
"execution_count": null,
"outputs": []
},
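{
"cell_type": "markdown",
"source": [
"### **Try the retrieval step on its own (optional)**\n",
"A small sketch to inspect what the similarity search returns before wiring it into the LLM. It assumes the `Lead` collection with a `name` property configured above already contains data; the sample query is only illustrative.\n"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Retrieve and inspect the raw chunks for a sample query\n",
"sample_query = \"leads at BNY\"\n",
"sample_chunks = get_similar_chunks_from_weaviate(sample_query)\n",
"print(f\"Retrieved {len(sample_chunks)} chunks\", flush=True)\n",
"for i, chunk in enumerate(sample_chunks[:5], start=1):\n",
"    print(f\"{i}. {chunk}\", flush=True)\n"
],
"metadata": {},
"execution_count": null,
"outputs": []
},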
{
"cell_type": "markdown",
"source": [
"# **Building RAG Pipeline and asking a question**\n",
"Finally we use OpenAI for querying our data! <br>\n",
"We know the three main steps of a RAG Pipeline are : <br>\n",
"- Embedding incoming query <br>\n",
"- Doing similarity search to find matching chunks <br>\n",
"- Send chunks to LLM for completion"
],
"metadata": {
"id": "ubPGiUFSKBhF"
}
},
{
"cell_type": "code",
"source": [
"from typing import List\n",
"from openai import OpenAI\n",
"\n",
"client = OpenAI(api_key=os.getenv(\"OPENAI_API_KEY\"))\n",
"\n",
"# Use OpenAI to complete the response\n",
"def get_completion_from_openai(question, document_chunks: List[str], model_name=\"gpt-3.5-turbo\"):\n",
" chunks = \"\\n\\n\".join(document_chunks)\n",
"\n",
" try:\n",
" completion = client.chat.completions.create(\n",
" model=model_name,\n",
" messages=[\n",
" {\"role\": \"system\", \"content\": \"You are an assistant. Answer the question based on the context. Do not use any other information. Be concise.\"},\n",
" {\"role\": \"user\", \"content\": f\"Context:\\n{chunks}\\n\\n{question}\\n\\nAnswer:\"}\n",
" ],\n",
" max_tokens=150\n",
" )\n",
" return completion.choices[0].message.content.strip()\n",
" except Exception as e:\n",
" print(f\"Error during OpenAI completion: {e}\", flush=True)\n",
" return \"There was an error generating the response.\"\n",
"\n",
"\n",
"# Putting it all together\n",
"def get_response(query, model_name=\"gpt-3.5-turbo\"):\n",
" chunks = get_similar_chunks_from_weaviate(query)\n",
" if len(chunks) == 0:\n",
" return \"I am sorry, I do not have the context to answer your question.\"\n",
" else:\n",
" return get_completion_from_openai(query, chunks, model_name)\n",
"\n",
"# Ask a question\n",
"query = 'How many leads work at BNY?'\n",
"response = get_response(query)\n",
"\n",
"print(f\"\\n\\nResponse from LLM:\\n\\n{response}\", flush=True)\n"
],
"metadata": {
"id": "Qu1C3e1TE2YL"
},
"execution_count": null,
"outputs": []
}
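,
{
"cell_type": "markdown",
"source": [
"# **Limiting Retrieved Chunks (optional)**\n",
"The query above returns Weaviate's default number of objects, which can make the prompt context unnecessarily large. A hedged sketch of capping results with the v3 client's `with_limit()` query builder; the `Lead`/`name` names mirror the collection configured earlier and should be adapted to your schema.\n"
],
"metadata": {}
},
{
"cell_type": "code",
"source": [
"# Same near-vector search as before, capped to the k closest objects\n",
"def get_top_chunks(query: str, k: int = 3):\n",
"    near_vector = {\"vector\": get_embedding_from_openai(query)}\n",
"    result = (\n",
"        weaviate_client.query.get(\"Lead\", [\"name\"])\n",
"        .with_near_vector(near_vector)\n",
"        .with_limit(k)\n",
"        .do()\n",
"    )\n",
"    return [r[\"name\"] for r in result[\"data\"][\"Get\"][\"Lead\"]]\n",
"\n",
"print(get_top_chunks(\"leads at BNY\"), flush=True)\n"
],
"metadata": {},
"execution_count": null,
"outputs": []
}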
]
}