From 816f322eabc49641f6786f8e64d95ee925d891f6 Mon Sep 17 00:00:00 2001
From: Pangeran Bottor
Date: Sat, 29 Jun 2024 05:24:18 +0700
Subject: [PATCH 1/3] RAG with PyAirbyte and Pinecone from Source File

---
 ...rce_file_with_pyairbyte_and_pinecone.ipynb | 395 ++++++++++++++++++
 1 file changed, 395 insertions(+)
 create mode 100644 rag_source_file_with_pyairbyte_and_pinecone.ipynb

diff --git a/rag_source_file_with_pyairbyte_and_pinecone.ipynb b/rag_source_file_with_pyairbyte_and_pinecone.ipynb
new file mode 100644
index 0000000..cc5d938
--- /dev/null
+++ b/rag_source_file_with_pyairbyte_and_pinecone.ipynb
@@ -0,0 +1,395 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "This notebook demonstrates a simple RAG (Retrieval-Augmented Generation) pipeline with Pinecone and PyAirbyte.\n",
+    "The focus is to showcase how to use the `source-file` connector with PyAirbyte."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Prerequisites\n",
+    "- [PyAirbyte](https://airbyte.com/product/pyairbyte)\n",
+    " \n",
+    " PyAirbyte is an open-source library that packages Airbyte connectors and makes them available in Python. In this tutorial, we will use the\n",
+    " `source-file` connector.\n",
+    "- [Pinecone](https://www.pinecone.io/)\n",
+    "- OpenAI API Key\n",
+    " \n",
+    " Go to the [API Keys page](https://platform.openai.com/api-keys) to create a new secret key."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Setup\n",
+    "\n",
+    "Install the dependencies and import them."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install airbyte langchain langchain-openai langchain-pinecone langchainhub openai pinecone-client"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_openai import OpenAIEmbeddings\n",
+    "from langchain_pinecone import PineconeVectorStore\n",
+    "from pinecone import Pinecone, ServerlessSpec\n",
+    "\n",
+    "import json\n",
+    "import os\n",
+    "import time\n",
+    "\n",
+    "\n",
+    "import airbyte as ab\n",
+    "\n",
+    "\n",
+    "# Set your API keys\n",
+    "PINECONE_API_KEY = 'your-pinecone-key'\n",
+    "PINECONE_ENVIRONMENT = 'us-east-1' # e.g., \"us-west1-gcp\"\n",
+    "OPENAI_API_KEY = 'sk-proj-xxxx'\n",
+    "\n",
+    "api_key = os.environ.get(\"PINECONE_API_KEY\")\n",
+    "os.environ[\"PINECONE_API_KEY\"] = PINECONE_API_KEY"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Data Source\n",
+    "\n",
+    "For this quickstart we will extract data containing reviews of a clothing brand. The file is hosted publicly, so there is no need to create any credentials. Find more details about the data here: https://cseweb.ucsd.edu/~jmcauley/pdfs/recsys18e.pdf.\n",
+    "\n",
+    "In this quickstart we extract data in `JSONL` format that is gzip-compressed; we will see how this is reflected in the config below.\n",
+    "You can find the documentation for the Airbyte `source-file` specification [here](https://docs.airbyte.com/integrations/sources/file).\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+      "
Connection check succeeded for `source-file`.\n",
+       "
\n" + ], + "text/plain": [ + "Connection check succeeded for `source-file`.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "source = ab.get_source(\n", + " \"source-file\",\n", + " config={\n", + " \"dataset_name\": \"ModCloth Data\",\n", + " \"format\": \"jsonl\",\n", + " \"url\": \"https://datarepo.eng.ucsd.edu/mcauley_group/data/modcloth/modcloth_final_data.json.gz\",\n", + " \"provider\": {\n", + " \"storage\": \"HTTPS\",\n", + " },\n", + " \"reader_options\": json.dumps(\n", + " {\"compression\": \"Gzip\"}\n", + " ),\n", + " },\n", + ")\n", + "source.check()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see here, there is a `reader_options` that helps us to control how we are going to access the data. There are a lot of options for different file format that covers the common configurations.\n", + "Make sure you check the documentation for more detailed implementation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Extract the data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "## Read Progress\n", + "\n", + "Started reading at 22:15:59.\n", + "\n", + "Read **82,790** records over **14 seconds** (5,913.6 records / second).\n", + "\n", + "Wrote **82,790** records over 9 batches.\n", + "\n", + "Finished reading at 22:16:14.\n", + "\n", + "Started finalizing streams at 22:16:14.\n", + "\n", + "Finalized **9** batches over 0 seconds.\n", + "\n", + "Completed 1 out of 1 streams:\n", + "\n", + " - ModCloth Data\n", + "\n", + "\n", + "Completed writing at 22:16:14. Total time elapsed: 15 seconds\n", + "\n", + "\n", + "------------------------------------------------\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Completed `source-file` read operation at 05:16:14.\n",
+       "
\n" + ], + "text/plain": [ + "Completed `source-file` read operation at \u001b[1;92m05:16:14\u001b[0m.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "source.select_all_streams()\n", + "read_result = source.read()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we are only interested on the reviews." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "reviews = [doc[\"review_text\"] for value in read_result.values() for doc in value if doc[\"review_text\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['I liked the color, the silhouette, and the fabric of this dress. But the ruching just looked bunchy and ruined the whole thing. I was so disappointed, I really waned to like this dress. Runs a little small; I would need to size up to make it workappropriate.',\n", + " \"From the other reviews it seems like this dress either works for your body type or it doesn't. I have a small waist but flabby tummy and this dress is perfect for me! The detail around the front hides everything and the clingyness of the dress makes me look curvier than usual. The material is thick but clings to your bum (enough that when you walk the bum jiggle shows through!) and the slit is a bit high so it's not necessarily office appropriate without tights, but it's a good dress with tights or for an occasion.\",\n", + " \"I love the design and fit of this dress! I wore it to a wedding and was comfortable all evening. The color is really pretty in person too! The fabric quality seems decent but not great so I'm not sure how many washes it will make it through.\",\n", + " \"I bought this dress for work it is flattering and office appropriate. It hits just above my knees and I am pretty short at 5'1. Depending on how you adjust the top it can be a little low cut in the front, especially if you have a short torso. The material is on the thinner side, so should be great for summer/early fall and will work with tights underneath as well. I love it!\",\n", + " 'This is a very professional look. It is Great for work !']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reviews[:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare Pinecone Index and Embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "pc = Pinecone(api_key=PINECONE_API_KEY)\n", + "\n", + "MODCLOTH_INDEX = \"modcloth-reviews-index\"\n", + "\n", + "\n", + "if MODCLOTH_INDEX not in pc.list_indexes().names():\n", + " pc.create_index(\n", + " name=MODCLOTH_INDEX, \n", + " dimension=1536, \n", + " metric='euclidean',\n", + " spec=ServerlessSpec(\n", + " cloud='aws',\n", + " region=PINECONE_ENVIRONMENT\n", + " )\n", + " )\n", + "\n", + " # wait for index to be initialized\n", + " while not pc.describe_index(MODCLOTH_INDEX).status['ready']:\n", + " time.sleep(1)\n", + "\n", + "\n", + "index = pc.Index(MODCLOTH_INDEX)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Populate data for the vector store. For this demo purpose, we just load the first 100 reviews." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "vector_store = PineconeVectorStore.from_texts(\n", + " reviews[:100], embeddings, index_name=MODCLOTH_INDEX\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### RAG Pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For below block of code, you can refer to this LangChain [documentation](https://python.langchain.com/v0.1/docs/use_cases/question_answering/quickstart/). We will just use it here:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_openai import ChatOpenAI\n", + "from langchain import hub\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.runnables import RunnablePassthrough\n", + "\n", + "retriever = vector_store.as_retriever()\n", + "prompt = hub.pull(\"rlm/rag-prompt\")\n", + "llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0, api_key=OPENAI_API_KEY)\n", + "\n", + "\n", + "def format_docs(docs):\n", + " return \"\\n\\n\".join(doc.page_content for doc in docs)\n", + "\n", + "rag_chain = (\n", + " {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n", + " | prompt\n", + " | llm\n", + " | StrOutputParser()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1. \"Cute dress! Very comfy\"\n", + "2. \"Cute dress! Very comfy\"\n", + "3. \"This was a great dress\"\n" + ] + } + ], + "source": [ + "print(rag_chain.invoke(\"Show 3 reviews text related with cute clothes\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Summary" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "PyAirbyte `source-file` provides easy way for use to extract data from some file systems with varied formats. It also offers some flexibilities and options on how we want to extract the data, which is convenient." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pyairbyte-hackathon", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 0b3ecfdc104992c39ecf6d120ed48b72d4659f83 Mon Sep 17 00:00:00 2001 From: Pangeran Bottor Date: Sat, 6 Jul 2024 09:23:03 +0700 Subject: [PATCH 2/3] move to correct folder --- ...rce_file_with_pyairbyte_and_pinecone.ipynb | 395 ++++++++++++++++++ 1 file changed, 395 insertions(+) create mode 100644 pyairbyte_notebooks/rag_source_file_with_pyairbyte_and_pinecone.ipynb diff --git a/pyairbyte_notebooks/rag_source_file_with_pyairbyte_and_pinecone.ipynb b/pyairbyte_notebooks/rag_source_file_with_pyairbyte_and_pinecone.ipynb new file mode 100644 index 0000000..cc5d938 --- /dev/null +++ b/pyairbyte_notebooks/rag_source_file_with_pyairbyte_and_pinecone.ipynb @@ -0,0 +1,395 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "This notebook demonstrates simple RAG (Retrieval-Augmented Generation) pipeline with Pinecone and PyAirbyte.\n", + "The focus is to showcase how to use `source-file` on PyAirbyte." 
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Prerequisites\n",
+    "- [PyAirbyte](https://airbyte.com/product/pyairbyte)\n",
+    " \n",
+    " PyAirbyte is an open-source library that packages Airbyte connectors and makes them available in Python. In this tutorial, we will use the\n",
+    " `source-file` connector.\n",
+    "- [Pinecone](https://www.pinecone.io/)\n",
+    "- OpenAI API Key\n",
+    " \n",
+    " Go to the [API Keys page](https://platform.openai.com/api-keys) to create a new secret key."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Setup\n",
+    "\n",
+    "Install the dependencies and import them."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%pip install airbyte langchain langchain-openai langchain-pinecone langchainhub openai pinecone-client"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from langchain_openai import OpenAIEmbeddings\n",
+    "from langchain_pinecone import PineconeVectorStore\n",
+    "from pinecone import Pinecone, ServerlessSpec\n",
+    "\n",
+    "import json\n",
+    "import os\n",
+    "import time\n",
+    "\n",
+    "\n",
+    "import airbyte as ab\n",
+    "\n",
+    "\n",
+    "# Set your API keys\n",
+    "PINECONE_API_KEY = 'your-pinecone-key'\n",
+    "PINECONE_ENVIRONMENT = 'us-east-1' # e.g., \"us-west1-gcp\"\n",
+    "OPENAI_API_KEY = 'sk-proj-xxxx'\n",
+    "\n",
+    "api_key = os.environ.get(\"PINECONE_API_KEY\")\n",
+    "os.environ[\"PINECONE_API_KEY\"] = PINECONE_API_KEY"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Data Source\n",
+    "\n",
+    "For this quickstart we will extract data containing reviews of a clothing brand. The file is hosted publicly, so there is no need to create any credentials. Find more details about the data here: https://cseweb.ucsd.edu/~jmcauley/pdfs/recsys18e.pdf.\n",
+    "\n",
+    "In this quickstart we extract data in `JSONL` format that is gzip-compressed; we will see how this is reflected in the config below.\n",
+    "You can find the documentation for the Airbyte `source-file` specification [here](https://docs.airbyte.com/integrations/sources/file).\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+      "
Connection check succeeded for `source-file`.\n",
+       "
\n" + ], + "text/plain": [ + "Connection check succeeded for `source-file`.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "source = ab.get_source(\n", + " \"source-file\",\n", + " config={\n", + " \"dataset_name\": \"ModCloth Data\",\n", + " \"format\": \"jsonl\",\n", + " \"url\": \"https://datarepo.eng.ucsd.edu/mcauley_group/data/modcloth/modcloth_final_data.json.gz\",\n", + " \"provider\": {\n", + " \"storage\": \"HTTPS\",\n", + " },\n", + " \"reader_options\": json.dumps(\n", + " {\"compression\": \"Gzip\"}\n", + " ),\n", + " },\n", + ")\n", + "source.check()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "As we can see here, there is a `reader_options` that helps us to control how we are going to access the data. There are a lot of options for different file format that covers the common configurations.\n", + "Make sure you check the documentation for more detailed implementation." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Extract the data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/markdown": [ + "## Read Progress\n", + "\n", + "Started reading at 22:15:59.\n", + "\n", + "Read **82,790** records over **14 seconds** (5,913.6 records / second).\n", + "\n", + "Wrote **82,790** records over 9 batches.\n", + "\n", + "Finished reading at 22:16:14.\n", + "\n", + "Started finalizing streams at 22:16:14.\n", + "\n", + "Finalized **9** batches over 0 seconds.\n", + "\n", + "Completed 1 out of 1 streams:\n", + "\n", + " - ModCloth Data\n", + "\n", + "\n", + "Completed writing at 22:16:14. Total time elapsed: 15 seconds\n", + "\n", + "\n", + "------------------------------------------------\n" + ], + "text/plain": [ + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
Completed `source-file` read operation at 05:16:14.\n",
+       "
\n" + ], + "text/plain": [ + "Completed `source-file` read operation at \u001b[1;92m05:16:14\u001b[0m.\n" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "source.select_all_streams()\n", + "read_result = source.read()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Here, we are only interested on the reviews." + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "reviews = [doc[\"review_text\"] for value in read_result.values() for doc in value if doc[\"review_text\"]]" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "['I liked the color, the silhouette, and the fabric of this dress. But the ruching just looked bunchy and ruined the whole thing. I was so disappointed, I really waned to like this dress. Runs a little small; I would need to size up to make it workappropriate.',\n", + " \"From the other reviews it seems like this dress either works for your body type or it doesn't. I have a small waist but flabby tummy and this dress is perfect for me! The detail around the front hides everything and the clingyness of the dress makes me look curvier than usual. The material is thick but clings to your bum (enough that when you walk the bum jiggle shows through!) and the slit is a bit high so it's not necessarily office appropriate without tights, but it's a good dress with tights or for an occasion.\",\n", + " \"I love the design and fit of this dress! I wore it to a wedding and was comfortable all evening. The color is really pretty in person too! The fabric quality seems decent but not great so I'm not sure how many washes it will make it through.\",\n", + " \"I bought this dress for work it is flattering and office appropriate. It hits just above my knees and I am pretty short at 5'1. Depending on how you adjust the top it can be a little low cut in the front, especially if you have a short torso. The material is on the thinner side, so should be great for summer/early fall and will work with tights underneath as well. I love it!\",\n", + " 'This is a very professional look. It is Great for work !']" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "reviews[:5]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Prepare Pinecone Index and Embedding" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "\n", + "pc = Pinecone(api_key=PINECONE_API_KEY)\n", + "\n", + "MODCLOTH_INDEX = \"modcloth-reviews-index\"\n", + "\n", + "\n", + "if MODCLOTH_INDEX not in pc.list_indexes().names():\n", + " pc.create_index(\n", + " name=MODCLOTH_INDEX, \n", + " dimension=1536, \n", + " metric='euclidean',\n", + " spec=ServerlessSpec(\n", + " cloud='aws',\n", + " region=PINECONE_ENVIRONMENT\n", + " )\n", + " )\n", + "\n", + " # wait for index to be initialized\n", + " while not pc.describe_index(MODCLOTH_INDEX).status['ready']:\n", + " time.sleep(1)\n", + "\n", + "\n", + "index = pc.Index(MODCLOTH_INDEX)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Populate data for the vector store. For this demo purpose, we just load the first 100 reviews." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "vector_store = PineconeVectorStore.from_texts(\n", + " reviews[:100], embeddings, index_name=MODCLOTH_INDEX\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### RAG Pipeline" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "For below block of code, you can refer to this LangChain [documentation](https://python.langchain.com/v0.1/docs/use_cases/question_answering/quickstart/). We will just use it here:" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "from langchain_openai import ChatOpenAI\n", + "from langchain import hub\n", + "from langchain_core.output_parsers import StrOutputParser\n", + "from langchain_core.runnables import RunnablePassthrough\n", + "\n", + "retriever = vector_store.as_retriever()\n", + "prompt = hub.pull(\"rlm/rag-prompt\")\n", + "llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0, api_key=OPENAI_API_KEY)\n", + "\n", + "\n", + "def format_docs(docs):\n", + " return \"\\n\\n\".join(doc.page_content for doc in docs)\n", + "\n", + "rag_chain = (\n", + " {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n", + " | prompt\n", + " | llm\n", + " | StrOutputParser()\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "1. \"Cute dress! Very comfy\"\n", + "2. \"Cute dress! Very comfy\"\n", + "3. \"This was a great dress\"\n" + ] + } + ], + "source": [ + "print(rag_chain.invoke(\"Show 3 reviews text related with cute clothes\"))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Summary" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "PyAirbyte `source-file` provides easy way for use to extract data from some file systems with varied formats. It also offers some flexibilities and options on how we want to extract the data, which is convenient." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "pyairbyte-hackathon", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} From 11695c80d8cafddc49ef7af74544842a4b6019ec Mon Sep 17 00:00:00 2001 From: Pangeran Bottor Date: Sat, 6 Jul 2024 09:24:12 +0700 Subject: [PATCH 3/3] move to correct folder --- ...rce_file_with_pyairbyte_and_pinecone.ipynb | 395 ------------------ 1 file changed, 395 deletions(-) delete mode 100644 rag_source_file_with_pyairbyte_and_pinecone.ipynb diff --git a/rag_source_file_with_pyairbyte_and_pinecone.ipynb b/rag_source_file_with_pyairbyte_and_pinecone.ipynb deleted file mode 100644 index cc5d938..0000000 --- a/rag_source_file_with_pyairbyte_and_pinecone.ipynb +++ /dev/null @@ -1,395 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "This notebook demonstrates simple RAG (Retrieval-Augmented Generation) pipeline with Pinecone and PyAirbyte.\n", - "The focus is to showcase how to use `source-file` on PyAirbyte." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Prerequisites\n", - "- [PyAirbyte](https://airbyte.com/product/pyairbyte)\n", - " \n", - " PyAirbyte is an open-source that packages Airbyte connectors and makes them available in Python. In this tutorial, we will use the \n", - " `source-file`\n", - "- [Pinecone](https://www.pinecone.io/)\n", - "- OpenAI API Key\n", - " \n", - " Go to the [API Keys page](https://platform.openai.com/api-keys) to create the new secret key." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Setup\n", - "\n", - "Install the dependencies and import them." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "%pip install airbyte langchain langchain-openai langchain-pinecone langchainhub openai pinecone-client" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_openai import OpenAIEmbeddings\n", - "from langchain_pinecone import PineconeVectorStore\n", - "from pinecone import Pinecone, ServerlessSpec\n", - "\n", - "import json\n", - "import os\n", - "import time\n", - "\n", - "\n", - "import airbyte as ab\n", - "\n", - "\n", - "# Set your API keys\n", - "PINECONE_API_KEY = 'your-pinecone-key'\n", - "PINECONE_ENVIRONMENT = 'us-east-1' # e.g., \"us-west1-gcp\"\n", - "OPENAI_API_KEY = 'sk-proj-xxxx'\n", - "\n", - "api_key = os.environ.get(\"PINECONE_API_KEY\")\n", - "os.environ[\"PINECONE_API_KEY\"] = PINECONE_API_KEY" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Data Source\n", - "\n", - "For this quickstart purpose we will extract CSV data related to reviews on a clothing brand that being hosted publicly so no need to create any credentials. Find more details about the data: https://cseweb.ucsd.edu/~jmcauley/pdfs/recsys18e.pdf.\n", - "\n", - "In this quickstart we extract data with `JSONL` format and it's being compressed, we will see how it reflects on the config below.\n", - "You can find the documentation related to source file specification for Airbyte [here](https://docs.airbyte.com/integrations/sources/file).\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
Connection check succeeded for `source-file`.\n",
-       "
\n" - ], - "text/plain": [ - "Connection check succeeded for `source-file`.\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "source = ab.get_source(\n", - " \"source-file\",\n", - " config={\n", - " \"dataset_name\": \"ModCloth Data\",\n", - " \"format\": \"jsonl\",\n", - " \"url\": \"https://datarepo.eng.ucsd.edu/mcauley_group/data/modcloth/modcloth_final_data.json.gz\",\n", - " \"provider\": {\n", - " \"storage\": \"HTTPS\",\n", - " },\n", - " \"reader_options\": json.dumps(\n", - " {\"compression\": \"Gzip\"}\n", - " ),\n", - " },\n", - ")\n", - "source.check()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "As we can see here, there is a `reader_options` that helps us to control how we are going to access the data. There are a lot of options for different file format that covers the common configurations.\n", - "Make sure you check the documentation for more detailed implementation." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Extract the data" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "data": { - "text/markdown": [ - "## Read Progress\n", - "\n", - "Started reading at 22:15:59.\n", - "\n", - "Read **82,790** records over **14 seconds** (5,913.6 records / second).\n", - "\n", - "Wrote **82,790** records over 9 batches.\n", - "\n", - "Finished reading at 22:16:14.\n", - "\n", - "Started finalizing streams at 22:16:14.\n", - "\n", - "Finalized **9** batches over 0 seconds.\n", - "\n", - "Completed 1 out of 1 streams:\n", - "\n", - " - ModCloth Data\n", - "\n", - "\n", - "Completed writing at 22:16:14. Total time elapsed: 15 seconds\n", - "\n", - "\n", - "------------------------------------------------\n" - ], - "text/plain": [ - "" - ] - }, - "metadata": {}, - "output_type": "display_data" - }, - { - "data": { - "text/html": [ - "
Completed `source-file` read operation at 05:16:14.\n",
-       "
\n" - ], - "text/plain": [ - "Completed `source-file` read operation at \u001b[1;92m05:16:14\u001b[0m.\n" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "source.select_all_streams()\n", - "read_result = source.read()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Here, we are only interested on the reviews." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "reviews = [doc[\"review_text\"] for value in read_result.values() for doc in value if doc[\"review_text\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "['I liked the color, the silhouette, and the fabric of this dress. But the ruching just looked bunchy and ruined the whole thing. I was so disappointed, I really waned to like this dress. Runs a little small; I would need to size up to make it workappropriate.',\n", - " \"From the other reviews it seems like this dress either works for your body type or it doesn't. I have a small waist but flabby tummy and this dress is perfect for me! The detail around the front hides everything and the clingyness of the dress makes me look curvier than usual. The material is thick but clings to your bum (enough that when you walk the bum jiggle shows through!) and the slit is a bit high so it's not necessarily office appropriate without tights, but it's a good dress with tights or for an occasion.\",\n", - " \"I love the design and fit of this dress! I wore it to a wedding and was comfortable all evening. The color is really pretty in person too! The fabric quality seems decent but not great so I'm not sure how many washes it will make it through.\",\n", - " \"I bought this dress for work it is flattering and office appropriate. It hits just above my knees and I am pretty short at 5'1. Depending on how you adjust the top it can be a little low cut in the front, especially if you have a short torso. The material is on the thinner side, so should be great for summer/early fall and will work with tights underneath as well. I love it!\",\n", - " 'This is a very professional look. It is Great for work !']" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "reviews[:5]" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Prepare Pinecone Index and Embedding" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "embeddings = OpenAIEmbeddings(api_key=OPENAI_API_KEY)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "\n", - "pc = Pinecone(api_key=PINECONE_API_KEY)\n", - "\n", - "MODCLOTH_INDEX = \"modcloth-reviews-index\"\n", - "\n", - "\n", - "if MODCLOTH_INDEX not in pc.list_indexes().names():\n", - " pc.create_index(\n", - " name=MODCLOTH_INDEX, \n", - " dimension=1536, \n", - " metric='euclidean',\n", - " spec=ServerlessSpec(\n", - " cloud='aws',\n", - " region=PINECONE_ENVIRONMENT\n", - " )\n", - " )\n", - "\n", - " # wait for index to be initialized\n", - " while not pc.describe_index(MODCLOTH_INDEX).status['ready']:\n", - " time.sleep(1)\n", - "\n", - "\n", - "index = pc.Index(MODCLOTH_INDEX)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "Populate data for the vector store. For this demo purpose, we just load the first 100 reviews." 
- ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "vector_store = PineconeVectorStore.from_texts(\n", - " reviews[:100], embeddings, index_name=MODCLOTH_INDEX\n", - ")" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### RAG Pipeline" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For below block of code, you can refer to this LangChain [documentation](https://python.langchain.com/v0.1/docs/use_cases/question_answering/quickstart/). We will just use it here:" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "from langchain_openai import ChatOpenAI\n", - "from langchain import hub\n", - "from langchain_core.output_parsers import StrOutputParser\n", - "from langchain_core.runnables import RunnablePassthrough\n", - "\n", - "retriever = vector_store.as_retriever()\n", - "prompt = hub.pull(\"rlm/rag-prompt\")\n", - "llm = ChatOpenAI(model_name=\"gpt-3.5-turbo\", temperature=0, api_key=OPENAI_API_KEY)\n", - "\n", - "\n", - "def format_docs(docs):\n", - " return \"\\n\\n\".join(doc.page_content for doc in docs)\n", - "\n", - "rag_chain = (\n", - " {\"context\": retriever | format_docs, \"question\": RunnablePassthrough()}\n", - " | prompt\n", - " | llm\n", - " | StrOutputParser()\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "1. \"Cute dress! Very comfy\"\n", - "2. \"Cute dress! Very comfy\"\n", - "3. \"This was a great dress\"\n" - ] - } - ], - "source": [ - "print(rag_chain.invoke(\"Show 3 reviews text related with cute clothes\"))" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Summary" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "PyAirbyte `source-file` provides easy way for use to extract data from some file systems with varied formats. It also offers some flexibilities and options on how we want to extract the data, which is convenient." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "pyairbyte-hackathon", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.11.0" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}