Modified the Structure of the project
AnasAber committed Aug 2, 2024
1 parent 1437c26 commit 3ab5597
Showing 29 changed files with 571 additions and 185 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,4 +1,5 @@
.env
FlagEmbedding-1.2.10/
rag/
testing.py
t.py
s.txt
12 changes: 12 additions & 0 deletions README.md
@@ -26,6 +26,18 @@ Retrieved chunks are reranked using the Cohere API to ensure the most relevant c
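The reranking step above relies on Cohere's rerank endpoint. A minimal sketch of that call follows; the model name and `top_n` are assumptions, and the exact response shape depends on the SDK version:

```python
import os
import cohere

co = cohere.Client(os.getenv("COHERE_API_KEY"))

query = "What is hybrid search?"
chunks = ["chunk one...", "chunk two...", "chunk three..."]  # retrieved chunks

# Ask Cohere to score each chunk against the query and keep the best 3.
reranked = co.rerank(
    model="rerank-english-v3.0",  # assumed model name
    query=query,
    documents=chunks,
    top_n=3,
)
top_chunks = [chunks[r.index] for r in reranked.results]
```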

The top-ranked chunks are passed to the Llama model (via Groq API) to generate a coherent and relevant response.
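A minimal sketch of that generation call via the Groq SDK (the model id and prompt template here are assumptions, not the project's exact values):

```python
import os
from groq import Groq

client = Groq(api_key=os.getenv("GROQ_API_KEY"))

context = "Chunk 1: ...\n\nChunk 2: ..."  # the reranked chunks, formatted
question = "What is hybrid search?"

# One chat completion against a Llama model hosted on Groq.
response = client.chat.completions.create(
    model="llama3-8b-8192",  # assumed model id
    messages=[{
        "role": "user",
        "content": f"Answer using only this context:\n{context}\n\nQuestion: {question}",
    }],
)
print(response.choices[0].message.content)
```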

### How to start

1. Clone the repository
`git clone https://github.com/AnasAber/RAG_in_CPU.git`
2. Activate the virtual environment:
for Windows: `./rag/Scripts/activate`
for MacOS/Linux: `source rag/bin/activate`
3. Run the `app.py` file

I'm using a virtual environment to avoid dependency conflicts (I had to manually patch a few configuration files) and to make sure the project runs smoothly.


This project's RAG currently uses semantic search with ChromaDB. I plan to combine Hybrid Search with HyDE (sketched below), following the RAG best practices described in this paper: [link](https://arxiv.org/html/2407.01219v1#:~:text=A%20typical%20RAG%20workflow%20usually,based%20on%20their%20relevance%20to)
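A rough sketch of how the planned HyDE step could slot in. This is a hypothetical helper, not code from the repository; it assumes the Groq client and a persisted ChromaDB store as above:

```python
def hyde_search(question, chroma_db, client, k=5):
    # HyDE: draft a hypothetical answer first, then embed *that* instead of
    # the raw question, since answers live closer to the relevant chunks.
    draft = client.chat.completions.create(
        model="llama3-8b-8192",  # assumed model id
        messages=[{"role": "user",
                   "content": f"Write a short passage that answers: {question}"}],
    ).choices[0].message.content
    # ChromaDB embeds the draft with its configured embedding function
    # and returns the nearest real chunks.
    return chroma_db.similarity_search(draft, k=k)
```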

![System Architecture Diagram](images/x1.png)
Binary file modified __pycache__/chroma_search_functions.cpython-310.pyc
Binary file modified __pycache__/get_embeddings.cpython-310.pyc
47 changes: 33 additions & 14 deletions query_data.py → app.py
@@ -1,23 +1,17 @@
import get_embeddings
import chroma_search_functions as csf
import src.data_processing.get_embeddings
from data.process_data import load_documents, embed_and_store_documents, split_documents
from langchain.prompts import ChatPromptTemplate
from transformers import pipeline
from langchain_community.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import transformers
from langchain.llms import HuggingFacePipeline
from groq import Groq
import os
from src.database.chroma_search_functions import get_relevant_data

"""
Importing the functions and setting up the environment variables
"""

CHROMA_PATH = "chroma/"
DATA_PATH = "data"

(load_documents, split_documents, embed_and_store_documents, retrieve_documents, get_relevant_data, add_to_chroma_db, get_chroma_db) = csf.main()
DATA_PATH = "data/raw"

client = Groq(
    api_key=os.getenv("GROQ_API_KEY"),
@@ -27,6 +21,11 @@
"""
Again, if we want to load a Hugging Face model and tokenizer, we can do it like this:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import transformers
from langchain.llms import HuggingFacePipeline
model_name = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
@@ -53,18 +52,38 @@
def format_context(context):
    return "\n\n".join([f"Chunk {i+1}: {chunk}" for i, chunk in enumerate(context)])

def check_and_process_documents():
    path = "data/processed/chroma"
    print(f"Checking if path exists: {path}")

    if not os.path.exists(path):
        print(f"Path does not exist: {path}")

        documents = load_documents()
        print("Documents loaded")

        chunks = split_documents(documents)
        print("Documents split into chunks")

        embed_and_store_documents(chunks)
        print("Documents embedded and stored")
    else:
        print(f"Path already exists: {path}")

def main():
    """
    Loading documents should be performed only once; it will take a bit of time at first.
    You can comment this out once ChromaDB already has the data.
    """
    check_and_process_documents()



Expand Down
154 changes: 0 additions & 154 deletions chroma_search_functions.py

This file was deleted.

Binary file added data/__pycache__/process_data.cpython-310.pyc
53 changes: 53 additions & 0 deletions data/process_data.py
@@ -0,0 +1,53 @@
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.schema.document import Document
import os
from src.data_processing.get_embeddings import get_embeddings
from dotenv import load_dotenv


load_dotenv()

"""
Initializing the APIs and setting up the environment variables
"""

api_key = os.getenv("COHERE_API_KEY")

CHROMA_PATH = "data/processed/chroma"
DATA_PATH = "data"


def load_documents():
    document_loader = PyPDFDirectoryLoader(DATA_PATH)
    print("Loading documents...")
    return document_loader.load()


def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )
    docs = []
    print("Splitting documents...")
    for document in documents:
        for chunk in text_splitter.split_text(document.page_content):
            docs.append(Document(page_content=chunk, metadata={"source": document.metadata["source"]}))
    print("Documents split successfully.")
    return docs


def embed_and_store_documents(chunks):
    chroma_db = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=get_embeddings()
    )
    print("Storing documents...")
    # The Chroma instance already carries the persist directory and embedding function.
    chroma_db.add_documents(chunks)
    chroma_db.persist()
    print("Documents stored successfully.")

File renamed without changes.
File renamed without changes.
File renamed without changes.