diff --git a/.gitignore b/.gitignore index c4c8c5c..eb3abb3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .env FlagEmbedding-1.2.10/ -rag/ testing.py +t.py +s.txt diff --git a/README.md b/README.md index 244b466..6104d46 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,18 @@ Retrieved chunks are reranked using the Cohere API to ensure the most relevant c The top-ranked chunks are passed to the Llama model (via Groq API) to generate a coherent and relevant response. +### How to start + +1. Clone the repository + `git clone https://github.com/AnasAber/RAG_in_CPU.git` +2. Activate the virtual environment: + For Windows: `./rag/Scripts/activate` + For macOS/Linux: `source venv/bin/activate` +3. Run the `app.py` file + +I'm using a virtual environment to avoid conflicts with the dependencies (I had to manually change things in the configuration files) and to make sure the project runs smoothly. + + This project's RAG uses semantic search using ChromaDB, I'll work on doing a combination of Hybrid Search and a HyDE following the best practices of RAG mentioned in the following paper: [link](https://arxiv.org/html/2407.01219v1#:~:text=A%20typical%20RAG%20workflow%20usually,based%20on%20their%20relevance%20to) ![System Architecture Diagram](images/x1.png) diff --git a/__pycache__/chroma_search_functions.cpython-310.pyc b/__pycache__/chroma_search_functions.cpython-310.pyc index 5c91055..66abf60 100644 Binary files a/__pycache__/chroma_search_functions.cpython-310.pyc and b/__pycache__/chroma_search_functions.cpython-310.pyc differ diff --git a/__pycache__/get_embeddings.cpython-310.pyc b/__pycache__/get_embeddings.cpython-310.pyc index 8d2eb50..4d4ff5e 100644 Binary files a/__pycache__/get_embeddings.cpython-310.pyc and b/__pycache__/get_embeddings.cpython-310.pyc differ diff --git a/query_data.py b/app.py similarity index 79% rename from query_data.py rename to app.py index 8cdc067..9fd454a 100644 --- a/query_data.py +++ b/app.py @@ -1,13 +1,9 @@ -import get_embeddings -import chroma_search_functions as csf +import src.data_processing.get_embeddings +from data.process_data import load_documents, embed_and_store_documents, split_documents from langchain.prompts import ChatPromptTemplate -from transformers import pipeline -from langchain_community.vectorstores import Chroma -from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig -import transformers -from langchain.llms import HuggingFacePipeline from groq import Groq import os +from src.database.chroma_search_functions import get_relevant_data """ Importing the functions and setting up the environment variables @@ -15,9 +11,7 @@ """ CHROMA_PATH = "chroma/" -DATA_PATH = "data" - -(load_documents, split_documents, embed_and_store_documents, retrieve_documents, get_relevant_data, add_to_chroma_db, get_chroma_db) = csf.main() +DATA_PATH = "data/raw" client = Groq( api_key=os.getenv("GROQ_API_KEY"), @@ -27,6 +21,11 @@ """ Again, if we want to load a huggingFace model and tokenizer, we can do it like this: +from transformers import pipeline +from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig +import transformers +from langchain.llms import HuggingFacePipeline + model_name = "microsoft/Phi-3-mini-4k-instruct" tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForCausalLM.from_pretrained( @@ -53,6 +52,24 @@ def format_context(context): return "\n\n".join([f"Chunk {i+1}: {chunk}" for i, chunk in enumerate(context)]) +def
check_and_process_documents(): + path = "data/processed/chroma" + print(f"Checking if path exists: {path}") + + if not os.path.exists(path): + print(f"Path does not exist: {path}") + + documents = load_documents() + print("Documents loaded") + + chunks = split_documents(documents) + print("Documents split into chunks") + + embed_and_store_documents(chunks) + print("Documents embedded and stored") + else: + print(f"Path already exists: {path}") + def main(): """ @@ -60,11 +77,13 @@ def main(): You can comment them out as chromaDB has the infos already """ + check_and_process_documents() - documents = load_documents() - chunks = split_documents(documents) - embed_and_store_documents(chunks) - print("Documents loaded, split, and stored") + if not os.path.exists("data/processed/chroma"): + documents = load_documents() + chunks = split_documents(documents) + embed_and_store_documents(chunks) + print("Documents loaded, split, and stored") diff --git a/chroma_search_functions.py b/chroma_search_functions.py deleted file mode 100644 index 3f3a680..0000000 --- a/chroma_search_functions.py +++ /dev/null @@ -1,154 +0,0 @@ -from langchain_community.vectorstores import Chroma -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain_community.document_loaders import PyPDFDirectoryLoader -from langchain.schema.document import Document -# from FlagEmbedding.flag_models import FlagModel -# from FlagEmbedding.flag_reranker import FlagReranker -from get_embeddings import get_embeddings -import uuid -import os -from dotenv import load_dotenv -import cohere - -load_dotenv() - -""" - Initializating the APIs and setting up the environment variables - -""" - -api_key = os.getenv("COHERE_API_KEY") - -CHROMA_PATH = "chroma" -DATA_PATH = "data" - -# init client -co = cohere.Client(api_key=api_key) - - - - -# load the data -def get_chroma_db(get_embeddings=get_embeddings): - return Chroma(persist_directory=CHROMA_PATH, embedding_function=get_embeddings()) - - -def main(): - - - def load_documents(): - document_loader = PyPDFDirectoryLoader("data/") - print("Loading documents...") - return document_loader.load() - - - def split_documents(documents): - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=500, - chunk_overlap=100, - length_function=len, - is_separator_regex=False, - ) - docs = [] - print("Splitting documents...") - for document in documents: - for chunk in text_splitter.split_text(document.page_content): - docs.append(Document(page_content=chunk, metadata={"source": document.metadata["source"]})) - print("Documents split successfully.") - return docs - - - def embed_and_store_documents(chunks): - chroma_db = Chroma( - persist_directory=CHROMA_PATH, embedding_function=get_embeddings() - ) - print("Storing documents...") - chroma_db.add_documents(chunks, persist_directory=CHROMA_PATH,embeddings=get_embeddings()) - chroma_db.persist() - print("Documents stored successfully.") - - - - def retrieve_documents(query, top_k=5): - chroma_db = get_chroma_db() - print("#"*100 + "\n\n") - - print("Retrieving documents...") - results = chroma_db.similarity_search_with_score(query, top_k) - context_text= "\n\n---\n\n".join([doc.page_content for doc, _score in results]) - - print("Documents before reranking: ", context_text) - - return context_text - - - """ - If you want to use the FlagReranker to rerank the retrieved documents, you can use the following code snippet: - - reranker = FlagModel("BAAI/bge-reranker-v2-m3", use_fp16=True) - - def reranked_documents(query, 
retrieved_chunks, top_k=3): - reranked_chunks = reranker.predict(query, retrieved_chunks) - return [chunk for chunk, _ in reranked_chunks[:top_k]] - - Initialize the FlagReranker - reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True) - - - I'll personally use the cohere API to rerank the documents. - """ - - - def format_context(context): - return "\n\n".join([f"Chunk {i+1}: {chunk}" for i, chunk in enumerate(context)]) - - - def reranked_documents(query, long_string, top_k=3): - # Split the long string into individual chunks using '\n\n---\n\n' as the separator - chunks = long_string.split("\n\n---\n\n") - - # Ensure all chunks are valid (non-empty) and strip leading/trailing whitespace - valid_chunks = [chunk.strip() for chunk in chunks if chunk.strip()] - - if not valid_chunks: - print("No valid chunks to rerank.") - return [] - - # Use the cohere rerank API - rerank_docs = co.rerank( - query=query, - documents=valid_chunks, - top_n=top_k, - model="rerank-english-v2.0" - ) - - print("#"*100 + "\n\n") - # Extract and print reranked chunks using the indices from the rerank response - reranked_chunks = [valid_chunks[result.index] for result in rerank_docs.results] - print("Reranked Chunks:\n\n", format_context(reranked_chunks)) - - return reranked_chunks - - - def get_relevant_data(query): - retrieved_chunks = retrieve_documents(query) - reranked_chunks = reranked_documents(query, retrieved_chunks) - return reranked_chunks - - - - def add_to_chroma_db(reranked_chunks): - chroma_db = get_chroma_db() - chroma_db.add_documents(reranked_chunks) - chroma_db.persist() - - - return load_documents, split_documents, embed_and_store_documents, retrieve_documents, get_relevant_data, add_to_chroma_db, get_chroma_db - - -if __name__ == "__main__": - main() - - - - diff --git a/data/__pycache__/process_data.cpython-310.pyc b/data/__pycache__/process_data.cpython-310.pyc new file mode 100644 index 0000000..b79cb65 Binary files /dev/null and b/data/__pycache__/process_data.cpython-310.pyc differ diff --git a/data/process_data.py b/data/process_data.py new file mode 100644 index 0000000..5c4b010 --- /dev/null +++ b/data/process_data.py @@ -0,0 +1,53 @@ +from langchain_community.vectorstores import Chroma +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.document_loaders import PyPDFDirectoryLoader +from langchain.schema.document import Document +import os +from src.data_processing.get_embeddings import get_embeddings +from dotenv import load_dotenv + + +load_dotenv() + +""" + Initializating the APIs and setting up the environment variables + +""" + +api_key = os.getenv("COHERE_API_KEY") + +CHROMA_PATH = "data/processed/chroma" +DATA_PATH = "data" + + +def load_documents(): + document_loader = PyPDFDirectoryLoader("data/") + print("Loading documents...") + return document_loader.load() + + +def split_documents(documents): + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=500, + chunk_overlap=100, + length_function=len, + is_separator_regex=False, + ) + docs = [] + print("Splitting documents...") + for document in documents: + for chunk in text_splitter.split_text(document.page_content): + docs.append(Document(page_content=chunk, metadata={"source": document.metadata["source"]})) + print("Documents split successfully.") + return docs + + +def embed_and_store_documents(chunks): + chroma_db = Chroma( + persist_directory=CHROMA_PATH, embedding_function=get_embeddings() + ) + print("Storing documents...") + chroma_db.add_documents(chunks, 
persist_directory=CHROMA_PATH,embeddings=get_embeddings()) + chroma_db.persist() + print("Documents stored successfully.") + diff --git a/chroma/1bf717b4-2400-4375-8363-fca32070ff78/data_level0.bin b/data/processed/chroma/be23e90f-70b1-487f-afee-13fec0ce30f7/data_level0.bin similarity index 100% rename from chroma/1bf717b4-2400-4375-8363-fca32070ff78/data_level0.bin rename to data/processed/chroma/be23e90f-70b1-487f-afee-13fec0ce30f7/data_level0.bin diff --git a/chroma/1bf717b4-2400-4375-8363-fca32070ff78/header.bin b/data/processed/chroma/be23e90f-70b1-487f-afee-13fec0ce30f7/header.bin similarity index 100% rename from chroma/1bf717b4-2400-4375-8363-fca32070ff78/header.bin rename to data/processed/chroma/be23e90f-70b1-487f-afee-13fec0ce30f7/header.bin diff --git a/chroma/1bf717b4-2400-4375-8363-fca32070ff78/length.bin b/data/processed/chroma/be23e90f-70b1-487f-afee-13fec0ce30f7/length.bin similarity index 98% rename from chroma/1bf717b4-2400-4375-8363-fca32070ff78/length.bin rename to data/processed/chroma/be23e90f-70b1-487f-afee-13fec0ce30f7/length.bin index 1dc89f8..f5fee2d 100644 Binary files a/chroma/1bf717b4-2400-4375-8363-fca32070ff78/length.bin and b/data/processed/chroma/be23e90f-70b1-487f-afee-13fec0ce30f7/length.bin differ diff --git a/chroma/1bf717b4-2400-4375-8363-fca32070ff78/link_lists.bin b/data/processed/chroma/be23e90f-70b1-487f-afee-13fec0ce30f7/link_lists.bin similarity index 100% rename from chroma/1bf717b4-2400-4375-8363-fca32070ff78/link_lists.bin rename to data/processed/chroma/be23e90f-70b1-487f-afee-13fec0ce30f7/link_lists.bin diff --git a/chroma/chroma.sqlite3 b/data/processed/chroma/chroma.sqlite3 similarity index 85% rename from chroma/chroma.sqlite3 rename to data/processed/chroma/chroma.sqlite3 index 8c82ad4..551ecd7 100644 Binary files a/chroma/chroma.sqlite3 and b/data/processed/chroma/chroma.sqlite3 differ diff --git a/data/ReAct for LLMs.pdf b/data/raw/ReAct for LLMs.pdf similarity index 100% rename from data/ReAct for LLMs.pdf rename to data/raw/ReAct for LLMs.pdf diff --git a/data/generative ai.pdf b/data/raw/generative ai.pdf similarity index 100% rename from data/generative ai.pdf rename to data/raw/generative ai.pdf diff --git a/data/monopoly.pdf b/data/raw/monopoly.pdf similarity index 100% rename from data/monopoly.pdf rename to data/raw/monopoly.pdf diff --git a/requirements.txt b/requirements.txt index d1aad4c..6eb2b21 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,163 @@ -pypdf +accelerate +aiohttp +annotated-types +anyio +asgiref +async-timeout +attrs +backoff +bcrypt +boto3 +botocore +build +cachetools +certifi +charset-normalizer +chroma-hnswlib +chromadb +click +cohere +colorama +coloredlogs +dataclasses-json +datasets +Deprecated +dill +distro +dnspython +einops +email_validator +exceptiongroup +fastapi +fastapi-cli +fastavro +filelock +FlagEmbedding +flatbuffers +frozenlist +fsspec +google-auth +googleapis-common-protos +greenlet +groq +grpcio +h11 +httpcore +httptools +httpx +httpx-sse +huggingface-hub +humanfriendly +idna +importlib_metadata +importlib_resources +iniconfig +intel-openmp +Jinja2 +jmespath +joblib +jsonpatch +jsonpointer +kubernetes langchain langchain-community -chromadb +langchain-core +langchain-text-splitters +langsmith +markdown-it-py +MarkupSafe +marshmallow +mdurl +mkl +mmh3 +monotonic +mpmath +multidict +multiprocess +mypy-extensions +networkx numpy -scikit-learn +oauthlib +onnx +onnxruntime +opentelemetry-api +opentelemetry-exporter-otlp-proto-common +opentelemetry-exporter-otlp-proto-grpc 
+opentelemetry-instrumentation +opentelemetry-instrumentation-asgi +opentelemetry-instrumentation-fastapi +opentelemetry-proto +opentelemetry-sdk +opentelemetry-semantic-conventions +opentelemetry-util-http +orjson +overrides +packaging +pandas +parameterized +pillow +pip +pluggy +posthog +protobuf +psutil +pyarrow +pyarrow-hotfix +pyasn1 +pyasn1_modules +pydantic +pydantic_core +Pygments +pypdf +PyPika +pyproject_hooks +pyreadline3 pytest +python-dateutil python-dotenv -transformers -FlagEmbedding -einops -joblib +python-multipart +pytz +PyYAML +regex +requests +requests-oauthlib +rich +rsa +s3transfer +safetensors +scikit-learn +scipy +sentence-transformers +setuptools +shellingham +six +sniffio +SQLAlchemy +starlette +sympy +tbb +tenacity +threadpoolctl +tokenizers +tomli torch -onnx -onnxruntime -cohere -groq \ No newline at end of file +torchaudio +torchvision +tqdm +transformers +typer +types-requests +typing_extensions +typing-inspect +tzdata +ujson +urllib3 +uvicorn +watchfiles +websocket-client +websockets +wheel +wrapt +xxhash +yarl +zipp diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..42ba147 --- /dev/null +++ b/setup.py @@ -0,0 +1,174 @@ +from setuptools import setup, find_packages + +setup( + name='RAG_for_CPU', + version='1.0', + author='Anas Aberchih', + author_email='anas.aberchih1@gmail.com', + packages=find_packages(), + install_requires=[ + 'accelerate', + 'aiohttp', + 'annotated-types', + 'anyio', + 'asgiref', + 'async-timeout', + 'attrs', + 'backoff', + 'bcrypt', + 'boto3', + 'botocore', + 'build', + 'cachetools', + 'certifi', + 'charset-normalizer', + 'chroma-hnswlib', + 'chromadb', + 'click', + 'cohere', + 'colorama', + 'coloredlogs', + 'dataclasses-json', + 'datasets', + 'Deprecated', + 'dill', + 'distro', + 'dnspython', + 'einops', + 'email_validator', + 'exceptiongroup', + 'fastapi', + 'fastapi-cli', + 'fastavro', + 'filelock', + 'FlagEmbedding', + 'flatbuffers', + 'frozenlist', + 'fsspec', + 'google-auth', + 'googleapis-common-protos', + 'greenlet', + 'groq', + 'grpcio', + 'h11', + 'httpcore', + 'httptools', + 'httpx', + 'httpx-sse', + 'huggingface-hub', + 'humanfriendly', + 'idna', + 'importlib_metadata', + 'importlib_resources', + 'iniconfig', + 'intel-openmp', + 'Jinja2', + 'jmespath', + 'joblib', + 'jsonpatch', + 'jsonpointer', + 'kubernetes', + 'langchain', + 'langchain-community', + 'langchain-core', + 'langchain-text-splitters', + 'langsmith', + 'markdown-it-py', + 'MarkupSafe', + 'marshmallow', + 'mdurl', + 'mkl', + 'mmh3', + 'monotonic', + 'mpmath', + 'multidict', + 'multiprocess', + 'mypy-extensions', + 'networkx', + 'numpy', + 'oauthlib', + 'onnx', + 'onnxruntime', + 'opentelemetry-api', + 'opentelemetry-exporter-otlp-proto-common', + 'opentelemetry-exporter-otlp-proto-grpc', + 'opentelemetry-instrumentation', + 'opentelemetry-instrumentation-asgi', + 'opentelemetry-instrumentation-fastapi', + 'opentelemetry-proto', + 'opentelemetry-sdk', + 'opentelemetry-semantic-conventions', + 'opentelemetry-util-http', + 'orjson', + 'overrides', + 'packaging', + 'pandas', + 'parameterized', + 'pillow', + 'pip', + 'pluggy', + 'posthog', + 'protobuf', + 'psutil', + 'pyarrow', + 'pyarrow-hotfix', + 'pyasn1', + 'pyasn1_modules', + 'pydantic', + 'pydantic_core', + 'Pygments', + 'pypdf', + 'PyPika', + 'pyproject_hooks', + 'pyreadline3', + 'pytest', + 'python-dateutil', + 'python-dotenv', + 'python-multipart', + 'pytz', + 'PyYAML', + 'regex', + 'requests', + 'requests-oauthlib', + 'rich', + 'rsa', + 's3transfer', + 'safetensors', + 'scikit-learn', + 
'scipy', + 'sentence-transformers', + 'setuptools', + 'shellingham', + 'six', + 'sniffio', + 'SQLAlchemy', + 'starlette', + 'sympy', + 'tbb', + 'tenacity', + 'threadpoolctl', + 'tokenizers', + 'tomli', + 'torch', + 'torchaudio', + 'torchvision', + 'tqdm', + 'transformers', + 'typer', + 'types-requests', + 'typing_extensions', + 'typing-inspect', + 'tzdata', + 'ujson', + 'urllib3', + 'uvicorn', + 'watchfiles', + 'websocket-client', + 'websockets', + 'wheel', + 'wrapt', + 'xxhash', + 'yarl', + 'zipp', + ], +) diff --git a/src/data_processing/__init__.py b/src/data_processing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/data_processing/__pycache__/__init__.cpython-310.pyc b/src/data_processing/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..8c5dbd5 Binary files /dev/null and b/src/data_processing/__pycache__/__init__.cpython-310.pyc differ diff --git a/src/data_processing/__pycache__/get_embeddings.cpython-310.pyc b/src/data_processing/__pycache__/get_embeddings.cpython-310.pyc new file mode 100644 index 0000000..f3788d2 Binary files /dev/null and b/src/data_processing/__pycache__/get_embeddings.cpython-310.pyc differ diff --git a/get_embeddings.py b/src/data_processing/get_embeddings.py similarity index 94% rename from get_embeddings.py rename to src/data_processing/get_embeddings.py index 6657139..916df65 100644 --- a/get_embeddings.py +++ b/src/data_processing/get_embeddings.py @@ -33,10 +33,5 @@ def get_embeddings(text=None): embeddings = HuggingFaceEmbeddings(model_name="nomic-ai/nomic-embed-text-v1") return embeddings -__all__ = ['get_embeddings', 'get_embeddings_query'] - - - - # (1, 8, 1024) diff --git a/src/database/__pycache__/chroma_search_functions.cpython-310.pyc b/src/database/__pycache__/chroma_search_functions.cpython-310.pyc new file mode 100644 index 0000000..f675943 Binary files /dev/null and b/src/database/__pycache__/chroma_search_functions.cpython-310.pyc differ diff --git a/src/database/chroma_search_functions.py b/src/database/chroma_search_functions.py new file mode 100644 index 0000000..6a887d4 --- /dev/null +++ b/src/database/chroma_search_functions.py @@ -0,0 +1,114 @@ +from langchain_community.vectorstores import Chroma +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.document_loaders import PyPDFDirectoryLoader +from langchain.schema.document import Document +# from FlagEmbedding.flag_models import FlagModel +# from FlagEmbedding.flag_reranker import FlagReranker +from src.data_processing.get_embeddings import get_embeddings +import uuid +import os +from dotenv import load_dotenv +import cohere + +load_dotenv() + +""" + Initializating the APIs and setting up the environment variables + +""" + +api_key = os.getenv("COHERE_API_KEY") + +CHROMA_PATH = "data/processed/chroma" + +# init client +co = cohere.Client(api_key) + + + + +# load the data +def get_chroma_db(get_embeddings=get_embeddings): + return Chroma(persist_directory=CHROMA_PATH, embedding_function=get_embeddings()) + + + +def retrieve_documents(query, top_k=5): + chroma_db = get_chroma_db() + print("#"*100 + "\n\n") + + print("Retrieving documents...") + results = chroma_db.similarity_search_with_score(query, top_k) + context_text= "\n\n---\n\n".join([doc.page_content for doc, _score in results]) + + print("Documents before reranking: ", context_text) + + return context_text + + +""" +If you want to use the FlagReranker to rerank the retrieved documents, you can use the following code snippet: + + reranker = 
FlagModel("BAAI/bge-reranker-v2-m3", use_fp16=True) + + def reranked_documents(query, retrieved_chunks, top_k=3): + reranked_chunks = reranker.predict(query, retrieved_chunks) + return [chunk for chunk, _ in reranked_chunks[:top_k]] + + Initialize the FlagReranker + reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True) + + +I'll personally use the cohere API to rerank the documents. +""" + + +def format_context(context): + return "\n\n".join([f"Chunk {i+1}: {chunk}" for i, chunk in enumerate(context)]) + + +def reranked_documents(query, long_string, top_k=3): + # Split the long string into individual chunks using '\n\n---\n\n' as the separator + chunks = long_string.split("\n\n---\n\n") + + # Ensure all chunks are valid (non-empty) and strip leading/trailing whitespace + valid_chunks = [chunk.strip() for chunk in chunks if chunk.strip()] + + if not valid_chunks: + print("No valid chunks to rerank.") + return [] + + # Use the cohere rerank API + rerank_docs = co.rerank( + query=query, + documents=valid_chunks, + top_n=top_k, + model="rerank-english-v2.0" + ) + + print("#"*100 + "\n\n") + # Extract and print reranked chunks using the indices from the rerank response + reranked_chunks = [valid_chunks[result.index] for result in rerank_docs.results] + print("Reranked Chunks:\n\n", format_context(reranked_chunks)) + + return reranked_chunks + + +def get_relevant_data(query): + retrieved_chunks = retrieve_documents(query) + reranked_chunks = reranked_documents(query, retrieved_chunks) + return reranked_chunks + + + +def add_to_chroma_db(reranked_chunks): + chroma_db = get_chroma_db() + chroma_db.add_documents(reranked_chunks) + chroma_db.persist() + + + + + + + diff --git a/images/RAG_in_CPU.gif b/src/images/RAG_in_CPU.gif similarity index 100% rename from images/RAG_in_CPU.gif rename to src/images/RAG_in_CPU.gif diff --git a/images/x1.png b/src/images/x1.png similarity index 100% rename from images/x1.png rename to src/images/x1.png diff --git a/src/processed/chroma/chroma.sqlite3 b/src/processed/chroma/chroma.sqlite3 new file mode 100644 index 0000000..87d0279 Binary files /dev/null and b/src/processed/chroma/chroma.sqlite3 differ diff --git a/embedding_model_choice.py b/src/tests/embedding_model_choice.py similarity index 100% rename from embedding_model_choice.py rename to src/tests/embedding_model_choice.py diff --git a/structure.txt b/structure.txt new file mode 100644 index 0000000..99747c7 --- /dev/null +++ b/structure.txt @@ -0,0 +1,26 @@ +NLP/ +├── data/ +│ ├── processed/ # Processed data and embeddings +│ ├── raw/ # Raw data +│ ├── process_data.py # Functions to process data +├── src/ +│ ├── __init__.py +│ ├── data_processing/ +│ │ ├── __init__.py +│ │ ├── get_embeddings.py # Functions to generate embeddings +│ ├── database/ +│ │ ├── __init__.py +│ │ ├── chroma_search_functions.py # ChromaDB functions to retrieve responses +│ ├── models/ +│ │ ├── __init__.py +│ │ ├── models.py # All the declared models (❌) +│ │ ├── main_reasoning.py # Main project (Not yet set up ❌) +│ ├── app.py # Main application logic +├── tests/ +│ ├── __init__.py +│ ├── embedding_model_choice.py # Tests for choosing the right embedding model +├── requirements.txt # Project dependencies +├── README.md # Project overview and instructions +├── setup.py # Package installation script +├── structure.txt # Structure of the project +└── .gitignore # Git ignore file
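
A note on usage (not part of the patch): the app.py hunk above stops before main()'s query/answer logic, so the following is a minimal, hypothetical sketch of how the refactored modules are meant to be wired together after this change: Chroma retrieval plus Cohere reranking via get_relevant_data(), then generation through the Groq client. The prompt wording and the "llama3-8b-8192" model id are assumptions, not taken from this diff.

import os

from groq import Groq

from src.database.chroma_search_functions import format_context, get_relevant_data


def answer(query: str) -> str:
    # Retrieve the top chunks from Chroma and rerank them with the Cohere rerank API.
    reranked_chunks = get_relevant_data(query)
    context = format_context(reranked_chunks)

    # Generate the final answer with a Llama model served by Groq.
    client = Groq(api_key=os.getenv("GROQ_API_KEY"))
    completion = client.chat.completions.create(
        model="llama3-8b-8192",  # assumed model id; substitute whatever app.py actually uses
        messages=[{
            "role": "user",
            "content": f"Answer the question using only the context below.\n\n{context}\n\nQuestion: {query}",
        }],
    )
    return completion.choices[0].message.content


if __name__ == "__main__":
    print(answer("How does ReAct prompting work?"))

With the Chroma store already built under data/processed/chroma, running `python app.py` should follow the same retrieve, rerank, and generate path.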