Modified the Structure of the project
AnasAber committed Aug 2, 2024
1 parent 1437c26 commit 3ab5597
Showing 29 changed files with 571 additions and 185 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -1,4 +1,5 @@
.env
FlagEmbedding-1.2.10/
rag/
testing.py
t.py
s.txt
12 changes: 12 additions & 0 deletions README.md
@@ -26,6 +26,18 @@ Retrieved chunks are reranked using the Cohere API to ensure the most relevant c
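The reranking step above relies on Cohere's rerank endpoint. A minimal sketch of that call follows; the model name and `top_n` are assumptions, and the exact response shape depends on the SDK version:

```python
import os
import cohere

co = cohere.Client(os.getenv("COHERE_API_KEY"))

query = "What is hybrid search?"
chunks = ["chunk one...", "chunk two...", "chunk three..."]  # retrieved chunks

# Ask Cohere to score each chunk against the query and keep the best 3.
reranked = co.rerank(
    model="rerank-english-v3.0",  # assumed model name
    query=query,
    documents=chunks,
    top_n=3,
)
top_chunks = [chunks[r.index] for r in reranked.results]
```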

The top-ranked chunks are passed to the Llama model (via Groq API) to generate a coherent and relevant response.
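A minimal sketch of that generation call via the Groq SDK (the model id and prompt template here are assumptions, not the project's exact values):

```python
import os
from groq import Groq

client = Groq(api_key=os.getenv("GROQ_API_KEY"))

context = "Chunk 1: ...\n\nChunk 2: ..."  # the reranked chunks, formatted
question = "What is hybrid search?"

# One chat completion against a Llama model hosted on Groq.
response = client.chat.completions.create(
    model="llama3-8b-8192",  # assumed model id
    messages=[{
        "role": "user",
        "content": f"Answer using only this context:\n{context}\n\nQuestion: {question}",
    }],
)
print(response.choices[0].message.content)
```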

### How to start

1. Clone the repository
`git clone https://github.com/AnasAber/RAG_in_CPU.git`
2. Activate the virtual environment:
for Windows: `./rag/Scripts/activate`
for MacOS/Linux: `source rag/bin/activate`
3. Run the `app.py` file

I'm using a virtual environment to avoid dependency conflicts (I had to manually patch a few configuration files) and to make sure the project runs smoothly.


This project's RAG currently uses semantic search with ChromaDB. I plan to combine Hybrid Search with HyDE (sketched below), following the RAG best practices described in this paper: [link](https://arxiv.org/html/2407.01219v1#:~:text=A%20typical%20RAG%20workflow%20usually,based%20on%20their%20relevance%20to)
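A rough sketch of how the planned HyDE step could slot in. This is a hypothetical helper, not code from the repository; it assumes the Groq client and a persisted ChromaDB store as above:

```python
def hyde_search(question, chroma_db, client, k=5):
    # HyDE: draft a hypothetical answer first, then embed *that* instead of
    # the raw question, since answers live closer to the relevant chunks.
    draft = client.chat.completions.create(
        model="llama3-8b-8192",  # assumed model id
        messages=[{"role": "user",
                   "content": f"Write a short passage that answers: {question}"}],
    ).choices[0].message.content
    # ChromaDB embeds the draft with its configured embedding function
    # and returns the nearest real chunks.
    return chroma_db.similarity_search(draft, k=k)
```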

![System Architecture Diagram](images/x1.png)
Binary file modified __pycache__/chroma_search_functions.cpython-310.pyc
Binary file modified __pycache__/get_embeddings.cpython-310.pyc
47 changes: 33 additions & 14 deletions query_data.py → app.py
@@ -1,23 +1,17 @@
import get_embeddings
import chroma_search_functions as csf
import src.data_processing.get_embeddings
from data.process_data import load_documents, embed_and_store_documents, split_documents
from langchain.prompts import ChatPromptTemplate
from transformers import pipeline
from langchain_community.vectorstores import Chroma
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import transformers
from langchain.llms import HuggingFacePipeline
from groq import Groq
import os
from src.database.chroma_search_functions import get_relevant_data

"""
Importing the functions and setting up the environment variables
"""

CHROMA_PATH = "chroma/"
DATA_PATH = "data"

(load_documents, split_documents, embed_and_store_documents, retrieve_documents, get_relevant_data, add_to_chroma_db, get_chroma_db) = csf.main()
DATA_PATH = "data/raw"

client = Groq(
    api_key=os.getenv("GROQ_API_KEY"),
@@ -27,6 +21,11 @@
"""
Again, if we want to load a Hugging Face model and tokenizer, we can do it like this:
from transformers import pipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import transformers
from langchain.llms import HuggingFacePipeline
model_name = "microsoft/Phi-3-mini-4k-instruct"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(
@@ -53,18 +52,38 @@
def format_context(context):
    return "\n\n".join([f"Chunk {i+1}: {chunk}" for i, chunk in enumerate(context)])

def check_and_process_documents():
    path = "data/processed/chroma"
    print(f"Checking if path exists: {path}")

    if not os.path.exists(path):
        print(f"Path does not exist: {path}")

        documents = load_documents()
        print("Documents loaded")

        chunks = split_documents(documents)
        print("Documents split into chunks")

        embed_and_store_documents(chunks)
        print("Documents embedded and stored")
    else:
        print(f"Path already exists: {path}")

def main():
    """
    Loading documents should be performed only once; it will take a bit of time at first.
    You can comment this out once ChromaDB already has the data.
    """
    check_and_process_documents()



Expand Down
154 changes: 0 additions & 154 deletions chroma_search_functions.py

This file was deleted.

Binary file added data/__pycache__/process_data.cpython-310.pyc
53 changes: 53 additions & 0 deletions data/process_data.py
@@ -0,0 +1,53 @@
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain.schema.document import Document
import os
from src.data_processing.get_embeddings import get_embeddings
from dotenv import load_dotenv


load_dotenv()

"""
Initializing the APIs and setting up the environment variables
"""

api_key = os.getenv("COHERE_API_KEY")

CHROMA_PATH = "data/processed/chroma"
DATA_PATH = "data"


def load_documents():
    document_loader = PyPDFDirectoryLoader(DATA_PATH)
    print("Loading documents...")
    return document_loader.load()


def split_documents(documents):
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=500,
        chunk_overlap=100,
        length_function=len,
        is_separator_regex=False,
    )
    docs = []
    print("Splitting documents...")
    for document in documents:
        for chunk in text_splitter.split_text(document.page_content):
            docs.append(Document(page_content=chunk, metadata={"source": document.metadata["source"]}))
    print("Documents split successfully.")
    return docs


def embed_and_store_documents(chunks):
    chroma_db = Chroma(
        persist_directory=CHROMA_PATH, embedding_function=get_embeddings()
    )
    print("Storing documents...")
    # The Chroma instance already carries the persist directory and embedding function.
    chroma_db.add_documents(chunks)
    chroma_db.persist()
    print("Documents stored successfully.")

File renamed without changes.
File renamed without changes.
File renamed without changes.