diff --git a/.gitignore b/.gitignore index c4c8c5c..eb3abb3 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ .env FlagEmbedding-1.2.10/ -rag/ testing.py +t.py +s.txt diff --git a/README.md b/README.md index 244b466..6104d46 100644 --- a/README.md +++ b/README.md @@ -26,6 +26,18 @@ Retrieved chunks are reranked using the Cohere API to ensure the most relevant c The top-ranked chunks are passed to the Llama model (via Groq API) to generate a coherent and relevant response. +### How to start + +1. Clone the repository + `git clone https://github.com/AnasAber/RAG_in_CPU.git` +2. Activate the virtual environment: + For Windows: `./rag/Scripts/activate` + For macOS/Linux: `source venv/bin/activate` +3. Run the `app.py` file + +I'm using a virtual environment to avoid conflicts with the dependencies (I had to manually change things in the configuration files) and to make sure the project runs smoothly. + + This project's RAG uses semantic search using ChromaDB, I'll work on doing a combination of Hybrid Search and a HyDE following the best practices of RAG mentioned in the following paper: [link](https://arxiv.org/html/2407.01219v1#:~:text=A%20typical%20RAG%20workflow%20usually,based%20on%20their%20relevance%20to) ![System Architecture Diagram](images/x1.png) diff --git a/__pycache__/chroma_search_functions.cpython-310.pyc b/__pycache__/chroma_search_functions.cpython-310.pyc index 5c91055..66abf60 100644 Binary files a/__pycache__/chroma_search_functions.cpython-310.pyc and b/__pycache__/chroma_search_functions.cpython-310.pyc differ diff --git a/__pycache__/get_embeddings.cpython-310.pyc b/__pycache__/get_embeddings.cpython-310.pyc index 8d2eb50..4d4ff5e 100644 Binary files a/__pycache__/get_embeddings.cpython-310.pyc and b/__pycache__/get_embeddings.cpython-310.pyc differ diff --git a/query_data.py b/app.py similarity index 79% rename from query_data.py rename to app.py index 8cdc067..9fd454a 100644 --- a/query_data.py +++ b/app.py @@ -1,13 +1,9 @@ -import get_embeddings -import chroma_search_functions as csf +import src.data_processing.get_embeddings +from data.process_data import load_documents, embed_and_store_documents, split_documents from langchain.prompts import ChatPromptTemplate -from transformers import pipeline -from langchain_community.vectorstores import Chroma -from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig -import transformers -from langchain.llms import HuggingFacePipeline from groq import Groq import os +from src.database.chroma_search_functions import get_relevant_data """ Importing the functions and setting up the environment variables @@ -15,9 +11,7 @@ """ CHROMA_PATH = "chroma/" -DATA_PATH = "data" - -(load_documents, split_documents, embed_and_store_documents, retrieve_documents, get_relevant_data, add_to_chroma_db, get_chroma_db) = csf.main() +DATA_PATH = "data/raw" client = Groq( api_key=os.getenv("GROQ_API_KEY"), @@ -27,6 +21,11 @@ """ Again, if we want to load a huggingFace model and tokenizer, we can do it like this: +from transformers import pipeline +from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig +import transformers +from langchain.llms import HuggingFacePipeline + model_name = "microsoft/Phi-3-mini-4k-instruct" tokenizer = AutoTokenizer.from_pretrained(model_name) model = AutoModelForCausalLM.from_pretrained( @@ -53,6 +52,24 @@ def format_context(context): return "\n\n".join([f"Chunk {i+1}: {chunk}" for i, chunk in enumerate(context)]) +def
check_and_process_documents(): + path = "data/processed/chroma" + print(f"Checking if path exists: {path}") + + if not os.path.exists(path): + print(f"Path does not exist: {path}") + + documents = load_documents() + print("Documents loaded") + + chunks = split_documents(documents) + print("Documents split into chunks") + + embed_and_store_documents(chunks) + print("Documents embedded and stored") + else: + print(f"Path already exists: {path}") + def main(): """ @@ -60,11 +77,13 @@ def main(): You can comment them out as chromaDB has the infos already """ + check_and_process_documents() - documents = load_documents() - chunks = split_documents(documents) - embed_and_store_documents(chunks) - print("Documents loaded, split, and stored") + if not os.path.exists("data/processed/chroma"): + documents = load_documents() + chunks = split_documents(documents) + embed_and_store_documents(chunks) + print("Documents loaded, split, and stored") diff --git a/chroma_search_functions.py b/chroma_search_functions.py deleted file mode 100644 index 3f3a680..0000000 --- a/chroma_search_functions.py +++ /dev/null @@ -1,154 +0,0 @@ -from langchain_community.vectorstores import Chroma -from langchain.text_splitter import RecursiveCharacterTextSplitter -from langchain_community.document_loaders import PyPDFDirectoryLoader -from langchain.schema.document import Document -# from FlagEmbedding.flag_models import FlagModel -# from FlagEmbedding.flag_reranker import FlagReranker -from get_embeddings import get_embeddings -import uuid -import os -from dotenv import load_dotenv -import cohere - -load_dotenv() - -""" - Initializating the APIs and setting up the environment variables - -""" - -api_key = os.getenv("COHERE_API_KEY") - -CHROMA_PATH = "chroma" -DATA_PATH = "data" - -# init client -co = cohere.Client(api_key=api_key) - - - - -# load the data -def get_chroma_db(get_embeddings=get_embeddings): - return Chroma(persist_directory=CHROMA_PATH, embedding_function=get_embeddings()) - - -def main(): - - - def load_documents(): - document_loader = PyPDFDirectoryLoader("data/") - print("Loading documents...") - return document_loader.load() - - - def split_documents(documents): - text_splitter = RecursiveCharacterTextSplitter( - chunk_size=500, - chunk_overlap=100, - length_function=len, - is_separator_regex=False, - ) - docs = [] - print("Splitting documents...") - for document in documents: - for chunk in text_splitter.split_text(document.page_content): - docs.append(Document(page_content=chunk, metadata={"source": document.metadata["source"]})) - print("Documents split successfully.") - return docs - - - def embed_and_store_documents(chunks): - chroma_db = Chroma( - persist_directory=CHROMA_PATH, embedding_function=get_embeddings() - ) - print("Storing documents...") - chroma_db.add_documents(chunks, persist_directory=CHROMA_PATH,embeddings=get_embeddings()) - chroma_db.persist() - print("Documents stored successfully.") - - - - def retrieve_documents(query, top_k=5): - chroma_db = get_chroma_db() - print("#"*100 + "\n\n") - - print("Retrieving documents...") - results = chroma_db.similarity_search_with_score(query, top_k) - context_text= "\n\n---\n\n".join([doc.page_content for doc, _score in results]) - - print("Documents before reranking: ", context_text) - - return context_text - - - """ - If you want to use the FlagReranker to rerank the retrieved documents, you can use the following code snippet: - - reranker = FlagModel("BAAI/bge-reranker-v2-m3", use_fp16=True) - - def reranked_documents(query, 
retrieved_chunks, top_k=3): - reranked_chunks = reranker.predict(query, retrieved_chunks) - return [chunk for chunk, _ in reranked_chunks[:top_k]] - - Initialize the FlagReranker - reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True) - - - I'll personally use the cohere API to rerank the documents. - """ - - - def format_context(context): - return "\n\n".join([f"Chunk {i+1}: {chunk}" for i, chunk in enumerate(context)]) - - - def reranked_documents(query, long_string, top_k=3): - # Split the long string into individual chunks using '\n\n---\n\n' as the separator - chunks = long_string.split("\n\n---\n\n") - - # Ensure all chunks are valid (non-empty) and strip leading/trailing whitespace - valid_chunks = [chunk.strip() for chunk in chunks if chunk.strip()] - - if not valid_chunks: - print("No valid chunks to rerank.") - return [] - - # Use the cohere rerank API - rerank_docs = co.rerank( - query=query, - documents=valid_chunks, - top_n=top_k, - model="rerank-english-v2.0" - ) - - print("#"*100 + "\n\n") - # Extract and print reranked chunks using the indices from the rerank response - reranked_chunks = [valid_chunks[result.index] for result in rerank_docs.results] - print("Reranked Chunks:\n\n", format_context(reranked_chunks)) - - return reranked_chunks - - - def get_relevant_data(query): - retrieved_chunks = retrieve_documents(query) - reranked_chunks = reranked_documents(query, retrieved_chunks) - return reranked_chunks - - - - def add_to_chroma_db(reranked_chunks): - chroma_db = get_chroma_db() - chroma_db.add_documents(reranked_chunks) - chroma_db.persist() - - - return load_documents, split_documents, embed_and_store_documents, retrieve_documents, get_relevant_data, add_to_chroma_db, get_chroma_db - - -if __name__ == "__main__": - main() - - - - diff --git a/data/__pycache__/process_data.cpython-310.pyc b/data/__pycache__/process_data.cpython-310.pyc new file mode 100644 index 0000000..b79cb65 Binary files /dev/null and b/data/__pycache__/process_data.cpython-310.pyc differ diff --git a/data/process_data.py b/data/process_data.py new file mode 100644 index 0000000..5c4b010 --- /dev/null +++ b/data/process_data.py @@ -0,0 +1,53 @@ +from langchain_community.vectorstores import Chroma +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.document_loaders import PyPDFDirectoryLoader +from langchain.schema.document import Document +import os +from src.data_processing.get_embeddings import get_embeddings +from dotenv import load_dotenv + + +load_dotenv() + +""" + Initializating the APIs and setting up the environment variables + +""" + +api_key = os.getenv("COHERE_API_KEY") + +CHROMA_PATH = "data/processed/chroma" +DATA_PATH = "data" + + +def load_documents(): + document_loader = PyPDFDirectoryLoader("data/") + print("Loading documents...") + return document_loader.load() + + +def split_documents(documents): + text_splitter = RecursiveCharacterTextSplitter( + chunk_size=500, + chunk_overlap=100, + length_function=len, + is_separator_regex=False, + ) + docs = [] + print("Splitting documents...") + for document in documents: + for chunk in text_splitter.split_text(document.page_content): + docs.append(Document(page_content=chunk, metadata={"source": document.metadata["source"]})) + print("Documents split successfully.") + return docs + + +def embed_and_store_documents(chunks): + chroma_db = Chroma( + persist_directory=CHROMA_PATH, embedding_function=get_embeddings() + ) + print("Storing documents...") + chroma_db.add_documents(chunks, 
persist_directory=CHROMA_PATH,embeddings=get_embeddings()) + chroma_db.persist() + print("Documents stored successfully.") + diff --git a/chroma/1bf717b4-2400-4375-8363-fca32070ff78/data_level0.bin b/data/processed/chroma/be23e90f-70b1-487f-afee-13fec0ce30f7/data_level0.bin similarity index 100% rename from chroma/1bf717b4-2400-4375-8363-fca32070ff78/data_level0.bin rename to data/processed/chroma/be23e90f-70b1-487f-afee-13fec0ce30f7/data_level0.bin diff --git a/chroma/1bf717b4-2400-4375-8363-fca32070ff78/header.bin b/data/processed/chroma/be23e90f-70b1-487f-afee-13fec0ce30f7/header.bin similarity index 100% rename from chroma/1bf717b4-2400-4375-8363-fca32070ff78/header.bin rename to data/processed/chroma/be23e90f-70b1-487f-afee-13fec0ce30f7/header.bin diff --git a/chroma/1bf717b4-2400-4375-8363-fca32070ff78/length.bin b/data/processed/chroma/be23e90f-70b1-487f-afee-13fec0ce30f7/length.bin similarity index 98% rename from chroma/1bf717b4-2400-4375-8363-fca32070ff78/length.bin rename to data/processed/chroma/be23e90f-70b1-487f-afee-13fec0ce30f7/length.bin index 1dc89f8..f5fee2d 100644 Binary files a/chroma/1bf717b4-2400-4375-8363-fca32070ff78/length.bin and b/data/processed/chroma/be23e90f-70b1-487f-afee-13fec0ce30f7/length.bin differ diff --git a/chroma/1bf717b4-2400-4375-8363-fca32070ff78/link_lists.bin b/data/processed/chroma/be23e90f-70b1-487f-afee-13fec0ce30f7/link_lists.bin similarity index 100% rename from chroma/1bf717b4-2400-4375-8363-fca32070ff78/link_lists.bin rename to data/processed/chroma/be23e90f-70b1-487f-afee-13fec0ce30f7/link_lists.bin diff --git a/chroma/chroma.sqlite3 b/data/processed/chroma/chroma.sqlite3 similarity index 85% rename from chroma/chroma.sqlite3 rename to data/processed/chroma/chroma.sqlite3 index 8c82ad4..551ecd7 100644 Binary files a/chroma/chroma.sqlite3 and b/data/processed/chroma/chroma.sqlite3 differ diff --git a/data/ReAct for LLMs.pdf b/data/raw/ReAct for LLMs.pdf similarity index 100% rename from data/ReAct for LLMs.pdf rename to data/raw/ReAct for LLMs.pdf diff --git a/data/generative ai.pdf b/data/raw/generative ai.pdf similarity index 100% rename from data/generative ai.pdf rename to data/raw/generative ai.pdf diff --git a/data/monopoly.pdf b/data/raw/monopoly.pdf similarity index 100% rename from data/monopoly.pdf rename to data/raw/monopoly.pdf diff --git a/requirements.txt b/requirements.txt index d1aad4c..6eb2b21 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,17 +1,163 @@ -pypdf +accelerate +aiohttp +annotated-types +anyio +asgiref +async-timeout +attrs +backoff +bcrypt +boto3 +botocore +build +cachetools +certifi +charset-normalizer +chroma-hnswlib +chromadb +click +cohere +colorama +coloredlogs +dataclasses-json +datasets +Deprecated +dill +distro +dnspython +einops +email_validator +exceptiongroup +fastapi +fastapi-cli +fastavro +filelock +FlagEmbedding +flatbuffers +frozenlist +fsspec +google-auth +googleapis-common-protos +greenlet +groq +grpcio +h11 +httpcore +httptools +httpx +httpx-sse +huggingface-hub +humanfriendly +idna +importlib_metadata +importlib_resources +iniconfig +intel-openmp +Jinja2 +jmespath +joblib +jsonpatch +jsonpointer +kubernetes langchain langchain-community -chromadb +langchain-core +langchain-text-splitters +langsmith +markdown-it-py +MarkupSafe +marshmallow +mdurl +mkl +mmh3 +monotonic +mpmath +multidict +multiprocess +mypy-extensions +networkx numpy -scikit-learn +oauthlib +onnx +onnxruntime +opentelemetry-api +opentelemetry-exporter-otlp-proto-common +opentelemetry-exporter-otlp-proto-grpc 
+opentelemetry-instrumentation +opentelemetry-instrumentation-asgi +opentelemetry-instrumentation-fastapi +opentelemetry-proto +opentelemetry-sdk +opentelemetry-semantic-conventions +opentelemetry-util-http +orjson +overrides +packaging +pandas +parameterized +pillow +pip +pluggy +posthog +protobuf +psutil +pyarrow +pyarrow-hotfix +pyasn1 +pyasn1_modules +pydantic +pydantic_core +Pygments +pypdf +PyPika +pyproject_hooks +pyreadline3 pytest +python-dateutil python-dotenv -transformers -FlagEmbedding -einops -joblib +python-multipart +pytz +PyYAML +regex +requests +requests-oauthlib +rich +rsa +s3transfer +safetensors +scikit-learn +scipy +sentence-transformers +setuptools +shellingham +six +sniffio +SQLAlchemy +starlette +sympy +tbb +tenacity +threadpoolctl +tokenizers +tomli torch -onnx -onnxruntime -cohere -groq \ No newline at end of file +torchaudio +torchvision +tqdm +transformers +typer +types-requests +typing_extensions +typing-inspect +tzdata +ujson +urllib3 +uvicorn +watchfiles +websocket-client +websockets +wheel +wrapt +xxhash +yarl +zipp diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..42ba147 --- /dev/null +++ b/setup.py @@ -0,0 +1,174 @@ +from setuptools import setup, find_packages + +setup( + name='RAG_for_CPU', + version='1.0', + author='Anas Aberchih', + author_email='anas.aberchih1@gmail.com', + packages=find_packages(), + install_requires=[ + 'accelerate', + 'aiohttp', + 'annotated-types', + 'anyio', + 'asgiref', + 'async-timeout', + 'attrs', + 'backoff', + 'bcrypt', + 'boto3', + 'botocore', + 'build', + 'cachetools', + 'certifi', + 'charset-normalizer', + 'chroma-hnswlib', + 'chromadb', + 'click', + 'cohere', + 'colorama', + 'coloredlogs', + 'dataclasses-json', + 'datasets', + 'Deprecated', + 'dill', + 'distro', + 'dnspython', + 'einops', + 'email_validator', + 'exceptiongroup', + 'fastapi', + 'fastapi-cli', + 'fastavro', + 'filelock', + 'FlagEmbedding', + 'flatbuffers', + 'frozenlist', + 'fsspec', + 'google-auth', + 'googleapis-common-protos', + 'greenlet', + 'groq', + 'grpcio', + 'h11', + 'httpcore', + 'httptools', + 'httpx', + 'httpx-sse', + 'huggingface-hub', + 'humanfriendly', + 'idna', + 'importlib_metadata', + 'importlib_resources', + 'iniconfig', + 'intel-openmp', + 'Jinja2', + 'jmespath', + 'joblib', + 'jsonpatch', + 'jsonpointer', + 'kubernetes', + 'langchain', + 'langchain-community', + 'langchain-core', + 'langchain-text-splitters', + 'langsmith', + 'markdown-it-py', + 'MarkupSafe', + 'marshmallow', + 'mdurl', + 'mkl', + 'mmh3', + 'monotonic', + 'mpmath', + 'multidict', + 'multiprocess', + 'mypy-extensions', + 'networkx', + 'numpy', + 'oauthlib', + 'onnx', + 'onnxruntime', + 'opentelemetry-api', + 'opentelemetry-exporter-otlp-proto-common', + 'opentelemetry-exporter-otlp-proto-grpc', + 'opentelemetry-instrumentation', + 'opentelemetry-instrumentation-asgi', + 'opentelemetry-instrumentation-fastapi', + 'opentelemetry-proto', + 'opentelemetry-sdk', + 'opentelemetry-semantic-conventions', + 'opentelemetry-util-http', + 'orjson', + 'overrides', + 'packaging', + 'pandas', + 'parameterized', + 'pillow', + 'pip', + 'pluggy', + 'posthog', + 'protobuf', + 'psutil', + 'pyarrow', + 'pyarrow-hotfix', + 'pyasn1', + 'pyasn1_modules', + 'pydantic', + 'pydantic_core', + 'Pygments', + 'pypdf', + 'PyPika', + 'pyproject_hooks', + 'pyreadline3', + 'pytest', + 'python-dateutil', + 'python-dotenv', + 'python-multipart', + 'pytz', + 'PyYAML', + 'regex', + 'requests', + 'requests-oauthlib', + 'rich', + 'rsa', + 's3transfer', + 'safetensors', + 'scikit-learn', + 
'scipy', + 'sentence-transformers', + 'setuptools', + 'shellingham', + 'six', + 'sniffio', + 'SQLAlchemy', + 'starlette', + 'sympy', + 'tbb', + 'tenacity', + 'threadpoolctl', + 'tokenizers', + 'tomli', + 'torch', + 'torchaudio', + 'torchvision', + 'tqdm', + 'transformers', + 'typer', + 'types-requests', + 'typing_extensions', + 'typing-inspect', + 'tzdata', + 'ujson', + 'urllib3', + 'uvicorn', + 'watchfiles', + 'websocket-client', + 'websockets', + 'wheel', + 'wrapt', + 'xxhash', + 'yarl', + 'zipp', + ], +) diff --git a/src/data_processing/__init__.py b/src/data_processing/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/data_processing/__pycache__/__init__.cpython-310.pyc b/src/data_processing/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000..8c5dbd5 Binary files /dev/null and b/src/data_processing/__pycache__/__init__.cpython-310.pyc differ diff --git a/src/data_processing/__pycache__/get_embeddings.cpython-310.pyc b/src/data_processing/__pycache__/get_embeddings.cpython-310.pyc new file mode 100644 index 0000000..f3788d2 Binary files /dev/null and b/src/data_processing/__pycache__/get_embeddings.cpython-310.pyc differ diff --git a/get_embeddings.py b/src/data_processing/get_embeddings.py similarity index 94% rename from get_embeddings.py rename to src/data_processing/get_embeddings.py index 6657139..916df65 100644 --- a/get_embeddings.py +++ b/src/data_processing/get_embeddings.py @@ -33,10 +33,5 @@ def get_embeddings(text=None): embeddings = HuggingFaceEmbeddings(model_name="nomic-ai/nomic-embed-text-v1") return embeddings -__all__ = ['get_embeddings', 'get_embeddings_query'] - - - - # (1, 8, 1024) diff --git a/src/database/__pycache__/chroma_search_functions.cpython-310.pyc b/src/database/__pycache__/chroma_search_functions.cpython-310.pyc new file mode 100644 index 0000000..f675943 Binary files /dev/null and b/src/database/__pycache__/chroma_search_functions.cpython-310.pyc differ diff --git a/src/database/chroma_search_functions.py b/src/database/chroma_search_functions.py new file mode 100644 index 0000000..6a887d4 --- /dev/null +++ b/src/database/chroma_search_functions.py @@ -0,0 +1,114 @@ +from langchain_community.vectorstores import Chroma +from langchain.text_splitter import RecursiveCharacterTextSplitter +from langchain_community.document_loaders import PyPDFDirectoryLoader +from langchain.schema.document import Document +# from FlagEmbedding.flag_models import FlagModel +# from FlagEmbedding.flag_reranker import FlagReranker +from src.data_processing.get_embeddings import get_embeddings +import uuid +import os +from dotenv import load_dotenv +import cohere + +load_dotenv() + +""" + Initializating the APIs and setting up the environment variables + +""" + +api_key = os.getenv("COHERE_API_KEY") + +CHROMA_PATH = "data/processed/chroma" + +# init client +co = cohere.Client(api_key) + + + + +# load the data +def get_chroma_db(get_embeddings=get_embeddings): + return Chroma(persist_directory=CHROMA_PATH, embedding_function=get_embeddings()) + + + +def retrieve_documents(query, top_k=5): + chroma_db = get_chroma_db() + print("#"*100 + "\n\n") + + print("Retrieving documents...") + results = chroma_db.similarity_search_with_score(query, top_k) + context_text= "\n\n---\n\n".join([doc.page_content for doc, _score in results]) + + print("Documents before reranking: ", context_text) + + return context_text + + +""" +If you want to use the FlagReranker to rerank the retrieved documents, you can use the following code snippet: + + reranker = 
FlagModel("BAAI/bge-reranker-v2-m3", use_fp16=True) + + def reranked_documents(query, retrieved_chunks, top_k=3): + reranked_chunks = reranker.predict(query, retrieved_chunks) + return [chunk for chunk, _ in reranked_chunks[:top_k]] + + Initialize the FlagReranker + reranker = FlagReranker('BAAI/bge-reranker-v2-m3', use_fp16=True) + + +I'll personally use the cohere API to rerank the documents. +""" + + +def format_context(context): + return "\n\n".join([f"Chunk {i+1}: {chunk}" for i, chunk in enumerate(context)]) + + +def reranked_documents(query, long_string, top_k=3): + # Split the long string into individual chunks using '\n\n---\n\n' as the separator + chunks = long_string.split("\n\n---\n\n") + + # Ensure all chunks are valid (non-empty) and strip leading/trailing whitespace + valid_chunks = [chunk.strip() for chunk in chunks if chunk.strip()] + + if not valid_chunks: + print("No valid chunks to rerank.") + return [] + + # Use the cohere rerank API + rerank_docs = co.rerank( + query=query, + documents=valid_chunks, + top_n=top_k, + model="rerank-english-v2.0" + ) + + print("#"*100 + "\n\n") + # Extract and print reranked chunks using the indices from the rerank response + reranked_chunks = [valid_chunks[result.index] for result in rerank_docs.results] + print("Reranked Chunks:\n\n", format_context(reranked_chunks)) + + return reranked_chunks + + +def get_relevant_data(query): + retrieved_chunks = retrieve_documents(query) + reranked_chunks = reranked_documents(query, retrieved_chunks) + return reranked_chunks + + + +def add_to_chroma_db(reranked_chunks): + chroma_db = get_chroma_db() + chroma_db.add_documents(reranked_chunks) + chroma_db.persist() + + + + + + + diff --git a/images/RAG_in_CPU.gif b/src/images/RAG_in_CPU.gif similarity index 100% rename from images/RAG_in_CPU.gif rename to src/images/RAG_in_CPU.gif diff --git a/images/x1.png b/src/images/x1.png similarity index 100% rename from images/x1.png rename to src/images/x1.png diff --git a/src/processed/chroma/chroma.sqlite3 b/src/processed/chroma/chroma.sqlite3 new file mode 100644 index 0000000..87d0279 Binary files /dev/null and b/src/processed/chroma/chroma.sqlite3 differ diff --git a/embedding_model_choice.py b/src/tests/embedding_model_choice.py similarity index 100% rename from embedding_model_choice.py rename to src/tests/embedding_model_choice.py diff --git a/structure.txt b/structure.txt new file mode 100644 index 0000000..99747c7 --- /dev/null +++ b/structure.txt @@ -0,0 +1,26 @@ +NLP/ +├── data/ +│ ├── processed/ # Processed data and embeddings +│ ├── raw/ # Raw data +│ ├── process_data.py # Functions to process data +├── src/ +│ ├── __init__.py +│ ├── data_processing/ +│ │ ├── __init__.py +│ │ ├── get_embeddings.py # Functions to generate embeddings +│ ├── database/ +│ │ ├── __init__.py +│ │ ├── chroma_search_functions.py # ChromaDB functions to retrieve responses +│ ├── models/ +│ │ ├── __init__.py +│ │ ├── models.py # All the declared models (❌) +│ │ ├── main_reasoning.py # Main project (Not yet set up ❌) +│ ├── app.py # Main application logic +├── tests/ +│ ├── __init__.py +│ ├── embedding_model_choice.py # Tests for choosing the right embedding model +├── requirements.txt # Project dependencies +├── README.md # Project overview and instructions +├── setup.py # Package installation script +├── structure.txt # Structure of the project +└── .gitignore # Git ignore file
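
A note on usage (not part of the patch): the app.py hunk above stops before main()'s query/answer logic, so the following is a minimal, hypothetical sketch of how the refactored modules are meant to be wired together after this change: Chroma retrieval plus Cohere reranking via get_relevant_data(), then generation through the Groq client. The prompt wording and the "llama3-8b-8192" model id are assumptions, not taken from this diff.

import os

from groq import Groq

from src.database.chroma_search_functions import format_context, get_relevant_data


def answer(query: str) -> str:
    # Retrieve the top chunks from Chroma and rerank them with the Cohere rerank API.
    reranked_chunks = get_relevant_data(query)
    context = format_context(reranked_chunks)

    # Generate the final answer with a Llama model served by Groq.
    client = Groq(api_key=os.getenv("GROQ_API_KEY"))
    completion = client.chat.completions.create(
        model="llama3-8b-8192",  # assumed model id; substitute whatever app.py actually uses
        messages=[{
            "role": "user",
            "content": f"Answer the question using only the context below.\n\n{context}\n\nQuestion: {query}",
        }],
    )
    return completion.choices[0].message.content


if __name__ == "__main__":
    print(answer("How does ReAct prompting work?"))

With the Chroma store already built under data/processed/chroma, running `python app.py` should follow the same retrieve, rerank, and generate path.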