Skip to content

Commit

Permalink
Updated libraries, fixed breaking changes, and added code for the community
Browse files Browse the repository at this point in the history
  • Loading branch information
Luca Pierpaoli committed Jan 19, 2024
1 parent 7946c07 commit 3c1e703
Show file tree
Hide file tree
Showing 8 changed files with 1,442 additions and 616 deletions.
8 changes: 8 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -164,3 +164,11 @@ data/chroma/863ec92d-3170-4dcc-b3ac-7b99a5b05b5e/data_level0.bin
data/chroma/863ec92d-3170-4dcc-b3ac-7b99a5b05b5e/header.bin
data/chroma/863ec92d-3170-4dcc-b3ac-7b99a5b05b5e/length.bin
data/chroma/863ec92d-3170-4dcc-b3ac-7b99a5b05b5e/link_lists.bin
.env_example
docs/Perizia.pdf
docs/detrazioni.txt
sample-docs/detrazioni-miste.txt
data/chroma/e2c49fd5-6e9e-4f7f-a342-50ec3a0e0cda/data_level0.bin
data/chroma/e2c49fd5-6e9e-4f7f-a342-50ec3a0e0cda/header.bin
data/chroma/e2c49fd5-6e9e-4f7f-a342-50ec3a0e0cda/length.bin
data/chroma/e2c49fd5-6e9e-4f7f-a342-50ec3a0e0cda/link_lists.bin
2 changes: 2 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ azure-identity = "==1.15.0"
chromadb = "*"
llama-index = "*"
traceloop-sdk = "*"
langchain-openai = "*"
sentence-transformers = "*"

[dev-packages]
ipykernel = "*"
Expand Down
1,860 changes: 1,250 additions & 610 deletions Pipfile.lock

Large diffs are not rendered by default.

176 changes: 176 additions & 0 deletions community/import-data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
from langchain.chains import ConversationalRetrievalChain
from langchain_openai import AzureChatOpenAI
from langchain_openai import AzureOpenAIEmbeddings
from langchain_community.vectorstores.chroma import Chroma
from langchain.text_splitter import (
RecursiveCharacterTextSplitter,
SentenceTransformersTokenTextSplitter,
)
from pypdf import PdfReader
from langchain.document_loaders import PyPDFLoader

import os
from dotenv import load_dotenv

load_dotenv()

persist_directory = "data/chroma"
collection_name = "book_eng"

# embedding
embedding = AzureOpenAIEmbeddings(
deployment=os.getenv("OPENAI_EMBEDDING_DEPLOYMENT_NAME"),
chunk_size=1,
embedding_ctx_length=1000,
)

# llm
llm = AzureChatOpenAI(
azure_deployment=os.getenv("OPENAI_CHAT_DEPLOYMENT_NAME"),
openai_api_type="azure",
temperature=0.0,
)

vector_db = Chroma(
collection_name=collection_name,
persist_directory=persist_directory,
embedding_function=embedding,
)


def load_pdf(pdf_path: str):
from io import BytesIO

# Rec char Splitter
character_splitter = RecursiveCharacterTextSplitter(
chunk_size=1000,
chunk_overlap=0,
separators=["\n\n", "\n", ". ", " ", ""],
)

# sentence token splitter
token_splitter = SentenceTransformersTokenTextSplitter(
chunk_overlap=0, tokens_per_chunk=256
)

pdf_bytes = None
with open(pdf_path, "rb") as f:
pdf_bytes = f.read()

doc = PdfReader(BytesIO(pdf_bytes))
docs = []
for page_num in range(len(doc.pages)):
pdf_page = doc.pages[page_num]
pdf_page_text = pdf_page.extract_text()

# skip empty pages
if not pdf_page_text:
continue

# split text
character_split_texts = character_splitter.split_text(pdf_page_text)

token_split_texts = []
for text in character_split_texts:
token_split_texts += token_splitter.split_text(text)

# create metadata from token split
page_nr = int(page_num + 1)

# set metadata for each split
metadatas = [{"source": pdf_path, "page": page_nr} for _ in token_split_texts]

# convert to document
documents = character_splitter.create_documents(
texts=token_split_texts, metadatas=metadatas
)

docs.extend(documents)

db = Chroma.from_documents(
collection_name=collection_name,
documents=docs,
embedding=embedding,
persist_directory=persist_directory,
)


# just for testing
def get_relevant_documents(question: str):
docs = vector_db.similarity_search(question)
return docs


def ask(
query: str,
) -> str:
"""Ask to LLM and return the answer"""
from langchain.retrievers import MultiQueryRetriever
from langchain.prompts import PromptTemplate

# Continue with the question using doc retriever
# get the prompt
final_prompt = """
You are a friendly helpful assistant to help and maintain polite conversation.
Your users are asking questions about information retrieved from a book.
Answer the user's question using only these information.
Remember to be polite and friendly.
Context:
{context}
Question:
{question}
Answer:
"""

PROMPT = PromptTemplate(
template=final_prompt,
input_variables=["context", "question"],
)

# see on langsmith
retriever = MultiQueryRetriever.from_llm(
vector_db.as_retriever(
search_type="mmr",
search_kwargs={"k": 3, "score_threshold": 0.9},
),
llm=llm,
)

chain_type_kwargs = {"prompt": PROMPT}
qa = ConversationalRetrievalChain.from_llm(
llm,
retriever,
return_source_documents=True,
verbose=True,
combine_docs_chain_kwargs=chain_type_kwargs,
rephrase_question=False,
)

llm_result = qa.invoke({"question": query, "chat_history": []})
return llm_result


if __name__ == "__main__":
### code examples based on https://amzn.eu/d/5LZBc6p
import logging

# FOR LOGGING MULTIQUERY
logging.basicConfig()
logging.getLogger("langchain.retrievers.multi_query").setLevel(logging.INFO)

# LOAD PDF
pdf_path = os.getenv("FILE_PATH_ENG")
print(pdf_path)

# load_pdf(pdf_path)
# print("pdf loaded")

# result = get_relevant_documents("What's the name of the singer?")
# print(result)

# result = ask("What's the name of the singer?")
result = ask("What is the instrument played by Fred?")
print(result["answer"])
Empty file added community/ragas-evaluation.py
Empty file.
4 changes: 2 additions & 2 deletions lcel/rag-sample.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@

def _get_retriever():
vectorstore = Chroma.from_texts(
["Luca is passionate about LLMs and GenerativeAi"], embedding=embedding
["Sam is passionate about LLMs and GenerativeAi"], embedding=embedding
)
retriever = vectorstore.as_retriever()
return retriever
Expand Down Expand Up @@ -87,7 +87,7 @@ def rag_with_different_inputs(question: str, style: str):


if __name__ == "__main__":
question = "What is Luca passionate about?"
question = "What is Sam passionate about?"

# result = simple_rag(question=question)
style = "Heavy Metal Fan"
Expand Down
4 changes: 2 additions & 2 deletions rag/rag-lama.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,8 +39,8 @@
documents = [Document(text=file_content)]


api_key = os.getenv("OPENAI_API_KEY")
azure_endpoint = os.getenv("OPENAI_API_BASE")
api_key = os.getenv("AZURE_OPENAI_API_KEY")
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
api_version = os.getenv("OPENAI_API_VERSION")

llm = AzureOpenAI(
Expand Down
4 changes: 2 additions & 2 deletions rag/rag-llama-window-retrieval.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,8 @@


### THE LLM
api_key = os.getenv("OPENAI_API_KEY")
azure_endpoint = os.getenv("OPENAI_API_BASE")
api_key = os.getenv("AZURE_OPENAI_API_KEY")
azure_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
api_version = os.getenv("OPENAI_API_VERSION")

llm = AzureOpenAI(
Expand Down

0 comments on commit 3c1e703

Please sign in to comment.