RAGAS and LLM as a Judge evaluation (#256)
* added notebook that showcases a copilot application using NIM, an embedding model, and a FAISS vector DB. All of these tools are integrated using LangChain plugins. The notebook also has a Gradio-based UI to interact with the application

* added images for the notebook and made minor tweaks to it

* typo correction

* Update langchain_copilot_with_NIM_HF_FAISS_deployed_locally.ipynb

* Update langchain_copilot_with_NIM_HF_FAISS_deployed_locally.ipynb

corrected typo

* file management: moved notebooks under llm_video_series

* changed one dependency

* added image files for notebook

* removed images not used

* removed auto generated folders and README.md

* added RAGAS evaluation. (1) Added changes to save the top-n context pulled by the retriever, which is needed for downstream RAGAS evaluation. (2) Modified the UI to pick which evaluation to use from a drop-down

* Added LLM as a judge to the evaluation

* added LLM-as-a-judge evaluation, an argument to read only PDF files from the folder, and RAGAS evaluation

---------

Co-authored-by: Jay Rodge <[email protected]>
patelmiteshn and jayrodge authored Dec 16, 2024
1 parent 7732841 commit e8f0254
Showing 5 changed files with 219 additions and 27 deletions.
159 changes: 146 additions & 13 deletions community/knowledge_graph_rag/backend/routers/evaluation.py
@@ -21,13 +21,13 @@
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from concurrent.futures import ThreadPoolExecutor
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain.chains import GraphQAChain
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
from langchain.chains import GraphQAChain, LLMChain
from vectorstore.search import SearchHandler
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.graphs.networkx_graph import NetworkxEntityGraph
from utils.preprocessor import generate_qa_pair
from utils.preprocessor import generate_qa_pair, judge_prompt_template
from utils.lc_graph import process_documents, save_triples_to_csvs
from llama_index.core import SimpleDirectoryReader
from openai import OpenAI
@@ -36,7 +36,15 @@
import json
import time
import logging
import os
import re

from datasets import Dataset
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from ragas import evaluate
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@@ -96,16 +104,19 @@ def process_question(question, answer, llm):
future_graph = executor.submit(get_graph_RAG_response, question, llm)
future_combined = executor.submit(get_combined_RAG_response, question, llm)

text_RAG_response = future_text.result()
graph_RAG_response = future_graph.result()
combined_RAG_response = future_combined.result()
text_RAG_response, text_RAG_context_response = future_text.result()
graph_RAG_response, graph_RAG_context_response = future_graph.result()
combined_RAG_response, combined_RAG_context_response = future_combined.result()

return {
"question": question,
"gt_answer": answer,
"textRAG_answer": text_RAG_response,
"graphRAG_answer": graph_RAG_response,
"combined_answer": combined_RAG_response
"combined_answer": combined_RAG_response,
"text_RAG_context_response": text_RAG_context_response,
"graph_RAG_context_response": graph_RAG_context_response,
"combined_RAG_context_response": combined_RAG_context_response
}

prompt_template = ChatPromptTemplate.from_messages(
@@ -122,8 +133,11 @@ def get_text_RAG_response(question, llm):
search_handler = SearchHandler("hybrid_demo3", use_bge_m3=True, use_reranker=True)
res = search_handler.search_and_rerank(question, k=5)
context = "Here are the relevant passages from the knowledge base: \n\n" + "\n".join(item.text for item in res)
context_return = []
if res:
context_return = [item.text for item in res]
answer = chain.invoke("Context: " + context + "\n\nUser query: " + question)
return answer
return answer, context_return

def get_graph_RAG_response(question, llm):
chain = prompt_template | llm | StrOutputParser()
@@ -141,10 +155,15 @@ def get_graph_RAG_response(question, llm):
for entity in entities:
all_triplets.extend(graph.get_entity_knowledge(entity, depth=2))
context = "Here are the relationships from the knowledge graph: " + "\n".join(all_triplets)
context_return = []
if all_triplets:
context_return = [trip for trip in all_triplets]
else:
context_return = ["no relationship found"]
except:
context = "No graph triples were available to extract from the knowledge graph. Always provide a disclaimer if you know the answer to the user's question, since it is not grounded in the knowledge you are provided from the graph."
answer = chain.invoke("Context: " + context + "\n\nUser query: " + question)
return answer
return answer, context_return

def get_combined_RAG_response(question, llm):
chain = prompt_template | llm | StrOutputParser()
@@ -158,14 +177,21 @@ def get_combined_RAG_response(question, llm):
search_handler = SearchHandler("hybrid_demo3", use_bge_m3=True, use_reranker=True)
res = search_handler.search_and_rerank(question, k=5)
context = "Here are the relevant passages from the knowledge base: \n\n" + "\n".join(item.text for item in res)
context_return = []
if res:
context_return = [item.text for item in res]

all_triplets = []
for entity in entities:
all_triplets.extend(graph.get_entity_knowledge(entity, depth=2))
context += "\n\nHere are the relationships from the knowledge graph: " + "\n".join(all_triplets)
if all_triplets:
for trip in all_triplets:
context_return.append(trip)
except Exception as e:
context = "No graph triples were available to extract from the knowledge graph. Always provide a disclaimer if you know the answer to the user's question, since it is not grounded in the knowledge you are provided from the graph."
answer = chain.invoke("Context: " + context + "\n\nUser query: " + question)
return answer
return answer, context_return

@router.post("/process-documents/")
async def process_documents_endpoint(request: ProcessRequest, background_tasks: BackgroundTasks):
@@ -288,7 +314,6 @@ async def score_generator():
res_textRAG = get_reward_scores(row["question"], row["textRAG_answer"])
res_graphRAG = get_reward_scores(row["question"], row["graphRAG_answer"])
res_combinedRAG = get_reward_scores(row["question"], row["combined_answer"])

for score_type, res in zip(score_columns, [res_gt, res_textRAG, res_graphRAG, res_combinedRAG]):
if res:
for metric in metrics:
@@ -300,4 +325,112 @@

return StreamingResponse(score_generator(), media_type="text/event-stream")


def get_RAGAS_evaluation(question, rag_answer, context, gt_answer, llm, embeddings, metrics):

list_items = context.split(',')
list_items = [item.strip() for item in list_items]
d_eval = {
"question": [question],
"answer": [rag_answer],
"contexts": [list_items],
"ground_truth": [gt_answer]
}
d_eval_dataset = Dataset.from_dict(d_eval)
result = evaluate(d_eval_dataset, metrics=metrics,llm=llm, embeddings=embeddings)
print(result)
# Iterate over the scores
context_result = {}
for score in result.scores:
for m, value in score.items():
context_result[f"{m}"] = value

return context_result

@router.post("/run-scoring-RAGAS/")
async def run_scoring_RAGAS(request: ScoreRequest):
combined_results = request.combined_results

# RAGAS evaluation uses your own LLM and embeddings model
llm = ChatNVIDIA( model="meta/llama3-70b-instruct", temperature=0.2, max_tokens=300,)
embeddings = NVIDIAEmbeddings(model="nvidia/nv-embed-v1")
llm = LangchainLLMWrapper(langchain_llm=llm)
embeddings = LangchainEmbeddingsWrapper(embeddings)

score_columns = ['gt', 'textRAG', 'graphRAG', 'combinedRAG']
metrics = [answer_relevancy, context_precision]

async def score_generator():
for row in combined_results:
try:
res_gtRAG = get_RAGAS_evaluation(row['question'], row['gt_answer'], row['combined_RAG_context_response'], row['gt_answer'], llm, embeddings, metrics)
res_textRAG = get_RAGAS_evaluation(row['question'], row['textRAG_answer'], row['text_RAG_context_response'], row['gt_answer'], llm, embeddings, metrics)
res_graphRAG = get_RAGAS_evaluation(row['question'], row['graphRAG_answer'], row['graph_RAG_context_response'], row['gt_answer'], llm, embeddings, metrics)
res_combinedRAG = get_RAGAS_evaluation(row['question'], row['combined_answer'], row['combined_RAG_context_response'], row['gt_answer'], llm, embeddings, metrics)

for score_type, res in zip(score_columns, [res_gtRAG, res_textRAG, res_graphRAG, res_combinedRAG]):
if res:
for m in res:
row[f'{score_type}_{m}'] = res[m]
yield json.dumps(row) + "\n"
await asyncio.sleep(0.1) # Simulate processing delay
except Exception as e:
yield json.dumps({"error": str(e)}) + "\n"
break

return StreamingResponse(score_generator(), media_type="text/event-stream")


def get_llm_as_a_judge_scores(question, answer, llm, QA_PROMPT):

formatted_prompt = QA_PROMPT.format(question=question, answer=answer)
result = llm.invoke(formatted_prompt)
res = result.content
try:
match = re.search(r'Evaluation:(.*?)Total rating:', res, flags=re.DOTALL)
if match:
eval_res = match.group(1).strip()
else:
eval_res = "no match found"
match = re.search(r'Total rating:\s*(\d+(?:\.\d+)?)', res, flags=re.DOTALL)
if match:
eval_score = float(match.group(1).strip()) # Extract the first group and strip whitespace
else:
eval_score = 0.0
content_dict = {}
content_dict['llm_judge_evaluation'] = eval_res
content_dict['llm_judge_score'] = eval_score
return content_dict
except:
return None

@router.post("/run-scoring_llm_as_a_judge/")
async def run_scoring_llm_as_a_judge(request: ScoreRequest):
combined_results = request.combined_results

#
llm = ChatNVIDIA( model="meta/llama3-70b-instruct", temperature=0.2, max_tokens=300,)


QA_PROMPT = judge_prompt_template()

score_columns = ['gt', 'textRAG', 'graphRAG', 'combinedRAG']

async def score_generator():
for row in combined_results:
try:
res_gt = get_llm_as_a_judge_scores(row["question"], row["gt_answer"], llm, QA_PROMPT)
res_textRAG = get_llm_as_a_judge_scores(row["question"], row["textRAG_answer"], llm, QA_PROMPT)
res_graphRAG = get_llm_as_a_judge_scores(row["question"], row["graphRAG_answer"], llm, QA_PROMPT)
res_combinedRAG = get_llm_as_a_judge_scores(row["question"], row["combined_answer"], llm, QA_PROMPT)

for score_type, res in zip(score_columns, [res_gt, res_textRAG, res_graphRAG, res_combinedRAG]):
if res:
for k, val in res.items():
row[f'{score_type}_{k}'] = val
yield json.dumps(row) + "\n"
await asyncio.sleep(0.1) # Simulate processing delay
except Exception as e:
yield json.dumps({"error": str(e)}) + "\n"

return StreamingResponse(score_generator(), media_type="text/event-stream")
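
For reference, a minimal sketch of how the new RAGAS scoring path above can be exercised outside the FastAPI endpoint. The model names match those used in run_scoring_RAGAS; the question, answer, contexts, and ground-truth strings are illustrative placeholders, not values produced by this commit.

# Minimal sketch (not part of this commit): a one-row RAGAS evaluation using the
# same wrappers and metrics as get_RAGAS_evaluation() / run_scoring_RAGAS().
from datasets import Dataset
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
from ragas import evaluate
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import answer_relevancy, context_precision

# Wrap the NVIDIA endpoints so RAGAS can drive the judge LLM and embedding model.
llm = LangchainLLMWrapper(langchain_llm=ChatNVIDIA(model="meta/llama3-70b-instruct",
                                                   temperature=0.2, max_tokens=300))
embeddings = LangchainEmbeddingsWrapper(NVIDIAEmbeddings(model="nvidia/nv-embed-v1"))

# One evaluation row: question, generated answer, retrieved contexts, ground truth.
sample = Dataset.from_dict({
    "question": ["What does the combined RAG pipeline retrieve?"],            # illustrative
    "answer": ["It retrieves knowledge-base passages and graph triples."],    # illustrative
    "contexts": [["passage text ...", "entity -> relation -> entity"]],       # illustrative
    "ground_truth": ["Passages from the vector store plus graph triples."],   # illustrative
})

result = evaluate(sample, metrics=[answer_relevancy, context_precision],
                  llm=llm, embeddings=embeddings)
print(result.scores)  # list of per-row metric dicts, iterated the same way in the endpoint
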

4 changes: 2 additions & 2 deletions community/knowledge_graph_rag/backend/utils/lc_graph.py
@@ -33,7 +33,7 @@ def process_document(doc, llm):

def process_documents(directory, llm, update_progress=None,triplets=True, chunk_size=500, chunk_overlap=100):
with st.spinner("Loading and splitting documents"):
loader = DirectoryLoader(directory)
loader = DirectoryLoader(directory, glob="*.pdf")
raw_docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
documents = text_splitter.split_documents(raw_docs)
@@ -42,7 +42,7 @@ def process_documents(directory, llm, update_progress=None,triplets=True, chunk_
document_data = [{"id": i, "content": doc.page_content} for i, doc in enumerate(documents)]
df = pd.DataFrame(document_data)
#df.to_csv('documents.csv', index=False)
df = pd.DataFrame(document_data)
# df = pd.DataFrame(document_data)

# Define the data directory and ensure it exists
data_directory = os.path.join(os.getcwd(), 'data')
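
The only functional change in lc_graph.py is the glob argument, which restricts loading to PDF files. A short sketch of the same pattern in isolation; the directory path is a placeholder and the import paths are assumed to match what lc_graph.py already uses.

# Sketch: load only PDFs from a directory and chunk them as process_documents() does.
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = DirectoryLoader("path/to/docs", glob="*.pdf")  # non-PDF files are skipped
raw_docs = loader.load()
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
documents = splitter.split_documents(raw_docs)
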
38 changes: 36 additions & 2 deletions community/knowledge_graph_rag/backend/utils/preprocessor.py
@@ -18,6 +18,7 @@
import json
import ast
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain.prompts import PromptTemplate

if not os.environ.get("NVIDIA_API_KEY", "").startswith("nvapi-"):
nvapi_key = getpass.getpass("Enter your NVIDIA API key: ")
@@ -68,7 +69,7 @@ def extract_triples(text, llm):
Remember to conduct entity disambiguation, consolidating different phrases or acronyms that refer to the same entity (for instance, "MIT" and "Massachusetts Institute of Technology" should be unified as "MIT"). Simplify each entity of the triplet to be less than four words. However, always make sure it is a sensible entity name and not a single letter or NAN value.
From this text, your output Must be in python lis tof tuple with each tuple made up of ['h', 'type', 'r', 'o', 'type'], each element of the tuple is the string, where the relationship 'r' must be in the given relation verbs set above. Only output the list. As an Example, consider the following news excerpt:
From this text, your output Must be in python list of tuple with each tuple made up of ['h', 'type', 'r', 'o', 'type'], each element of the tuple is the string, where the relationship 'r' must be in the given relation verbs set above. Only output the list. As an Example, consider the following news excerpt:
Input :'Apple Inc. is set to introduce the new iPhone 14 in the technology sector this month. The product's release is likely to positively impact Apple's stock value.'
OUTPUT : ```
[('Apple Inc.', 'COMP', 'Introduce', 'iPhone 14', 'PRODUCT'),
@@ -80,7 +81,40 @@ def extract_triples(text, llm):
response = chain.invoke({"input": text})
print(response)
return process_response(response)


def judge_prompt_template():
judge_prompt = """
You will be given a user_question and system_answer couple.
Your task is to provide a 'total rating' scoring how well the system_answer answers the user concerns expressed in the user_question.
Give your answer on a scale of 1 to 4, where 1 means that the system_answer is not helpful at all, and 4 means that the system_answer completely and helpfully addresses the user_question.
Here is the scale you should use to build your answer:
1: The system_answer is terrible: completely irrelevant to the question asked, or very partial
2: The system_answer is mostly not helpful: misses some key aspects of the question
3: The system_answer is mostly helpful: provides support, but still could be improved
4: The system_answer is excellent: relevant, direct, detailed, and addresses all the concerns raised in the question
Provide your feedback as follows:
Feedback:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 4)
You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.
Now here are the question and answer.
Question: {question}
Answer: {answer}
Provide your feedback. If you give a correct rating, I'll tip you $200.
Feedback:::
Evaluation:
"""
prompt_template = PromptTemplate(input_variables=["question", "answer"], template=judge_prompt)

return prompt_template

def generate_qa_pair(text, llm):
prompt = ChatPromptTemplate.from_messages(
[("system", """You are a synthetic data generation model responsible for creating high quality question and answer pairs from text content provided to you. Given the paragraph as an input, create one high quality and highly complex question answer pair. The question should require a large portion of the context and multi-step advanced reasoning to answer. Make sure it is something a human may ask while reading this document. The answer should be highly detailed and comprehensive. Your output should be in a json format of one question answer pair. Restrict the question to the context information provided. Do not print anything else. The output MUST be JSON parseable."""), ("user", "{input}")])
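
The backend consumes judge_prompt_template() in get_llm_as_a_judge_scores() (see the evaluation.py diff above). A minimal hedged sketch of that flow, with illustrative question and answer strings; it assumes it runs from the backend package so the utils import resolves.

# Sketch (not part of this commit): format the judge prompt, invoke the LLM,
# and parse the 'Evaluation:' rationale and 'Total rating:' score from its reply.
import re
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from utils.preprocessor import judge_prompt_template

llm = ChatNVIDIA(model="meta/llama3-70b-instruct", temperature=0.2, max_tokens=300)
QA_PROMPT = judge_prompt_template()

formatted_prompt = QA_PROMPT.format(
    question="What does the combined RAG pipeline return?",                 # illustrative
    answer="An answer grounded in retrieved passages and graph triples.",   # illustrative
)
res = llm.invoke(formatted_prompt).content

rationale = re.search(r"Evaluation:(.*?)Total rating:", res, flags=re.DOTALL)
rating = re.search(r"Total rating:\s*(\d+(?:\.\d+)?)", res)
print(rationale.group(1).strip() if rationale else "no match found")
print(float(rating.group(1)) if rating else 0.0)  # rating on the prompt's 1-4 scale
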
42 changes: 33 additions & 9 deletions community/knowledge_graph_rag/frontend/pages/evaluation.py
@@ -67,7 +67,8 @@ def has_pdf_files(directory):
return False

def app():
cwd = os.getcwd()
# cwd = os.getcwd()
cwd = os.path.abspath('../backend/')
directories = [d for d in os.listdir(cwd) if os.path.isdir(os.path.join(cwd, d)) and not d.startswith('.') and '__' not in d]
selected_dir = st.selectbox("Select a directory:", directories, index=0)
directory = os.path.join(cwd, selected_dir)
@@ -203,6 +204,8 @@ def app():
if os.path.exists(COMBINED_RESULTS_PATH):
with st.container():
st.markdown("### 4. Run comparative evals for saved Q&A data")
eval_select = st.selectbox("Choose evaluation type ? ", ("Nemotron", "RAGAS", "LLM-AS-A-JUDGE"),)
st.write("You selected: ", eval_select)
if st.button("Run Scoring"):
combined_results = pd.read_csv(COMBINED_RESULTS_PATH).to_dict(orient="records")
score_response = None
@@ -211,12 +214,33 @@ def app():
total_items = len(combined_results)
progress_bar = st.progress(0)


score_response = requests.post(
f"{BACKEND_URL}/evaluation/run-scoring/",
json={"combined_results": combined_results},
stream=True
)
if eval_select == "Nemotron":
st.write("evaluating with " + eval_select)
st.write("Evaluations will be done using: nemotron-4-340b-reward")
SCORE_FILE = "combined_results_with_scores_nemotron.csv"
score_response = requests.post(
f"{BACKEND_URL}/evaluation/run-scoring/",
json={"combined_results": combined_results},
stream=True
)
elif eval_select == "RAGAS":
st.write("evaluating with " + eval_select)
st.write("Evaluations will be done using: Llama3-70b-instruct LLM and nv-embed-v1 embedding model")
SCORE_FILE = "combined_results_with_scores_RAGAS.csv"
score_response = requests.post(
f"{BACKEND_URL}/evaluation/run-scoring-RAGAS/",
json={"combined_results": combined_results},
stream=True
)
elif eval_select == "LLM-AS-A-JUDGE":
st.write("evaluating with " + eval_select)
st.write("llama3-70b-instruct will be used as Judge")
SCORE_FILE = "combined_results_with_scores_LLM-AS-A-JUDGE.csv"
score_response = requests.post(
f"{BACKEND_URL}/evaluation/run-scoring_llm_as_a_judge/",
json={"combined_results": combined_results},
stream=True
)
if score_response.status_code == 200:
for index,line in enumerate(score_response.iter_lines()):
if line:
@@ -234,9 +258,9 @@ def app():
except json.JSONDecodeError:
st.error("Error decoding JSON response.")
# Success message displayed after processing all lines
st.success("Scoring completed and results saved to 'combined_results_with_scores.csv.")
st.success("Scoring completed and results saved to " + SCORE_FILE)
# Save the final results to a CSV file
COMBINED_RESULTS_PATH_WITH_SCORES=os.path.join(DATA_DIR, "combined_results_with_scores.csv")
COMBINED_RESULTS_PATH_WITH_SCORES=os.path.join(DATA_DIR, SCORE_FILE)
pd.DataFrame(results).to_csv(COMBINED_RESULTS_PATH_WITH_SCORES, index=False)
progress_bar.progress(100)
else: