Commit e8f0254

RAGAS and LLM as a Judge evaluation (#256)
* Added a notebook that showcases a copilot application using NIM, an embedding model, and a FAISS VDB. All these tools are integrated using LangChain plugins. The notebook also has a Gradio-based UI to interact with the application.
* Added images for the notebook and minor tweaks in the notebook.
* Typo correction.
* Update langchain_copilot_with_NIM_HF_FAISS_deployed_locally.ipynb.
* Update langchain_copilot_with_NIM_HF_FAISS_deployed_locally.ipynb: corrected typo.
* File management: moved notebooks under llm_video_series.
* Changed one dependency.
* Added image files for the notebook.
* Removed unused images.
* Removed auto-generated folders and README.md.
* Added RAGAS evaluation: (1) added changes to save the top-n context pulled by the retriever, which is needed for downstream RAGAS evaluation; (2) modified the UI to pick which evaluation to use from a drop-down.
* Added LLM as a judge to the evaluation.
* Added LLM as a judge, an argument to read only PDF files from the folder, and RAGAS evaluation.

---------

Co-authored-by: Jay Rodge <[email protected]>
1 parent 7732841 commit e8f0254

File tree

5 files changed (+219, -27 lines)


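The core of the change is that each retrieval path now returns its top-n context alongside the answer, so a row in combined_results carries both. A sketch of the row shape, inferred from the process_question diff below; the field values are illustrative placeholders:

row = {
    "question": "…",
    "gt_answer": "…",
    "textRAG_answer": "…",
    "graphRAG_answer": "…",
    "combined_answer": "…",
    "text_RAG_context_response": ["top-n passages from the vector retriever"],
    "graph_RAG_context_response": ["triples pulled from the knowledge graph"],
    "combined_RAG_context_response": ["passages plus graph triples"],
}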
community/knowledge_graph_rag/backend/routers/evaluation.py

Lines changed: 146 additions & 13 deletions

@@ -21,13 +21,13 @@
 from fastapi.responses import StreamingResponse
 from pydantic import BaseModel
 from concurrent.futures import ThreadPoolExecutor
-from langchain_nvidia_ai_endpoints import ChatNVIDIA
-from langchain.chains import GraphQAChain
+from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
+from langchain.chains import GraphQAChain, LLMChain
 from vectorstore.search import SearchHandler
 from langchain_core.output_parsers import StrOutputParser
 from langchain_core.prompts import ChatPromptTemplate
 from langchain_community.graphs.networkx_graph import NetworkxEntityGraph
-from utils.preprocessor import generate_qa_pair
+from utils.preprocessor import generate_qa_pair, judge_prompt_template
 from utils.lc_graph import process_documents, save_triples_to_csvs
 from llama_index.core import SimpleDirectoryReader
 from openai import OpenAI

@@ -36,7 +36,15 @@
 import json
 import time
 import logging
-import os
+import re
+
+from datasets import Dataset
+from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
+from ragas import evaluate
+from ragas.embeddings import LangchainEmbeddingsWrapper
+from ragas.llms import LangchainLLMWrapper
+
+
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

@@ -96,16 +104,19 @@ def process_question(question, answer, llm):
         future_graph = executor.submit(get_graph_RAG_response, question, llm)
         future_combined = executor.submit(get_combined_RAG_response, question, llm)

-        text_RAG_response = future_text.result()
-        graph_RAG_response = future_graph.result()
-        combined_RAG_response = future_combined.result()
+        text_RAG_response, text_RAG_context_response = future_text.result()
+        graph_RAG_response, graph_RAG_context_response = future_graph.result()
+        combined_RAG_response, combined_RAG_context_response = future_combined.result()

     return {
         "question": question,
         "gt_answer": answer,
         "textRAG_answer": text_RAG_response,
         "graphRAG_answer": graph_RAG_response,
-        "combined_answer": combined_RAG_response
+        "combined_answer": combined_RAG_response,
+        "text_RAG_context_response": text_RAG_context_response,
+        "graph_RAG_context_response": graph_RAG_context_response,
+        "combined_RAG_context_response": combined_RAG_context_response
     }

 prompt_template = ChatPromptTemplate.from_messages(

@@ -122,8 +133,11 @@ def get_text_RAG_response(question, llm):
     search_handler = SearchHandler("hybrid_demo3", use_bge_m3=True, use_reranker=True)
     res = search_handler.search_and_rerank(question, k=5)
     context = "Here are the relevant passages from the knowledge base: \n\n" + "\n".join(item.text for item in res)
+    context_return = []
+    if res:
+        context_return = [item.text for item in res]
     answer = chain.invoke("Context: " + context + "\n\nUser query: " + question)
-    return answer
+    return answer, context_return

 def get_graph_RAG_response(question, llm):
     chain = prompt_template | llm | StrOutputParser()

@@ -141,10 +155,15 @@ def get_graph_RAG_response(question, llm):
         for entity in entities:
             all_triplets.extend(graph.get_entity_knowledge(entity, depth=2))
         context = "Here are the relationships from the knowledge graph: " + "\n".join(all_triplets)
+        context_return = []
+        if all_triplets:
+            context_return = [trip for trip in all_triplets]
+        else:
+            context_return = ["no relationship found"]
     except:
         context = "No graph triples were available to extract from the knowledge graph. Always provide a disclaimer if you know the answer to the user's question, since it is not grounded in the knowledge you are provided from the graph."
     answer = chain.invoke("Context: " + context + "\n\nUser query: " + question)
-    return answer
+    return answer, context_return

 def get_combined_RAG_response(question, llm):
     chain = prompt_template | llm | StrOutputParser()

@@ -158,14 +177,21 @@ def get_combined_RAG_response(question, llm):
         search_handler = SearchHandler("hybrid_demo3", use_bge_m3=True, use_reranker=True)
         res = search_handler.search_and_rerank(question, k=5)
         context = "Here are the relevant passages from the knowledge base: \n\n" + "\n".join(item.text for item in res)
+        context_return = []
+        if res:
+            context_return = [item.text for item in res]
+
         all_triplets = []
         for entity in entities:
             all_triplets.extend(graph.get_entity_knowledge(entity, depth=2))
         context += "\n\nHere are the relationships from the knowledge graph: " + "\n".join(all_triplets)
+        if all_triplets:
+            for trip in all_triplets:
+                context_return.append(trip)
     except Exception as e:
         context = "No graph triples were available to extract from the knowledge graph. Always provide a disclaimer if you know the answer to the user's question, since it is not grounded in the knowledge you are provided from the graph."
     answer = chain.invoke("Context: " + context + "\n\nUser query: " + question)
-    return answer
+    return answer, context_return

 @router.post("/process-documents/")
 async def process_documents_endpoint(request: ProcessRequest, background_tasks: BackgroundTasks):

@@ -288,7 +314,6 @@ async def score_generator():
                 res_textRAG = get_reward_scores(row["question"], row["textRAG_answer"])
                 res_graphRAG = get_reward_scores(row["question"], row["graphRAG_answer"])
                 res_combinedRAG = get_reward_scores(row["question"], row["combined_answer"])
-
                 for score_type, res in zip(score_columns, [res_gt, res_textRAG, res_graphRAG, res_combinedRAG]):
                     if res:
                         for metric in metrics:

@@ -300,4 +325,112 @@ async def score_generator():

     return StreamingResponse(score_generator(), media_type="text/event-stream")

-
+def get_RAGAS_evaluation(question, rag_answer, context, gt_answer, llm, embeddings, metrics):
+
+    list_items = context.split(',')
+    list_items = [item.strip() for item in list_items]
+    d_eval = {
+        "question": [question],
+        "answer": [rag_answer],
+        "contexts": [list_items],
+        "ground_truth": [gt_answer]
+    }
+    d_eval_dataset = Dataset.from_dict(d_eval)
+    result = evaluate(d_eval_dataset, metrics=metrics, llm=llm, embeddings=embeddings)
+    print(result)
+    # Iterate over the scores
+    context_result = {}
+    for score in result.scores:
+        for m, value in score.items():
+            context_result[f"{m}"] = value
+
+    return context_result
+
+@router.post("/run-scoring-RAGAS/")
+async def run_scoring_RAGAS(request: ScoreRequest):
+    combined_results = request.combined_results
+
+    # RAGAS evaluation uses your own LLM and embeddings model
+    llm = ChatNVIDIA(model="meta/llama3-70b-instruct", temperature=0.2, max_tokens=300,)
+    embeddings = NVIDIAEmbeddings(model="nvidia/nv-embed-v1")
+    llm = LangchainLLMWrapper(langchain_llm=llm)
+    embeddings = LangchainEmbeddingsWrapper(embeddings)
+
+    score_columns = ['gt', 'textRAG', 'graphRAG', 'combinedRAG']
+    metrics = [answer_relevancy, context_precision]
+
+    async def score_generator():
+        for row in combined_results:
+            try:
+                res_gtRAG = get_RAGAS_evaluation(row['question'], row['gt_answer'], row['combined_RAG_context_response'], row['gt_answer'], llm, embeddings, metrics)
+                res_textRAG = get_RAGAS_evaluation(row['question'], row['textRAG_answer'], row['text_RAG_context_response'], row['gt_answer'], llm, embeddings, metrics)
+                res_graphRAG = get_RAGAS_evaluation(row['question'], row['graphRAG_answer'], row['graph_RAG_context_response'], row['gt_answer'], llm, embeddings, metrics)
+                res_combinedRAG = get_RAGAS_evaluation(row['question'], row['combined_answer'], row['combined_RAG_context_response'], row['gt_answer'], llm, embeddings, metrics)
+
+                for score_type, res in zip(score_columns, [res_gtRAG, res_textRAG, res_graphRAG, res_combinedRAG]):
+                    if res:
+                        for m in res:
+                            row[f'{score_type}_{m}'] = res[m]
+                yield json.dumps(row) + "\n"
+                await asyncio.sleep(0.1)  # Simulate processing delay
+            except Exception as e:
+                yield json.dumps({"error": str(e)}) + "\n"
+                break
+
+    return StreamingResponse(score_generator(), media_type="text/event-stream")
+
+
+def get_llm_as_a_judge_scores(question, answer, llm, QA_PROMPT):
+
+    formatted_prompt = QA_PROMPT.format(question=question, answer=answer)
+    result = llm.invoke(formatted_prompt)
+    res = result.content
+    try:
+        match = re.search(r'Evaluation:(.*?)Total rating:', res, flags=re.DOTALL)
+        if match:
+            eval_res = match.group(1).strip()
+        else:
+            eval_res = "no match found"
+        match = re.search(r'Total rating:\s*(\d+(?:\.\d+)?)', res, flags=re.DOTALL)
+        if match:
+            eval_score = float(match.group(1).strip())  # Extract the first group and strip whitespace
+        else:
+            eval_score = 0.0
+        content_dict = {}
+        content_dict['llm_judge_evaluation'] = eval_res
+        content_dict['llm_judge_score'] = eval_score
+        return content_dict
+    except:
+        return None
+
+@router.post("/run-scoring_llm_as_a_judge/")
+async def run_scoring_llm_as_a_judge(request: ScoreRequest):
+    combined_results = request.combined_results
+
+    #
+    llm = ChatNVIDIA(model="meta/llama3-70b-instruct", temperature=0.2, max_tokens=300,)
+
+
+    QA_PROMPT = judge_prompt_template()
+
+    score_columns = ['gt', 'textRAG', 'graphRAG', 'combinedRAG']
+
+    async def score_generator():
+        for row in combined_results:
+            try:
+                res_gt = get_llm_as_a_judge_scores(row["question"], row["gt_answer"], llm, QA_PROMPT)
+                res_textRAG = get_llm_as_a_judge_scores(row["question"], row["textRAG_answer"], llm, QA_PROMPT)
+                res_graphRAG = get_llm_as_a_judge_scores(row["question"], row["graphRAG_answer"], llm, QA_PROMPT)
+                res_combinedRAG = get_llm_as_a_judge_scores(row["question"], row["combined_answer"], llm, QA_PROMPT)
+
+                for score_type, res in zip(score_columns, [res_gt, res_textRAG, res_graphRAG, res_combinedRAG]):
+                    if res:
+                        for k, val in res.items():
+                            row[f'{score_type}_{k}'] = val
+                yield json.dumps(row) + "\n"
+                await asyncio.sleep(0.1)  # Simulate processing delay
+            except Exception as e:
+                yield json.dumps({"error": str(e)}) + "\n"
+
+    return StreamingResponse(score_generator(), media_type="text/event-stream")
+

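For reference, the RAGAS call added above in get_RAGAS_evaluation can be exercised on its own. A minimal sketch, assuming NVIDIA_API_KEY is set and the ragas, datasets, and langchain-nvidia-ai-endpoints packages are installed; the question, answer, contexts, and ground truth below are illustrative placeholders:

from datasets import Dataset
from ragas import evaluate
from ragas.metrics import answer_relevancy, context_precision
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings

# Wrap the NVIDIA-hosted LLM and embedding model so RAGAS can drive them.
llm = LangchainLLMWrapper(langchain_llm=ChatNVIDIA(model="meta/llama3-70b-instruct", temperature=0.2, max_tokens=300))
embeddings = LangchainEmbeddingsWrapper(NVIDIAEmbeddings(model="nvidia/nv-embed-v1"))

# One evaluation sample: the contexts come from the *_context_response fields saved upstream.
sample = Dataset.from_dict({
    "question": ["What does the knowledge graph RAG pipeline do?"],
    "answer": ["It augments vector retrieval with graph triples before answering."],
    "contexts": [["passage one from the retriever", "passage two from the retriever"]],
    "ground_truth": ["It combines vector search with knowledge-graph triples to answer questions."],
})

result = evaluate(sample, metrics=[answer_relevancy, context_precision], llm=llm, embeddings=embeddings)
print(result)  # per-metric scores, as iterated over via result.scores in the endpoint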
community/knowledge_graph_rag/backend/utils/lc_graph.py

Lines changed: 2 additions & 2 deletions

@@ -33,7 +33,7 @@ def process_document(doc, llm):

 def process_documents(directory, llm, update_progress=None,triplets=True, chunk_size=500, chunk_overlap=100):
     with st.spinner("Loading and splitting documents"):
-        loader = DirectoryLoader(directory)
+        loader = DirectoryLoader(directory, glob="*.pdf")
         raw_docs = loader.load()
         text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
         documents = text_splitter.split_documents(raw_docs)

@@ -42,7 +42,7 @@ def process_documents(directory, llm, update_progress=None,triplets=True, chunk_
     document_data = [{"id": i, "content": doc.page_content} for i, doc in enumerate(documents)]
     df = pd.DataFrame(document_data)
     #df.to_csv('documents.csv', index=False)
-    df = pd.DataFrame(document_data)
+    # df = pd.DataFrame(document_data)

     # Define the data directory and ensure it exists
     data_directory = os.path.join(os.getcwd(), 'data')

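The glob change above restricts document loading to PDF files only. A minimal standalone sketch, assuming DirectoryLoader and the text splitter come from the usual LangChain packages and using a hypothetical directory path:

from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Load only PDF files from the folder, mirroring glob="*.pdf" above.
loader = DirectoryLoader("docs/", glob="*.pdf")  # "docs/" is a hypothetical path
raw_docs = loader.load()

# Split into the same chunk sizes the backend defaults to.
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
documents = splitter.split_documents(raw_docs)
print(f"Loaded {len(raw_docs)} PDFs and produced {len(documents)} chunks")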
community/knowledge_graph_rag/backend/utils/preprocessor.py

Lines changed: 36 additions & 2 deletions

@@ -18,6 +18,7 @@
 import json
 import ast
 from langchain_nvidia_ai_endpoints import ChatNVIDIA
+from langchain.prompts import PromptTemplate

 if not os.environ.get("NVIDIA_API_KEY", "").startswith("nvapi-"):
     nvapi_key = getpass.getpass("Enter your NVIDIA API key: ")

@@ -68,7 +69,7 @@ def extract_triples(text, llm):

     Remember to conduct entity disambiguation, consolidating different phrases or acronyms that refer to the same entity (for instance, "MIT" and "Massachusetts Institute of Technology" should be unified as "MIT"). Simplify each entity of the triplet to be less than four words. However, always make sure it is a sensible entity name and not a single letter or NAN value.

-    From this text, your output Must be in python lis tof tuple with each tuple made up of ['h', 'type', 'r', 'o', 'type'], each element of the tuple is the string, where the relationship 'r' must be in the given relation verbs set above. Only output the list. As an Example, consider the following news excerpt:
+    From this text, your output Must be in python list of tuple with each tuple made up of ['h', 'type', 'r', 'o', 'type'], each element of the tuple is the string, where the relationship 'r' must be in the given relation verbs set above. Only output the list. As an Example, consider the following news excerpt:
     Input :'Apple Inc. is set to introduce the new iPhone 14 in the technology sector this month. The product's release is likely to positively impact Apple's stock value.'
     OUTPUT : ```
     [('Apple Inc.', 'COMP', 'Introduce', 'iPhone 14', 'PRODUCT'),

@@ -80,7 +81,40 @@ def extract_triples(text, llm):
     response = chain.invoke({"input": text})
     print(response)
     return process_response(response)
-
+
+def judge_prompt_template():
+    judge_prompt = """
+    You will be given a user_question and system_answer couple.
+    Your task is to provide a 'total rating' scoring how well the system_answer answers the user concerns expressed in the user_question.
+    Give your answer on a scale of 1 to 4, where 1 means that the system_answer is not helpful at all, and 4 means that the system_answer completely and helpfully addresses the user_question.
+
+    Here is the scale you should use to build your answer:
+    1: The system_answer is terrible: completely irrelevant to the question asked, or very partial
+    2: The system_answer is mostly not helpful: misses some key aspects of the question
+    3: The system_answer is mostly helpful: provides support, but still could be improved
+    4: The system_answer is excellent: relevant, direct, detailed, and addresses all the concerns raised in the question
+
+    Provide your feedback as follows:
+
+    Feedback:::
+    Evaluation: (your rationale for the rating, as a text)
+    Total rating: (your rating, as a number between 1 and 4)
+
+    You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.
+
+    Now here are the question and answer.
+
+    Question: {question}
+    Answer: {answer}
+
+    Provide your feedback. If you give a correct rating, I'll tip you $200.
+    Feedback:::
+    Evaluation:
+    """
+    prompt_template = PromptTemplate(input_variables=["question", "answer"], template=judge_prompt)
+
+    return prompt_template
+
 def generate_qa_pair(text, llm):
     prompt = ChatPromptTemplate.from_messages(
         [("system", """You are a synthetic data generation model responsible for creating high quality question and answer pairs from text content provided to you. Given the paragraph as an input, create one high quality and highly complex question answer pair. The question should require a large portion of the context and multi-step advanced reasoning to answer. Make sure it is something a human may ask while reading this document. The answer should be highly detailed and comprehensive. Your output should be in a json format of one question answer pair. Restrict the question to the context information provided. Do not print anything else. The output MUST be JSON parseable."""), ("user", "{input}")])

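The new judge_prompt_template feeds the LLM-as-a-judge scoring in the backend. A minimal sketch of how it is consumed, mirroring get_llm_as_a_judge_scores; the question and answer are illustrative, NVIDIA_API_KEY must be set, and the import path assumes you run from the backend directory:

import re
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from utils.preprocessor import judge_prompt_template

llm = ChatNVIDIA(model="meta/llama3-70b-instruct", temperature=0.2, max_tokens=300)
QA_PROMPT = judge_prompt_template()

# Format the judge prompt for one question/answer pair and ask the LLM to grade it.
prompt = QA_PROMPT.format(question="What does the combined RAG pipeline add?",
                          answer="It augments vector retrieval with knowledge-graph triples.")
res = llm.invoke(prompt).content

# Extract the rationale and the 1-4 rating the prompt asks for.
rationale = re.search(r'Evaluation:(.*?)Total rating:', res, flags=re.DOTALL)
rating = re.search(r'Total rating:\s*(\d+(?:\.\d+)?)', res)
print(rationale.group(1).strip() if rationale else "no match found")
print(float(rating.group(1)) if rating else 0.0)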
community/knowledge_graph_rag/frontend/pages/evaluation.py

Lines changed: 33 additions & 9 deletions

@@ -67,7 +67,8 @@ def has_pdf_files(directory):
     return False

 def app():
-    cwd = os.getcwd()
+    # cwd = os.getcwd()
+    cwd = os.path.abspath('../backend/')
     directories = [d for d in os.listdir(cwd) if os.path.isdir(os.path.join(cwd, d)) and not d.startswith('.') and '__' not in d]
     selected_dir = st.selectbox("Select a directory:", directories, index=0)
     directory = os.path.join(cwd, selected_dir)

@@ -203,6 +204,8 @@ def app():
     if os.path.exists(COMBINED_RESULTS_PATH):
         with st.container():
             st.markdown("### 4. Run comparative evals for saved Q&A data")
+            eval_select = st.selectbox("Choose evaluation type ? ", ("Nemotron", "RAGAS", "LLM-AS-A-JUDGE"),)
+            st.write("You selected: ", eval_select)
             if st.button("Run Scoring"):
                 combined_results = pd.read_csv(COMBINED_RESULTS_PATH).to_dict(orient="records")
                 score_response = None

@@ -211,12 +214,33 @@ def app():
                 total_items = len(combined_results)
                 progress_bar = st.progress(0)

-
-                score_response = requests.post(
-                    f"{BACKEND_URL}/evaluation/run-scoring/",
-                    json={"combined_results": combined_results},
-                    stream=True
-                )
+                if eval_select == "Nemotron":
+                    st.write("evaluating with " + eval_select)
+                    st.write("Evaluations will be done using: nemotron-4-340b-reward")
+                    SCORE_FILE = "combined_results_with_scores_nemotron.csv"
+                    score_response = requests.post(
+                        f"{BACKEND_URL}/evaluation/run-scoring/",
+                        json={"combined_results": combined_results},
+                        stream=True
+                    )
+                elif eval_select == "RAGAS":
+                    st.write("evaluating with " + eval_select)
+                    st.write("Evaluations will be done using: Llama3-70b-instruct LLM and nv-embed-v1 embedding model")
+                    SCORE_FILE = "combined_results_with_scores_RAGAS.csv"
+                    score_response = requests.post(
+                        f"{BACKEND_URL}/evaluation/run-scoring-RAGAS/",
+                        json={"combined_results": combined_results},
+                        stream=True
+                    )
+                elif eval_select == "LLM-AS-A-JUDGE":
+                    st.write("evaluating with " + eval_select)
+                    st.write("llama3-70b-instruct will be used as Judge")
+                    SCORE_FILE = "combined_results_with_scores_LLM-AS-A-JUDGE.csv"
+                    score_response = requests.post(
+                        f"{BACKEND_URL}/evaluation/run-scoring_llm_as_a_judge/",
+                        json={"combined_results": combined_results},
+                        stream=True
+                    )
                 if score_response.status_code == 200:
                     for index,line in enumerate(score_response.iter_lines()):
                         if line:

@@ -234,9 +258,9 @@ def app():
                             except json.JSONDecodeError:
                                 st.error("Error decoding JSON response.")
                 # Success message displayed after processing all lines
-                st.success("Scoring completed and results saved to 'combined_results_with_scores.csv.")
+                st.success("Scoring completed and results saved to " + SCORE_FILE)
                 # Save the final results to a CSV file
-                COMBINED_RESULTS_PATH_WITH_SCORES=os.path.join(DATA_DIR, "combined_results_with_scores.csv")
+                COMBINED_RESULTS_PATH_WITH_SCORES=os.path.join(DATA_DIR, SCORE_FILE)
                 pd.DataFrame(results).to_csv(COMBINED_RESULTS_PATH_WITH_SCORES, index=False)
                 progress_bar.progress(100)
             else:

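From a client's point of view, each of these scoring endpoints streams one JSON object per scored row. A minimal sketch of calling the RAGAS endpoint outside Streamlit; the backend URL and the input CSV filename are hypothetical and should point at your own setup:

import json
import pandas as pd
import requests

BACKEND_URL = "http://localhost:8000"  # assumption: wherever the FastAPI backend is served
combined_results = pd.read_csv("combined_results.csv").to_dict(orient="records")  # hypothetical filename

resp = requests.post(f"{BACKEND_URL}/evaluation/run-scoring-RAGAS/",
                     json={"combined_results": combined_results},
                     stream=True)

# The backend yields newline-delimited JSON, one object per row as it is scored.
scored_rows = []
for line in resp.iter_lines():
    if line:
        scored_rows.append(json.loads(line))

pd.DataFrame(scored_rows).to_csv("combined_results_with_scores_RAGAS.csv", index=False)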