RAGAS and LLM as a Judge evaluation (#256)
* added notebook that showcases a copilot application using NIM, an embedding model, and a FAISS vector DB. All of these tools are integrated using LangChain plugins. The notebook also has a Gradio-based UI to interact with the application

* added images for the notebook and made minor tweaks to it

* typo correction

* Update langchain_copilot_with_NIM_HF_FAISS_deployed_locally.ipynb

* Update langchain_copilot_with_NIM_HF_FAISS_deployed_locally.ipynb

corrected typo

* file management: moved notebooks under llm_video_series

* changed one dependency

* added image files for notebook

* removed images not used

* removed auto generated folders and README.md

* added RAGAS evaluation. (1) Added changes to save the top-n context pulled by the retriever, which is needed for downstream RAGAS evaluation. (2) Modified the UI to pick which evaluation to use from a drop-down

* Added LLM as a judge to the evaluation

* added LLM-as-a-judge evaluation, an argument to read only PDF files from the folder, and RAGAS evaluation

---------

Co-authored-by: Jay Rodge <[email protected]>
patelmiteshn and jayrodge authored Dec 16, 2024
1 parent 7732841 commit e8f0254
Showing 5 changed files with 219 additions and 27 deletions.
159 changes: 146 additions & 13 deletions community/knowledge_graph_rag/backend/routers/evaluation.py
@@ -21,13 +21,13 @@
from fastapi.responses import StreamingResponse
from pydantic import BaseModel
from concurrent.futures import ThreadPoolExecutor
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain.chains import GraphQAChain
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
from langchain.chains import GraphQAChain, LLMChain
from vectorstore.search import SearchHandler
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_community.graphs.networkx_graph import NetworkxEntityGraph
from utils.preprocessor import generate_qa_pair
from utils.preprocessor import generate_qa_pair, judge_prompt_template
from utils.lc_graph import process_documents, save_triples_to_csvs
from llama_index.core import SimpleDirectoryReader
from openai import OpenAI
@@ -36,7 +36,15 @@
import json
import time
import logging
import os
import re

from datasets import Dataset
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from ragas import evaluate
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper


logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

@@ -96,16 +104,19 @@ def process_question(question, answer, llm):
future_graph = executor.submit(get_graph_RAG_response, question, llm)
future_combined = executor.submit(get_combined_RAG_response, question, llm)

text_RAG_response = future_text.result()
graph_RAG_response = future_graph.result()
combined_RAG_response = future_combined.result()
text_RAG_response, text_RAG_context_response = future_text.result()
graph_RAG_response, graph_RAG_context_response = future_graph.result()
combined_RAG_response, combined_RAG_context_response = future_combined.result()

return {
"question": question,
"gt_answer": answer,
"textRAG_answer": text_RAG_response,
"graphRAG_answer": graph_RAG_response,
"combined_answer": combined_RAG_response
"combined_answer": combined_RAG_response,
"text_RAG_context_response": text_RAG_context_response,
"graph_RAG_context_response": graph_RAG_context_response,
"combined_RAG_context_response": combined_RAG_context_response
}

prompt_template = ChatPromptTemplate.from_messages(
@@ -122,8 +133,11 @@ def get_text_RAG_response(question, llm):
search_handler = SearchHandler("hybrid_demo3", use_bge_m3=True, use_reranker=True)
res = search_handler.search_and_rerank(question, k=5)
context = "Here are the relevant passages from the knowledge base: \n\n" + "\n".join(item.text for item in res)
context_return = []
if res:
context_return = [item.text for item in res]
answer = chain.invoke("Context: " + context + "\n\nUser query: " + question)
return answer
return answer, context_return

def get_graph_RAG_response(question, llm):
chain = prompt_template | llm | StrOutputParser()
@@ -141,10 +155,15 @@ def get_graph_RAG_response(question, llm):
for entity in entities:
all_triplets.extend(graph.get_entity_knowledge(entity, depth=2))
context = "Here are the relationships from the knowledge graph: " + "\n".join(all_triplets)
context_return = []
if all_triplets:
context_return = [trip for trip in all_triplets]
else:
context_return = ["no relationship found"]
except:
context = "No graph triples were available to extract from the knowledge graph. Always provide a disclaimer if you know the answer to the user's question, since it is not grounded in the knowledge you are provided from the graph."
answer = chain.invoke("Context: " + context + "\n\nUser query: " + question)
return answer
return answer, context_return

def get_combined_RAG_response(question, llm):
chain = prompt_template | llm | StrOutputParser()
@@ -158,14 +177,21 @@ def get_combined_RAG_response(question, llm):
search_handler = SearchHandler("hybrid_demo3", use_bge_m3=True, use_reranker=True)
res = search_handler.search_and_rerank(question, k=5)
context = "Here are the relevant passages from the knowledge base: \n\n" + "\n".join(item.text for item in res)
context_return = []
if res:
context_return = [item.text for item in res]

all_triplets = []
for entity in entities:
all_triplets.extend(graph.get_entity_knowledge(entity, depth=2))
context += "\n\nHere are the relationships from the knowledge graph: " + "\n".join(all_triplets)
if all_triplets:
for trip in all_triplets:
context_return.append(trip)
except Exception as e:
context = "No graph triples were available to extract from the knowledge graph. Always provide a disclaimer if you know the answer to the user's question, since it is not grounded in the knowledge you are provided from the graph."
answer = chain.invoke("Context: " + context + "\n\nUser query: " + question)
return answer
return answer, context_return

@router.post("/process-documents/")
async def process_documents_endpoint(request: ProcessRequest, background_tasks: BackgroundTasks):
@@ -288,7 +314,6 @@ async def score_generator():
res_textRAG = get_reward_scores(row["question"], row["textRAG_answer"])
res_graphRAG = get_reward_scores(row["question"], row["graphRAG_answer"])
res_combinedRAG = get_reward_scores(row["question"], row["combined_answer"])

for score_type, res in zip(score_columns, [res_gt, res_textRAG, res_graphRAG, res_combinedRAG]):
if res:
for metric in metrics:
@@ -300,4 +325,112 @@

return StreamingResponse(score_generator(), media_type="text/event-stream")


def get_RAGAS_evaluation(question, rag_answer, context, gt_answer, llm, embeddings, metrics):

list_items = context.split(',')
list_items = [item.strip() for item in list_items]
d_eval = {
"question": [question],
"answer": [rag_answer],
"contexts": [list_items],
"ground_truth": [gt_answer]
}
d_eval_dataset = Dataset.from_dict(d_eval)
result = evaluate(d_eval_dataset, metrics=metrics,llm=llm, embeddings=embeddings)
print(result)
# Iterate over the scores
context_result = {}
for score in result.scores:
for m, value in score.items():
context_result[f"{m}"] = value

return context_result

@router.post("/run-scoring-RAGAS/")
async def run_scoring_RAGAS(request: ScoreRequest):
combined_results = request.combined_results

# RAGAS evaluation uses your own LLM and embeddings model
llm = ChatNVIDIA( model="meta/llama3-70b-instruct", temperature=0.2, max_tokens=300,)
embeddings = NVIDIAEmbeddings(model="nvidia/nv-embed-v1")
llm = LangchainLLMWrapper(langchain_llm=llm)
embeddings = LangchainEmbeddingsWrapper(embeddings)

score_columns = ['gt', 'textRAG', 'graphRAG', 'combinedRAG']
metrics = [answer_relevancy, context_precision]

async def score_generator():
for row in combined_results:
try:
res_gtRAG = get_RAGAS_evaluation(row['question'], row['gt_answer'], row['combined_RAG_context_response'], row['gt_answer'], llm, embeddings, metrics)
res_textRAG = get_RAGAS_evaluation(row['question'], row['textRAG_answer'], row['text_RAG_context_response'], row['gt_answer'], llm, embeddings, metrics)
res_graphRAG = get_RAGAS_evaluation(row['question'], row['graphRAG_answer'], row['graph_RAG_context_response'], row['gt_answer'], llm, embeddings, metrics)
res_combinedRAG = get_RAGAS_evaluation(row['question'], row['combined_answer'], row['combined_RAG_context_response'], row['gt_answer'], llm, embeddings, metrics)

for score_type, res in zip(score_columns, [res_gtRAG, res_textRAG, res_graphRAG, res_combinedRAG]):
if res:
for m in res:
row[f'{score_type}_{m}'] = res[m]
yield json.dumps(row) + "\n"
await asyncio.sleep(0.1) # Simulate processing delay
except Exception as e:
yield json.dumps({"error": str(e)}) + "\n"
break

return StreamingResponse(score_generator(), media_type="text/event-stream")


def get_llm_as_a_judge_scores(question, answer, llm, QA_PROMPT):

formatted_prompt = QA_PROMPT.format(question=question, answer=answer)
result = llm.invoke(formatted_prompt)
res = result.content
try:
match = re.search(r'Evaluation:(.*?)Total rating:', res, flags=re.DOTALL)
if match:
eval_res = match.group(1).strip()
else:
eval_res = "no match found"
match = re.search(r'Total rating:\s*(\d+(?:\.\d+)?)', res, flags=re.DOTALL)
if match:
eval_score = float(match.group(1).strip()) # Extract the first group and strip whitespace
else:
eval_score = 0.0
content_dict = {}
content_dict['llm_judge_evaluation'] = eval_res
content_dict['llm_judge_score'] = eval_score
return content_dict
except:
return None

@router.post("/run-scoring_llm_as_a_judge/")
async def run_scoring_llm_as_a_judge(request: ScoreRequest):
combined_results = request.combined_results

#
llm = ChatNVIDIA( model="meta/llama3-70b-instruct", temperature=0.2, max_tokens=300,)


QA_PROMPT = judge_prompt_template()

score_columns = ['gt', 'textRAG', 'graphRAG', 'combinedRAG']

async def score_generator():
for row in combined_results:
try:
res_gt = get_llm_as_a_judge_scores(row["question"], row["gt_answer"], llm, QA_PROMPT)
res_textRAG = get_llm_as_a_judge_scores(row["question"], row["textRAG_answer"], llm, QA_PROMPT)
res_graphRAG = get_llm_as_a_judge_scores(row["question"], row["graphRAG_answer"], llm, QA_PROMPT)
res_combinedRAG = get_llm_as_a_judge_scores(row["question"], row["combined_answer"], llm, QA_PROMPT)

for score_type, res in zip(score_columns, [res_gt, res_textRAG, res_graphRAG, res_combinedRAG]):
if res:
for k, val in res.items():
row[f'{score_type}_{k}'] = val
yield json.dumps(row) + "\n"
await asyncio.sleep(0.1) # Simulate processing delay
except Exception as e:
yield json.dumps({"error": str(e)}) + "\n"

return StreamingResponse(score_generator(), media_type="text/event-stream")
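
For reference, a minimal sketch of how the new RAGAS scoring path above can be exercised outside the FastAPI endpoint. The model names match those used in run_scoring_RAGAS; the question, answer, contexts, and ground-truth strings are illustrative placeholders, not values produced by this commit.

# Minimal sketch (not part of this commit): a one-row RAGAS evaluation using the
# same wrappers and metrics as get_RAGAS_evaluation() / run_scoring_RAGAS().
from datasets import Dataset
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings
from ragas import evaluate
from ragas.embeddings import LangchainEmbeddingsWrapper
from ragas.llms import LangchainLLMWrapper
from ragas.metrics import answer_relevancy, context_precision

# Wrap the NVIDIA endpoints so RAGAS can drive the judge LLM and embedding model.
llm = LangchainLLMWrapper(langchain_llm=ChatNVIDIA(model="meta/llama3-70b-instruct",
                                                   temperature=0.2, max_tokens=300))
embeddings = LangchainEmbeddingsWrapper(NVIDIAEmbeddings(model="nvidia/nv-embed-v1"))

# One evaluation row: question, generated answer, retrieved contexts, ground truth.
sample = Dataset.from_dict({
    "question": ["What does the combined RAG pipeline retrieve?"],            # illustrative
    "answer": ["It retrieves knowledge-base passages and graph triples."],    # illustrative
    "contexts": [["passage text ...", "entity -> relation -> entity"]],       # illustrative
    "ground_truth": ["Passages from the vector store plus graph triples."],   # illustrative
})

result = evaluate(sample, metrics=[answer_relevancy, context_precision],
                  llm=llm, embeddings=embeddings)
print(result.scores)  # list of per-row metric dicts, iterated the same way in the endpoint
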

4 changes: 2 additions & 2 deletions community/knowledge_graph_rag/backend/utils/lc_graph.py
@@ -33,7 +33,7 @@ def process_document(doc, llm):

def process_documents(directory, llm, update_progress=None,triplets=True, chunk_size=500, chunk_overlap=100):
with st.spinner("Loading and splitting documents"):
loader = DirectoryLoader(directory)
loader = DirectoryLoader(directory, glob="*.pdf")
raw_docs = loader.load()
text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
documents = text_splitter.split_documents(raw_docs)
@@ -42,7 +42,7 @@ def process_documents(directory, llm, update_progress=None,triplets=True, chunk_
document_data = [{"id": i, "content": doc.page_content} for i, doc in enumerate(documents)]
df = pd.DataFrame(document_data)
#df.to_csv('documents.csv', index=False)
df = pd.DataFrame(document_data)
# df = pd.DataFrame(document_data)

# Define the data directory and ensure it exists
data_directory = os.path.join(os.getcwd(), 'data')
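
The only functional change in lc_graph.py is the glob argument, which restricts loading to PDF files. A short sketch of the same pattern in isolation; the directory path is a placeholder and the import paths are assumed to match what lc_graph.py already uses.

# Sketch: load only PDFs from a directory and chunk them as process_documents() does.
from langchain_community.document_loaders import DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = DirectoryLoader("path/to/docs", glob="*.pdf")  # non-PDF files are skipped
raw_docs = loader.load()
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)
documents = splitter.split_documents(raw_docs)
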
38 changes: 36 additions & 2 deletions community/knowledge_graph_rag/backend/utils/preprocessor.py
@@ -18,6 +18,7 @@
import json
import ast
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain.prompts import PromptTemplate

if not os.environ.get("NVIDIA_API_KEY", "").startswith("nvapi-"):
nvapi_key = getpass.getpass("Enter your NVIDIA API key: ")
@@ -68,7 +69,7 @@ def extract_triples(text, llm):
Remember to conduct entity disambiguation, consolidating different phrases or acronyms that refer to the same entity (for instance, "MIT" and "Massachusetts Institute of Technology" should be unified as "MIT"). Simplify each entity of the triplet to be less than four words. However, always make sure it is a sensible entity name and not a single letter or NAN value.
From this text, your output Must be in python lis tof tuple with each tuple made up of ['h', 'type', 'r', 'o', 'type'], each element of the tuple is the string, where the relationship 'r' must be in the given relation verbs set above. Only output the list. As an Example, consider the following news excerpt:
From this text, your output Must be in python list of tuple with each tuple made up of ['h', 'type', 'r', 'o', 'type'], each element of the tuple is the string, where the relationship 'r' must be in the given relation verbs set above. Only output the list. As an Example, consider the following news excerpt:
Input :'Apple Inc. is set to introduce the new iPhone 14 in the technology sector this month. The product's release is likely to positively impact Apple's stock value.'
OUTPUT : ```
[('Apple Inc.', 'COMP', 'Introduce', 'iPhone 14', 'PRODUCT'),
@@ -80,7 +81,40 @@ def extract_triples(text, llm):
response = chain.invoke({"input": text})
print(response)
return process_response(response)


def judge_prompt_template():
judge_prompt = """
You will be given a user_question and system_answer couple.
Your task is to provide a 'total rating' scoring how well the system_answer answers the user concerns expressed in the user_question.
Give your answer on a scale of 1 to 4, where 1 means that the system_answer is not helpful at all, and 4 means that the system_answer completely and helpfully addresses the user_question.
Here is the scale you should use to build your answer:
1: The system_answer is terrible: completely irrelevant to the question asked, or very partial
2: The system_answer is mostly not helpful: misses some key aspects of the question
3: The system_answer is mostly helpful: provides support, but still could be improved
4: The system_answer is excellent: relevant, direct, detailed, and addresses all the concerns raised in the question
Provide your feedback as follows:
Feedback:::
Evaluation: (your rationale for the rating, as a text)
Total rating: (your rating, as a number between 1 and 4)
You MUST provide values for 'Evaluation:' and 'Total rating:' in your answer.
Now here are the question and answer.
Question: {question}
Answer: {answer}
Provide your feedback. If you give a correct rating, I'll tip you $200.
Feedback:::
Evaluation:
"""
prompt_template = PromptTemplate(input_variables=["question", "answer"], template=judge_prompt)

return prompt_template

def generate_qa_pair(text, llm):
prompt = ChatPromptTemplate.from_messages(
[("system", """You are a synthetic data generation model responsible for creating high quality question and answer pairs from text content provided to you. Given the paragraph as an input, create one high quality and highly complex question answer pair. The question should require a large portion of the context and multi-step advanced reasoning to answer. Make sure it is something a human may ask while reading this document. The answer should be highly detailed and comprehensive. Your output should be in a json format of one question answer pair. Restrict the question to the context information provided. Do not print anything else. The output MUST be JSON parseable."""), ("user", "{input}")])
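
The backend consumes judge_prompt_template() in get_llm_as_a_judge_scores() (see the evaluation.py diff above). A minimal hedged sketch of that flow, with illustrative question and answer strings; it assumes it runs from the backend package so the utils import resolves.

# Sketch (not part of this commit): format the judge prompt, invoke the LLM,
# and parse the 'Evaluation:' rationale and 'Total rating:' score from its reply.
import re
from langchain_nvidia_ai_endpoints import ChatNVIDIA
from utils.preprocessor import judge_prompt_template

llm = ChatNVIDIA(model="meta/llama3-70b-instruct", temperature=0.2, max_tokens=300)
QA_PROMPT = judge_prompt_template()

formatted_prompt = QA_PROMPT.format(
    question="What does the combined RAG pipeline return?",                 # illustrative
    answer="An answer grounded in retrieved passages and graph triples.",   # illustrative
)
res = llm.invoke(formatted_prompt).content

rationale = re.search(r"Evaluation:(.*?)Total rating:", res, flags=re.DOTALL)
rating = re.search(r"Total rating:\s*(\d+(?:\.\d+)?)", res)
print(rationale.group(1).strip() if rationale else "no match found")
print(float(rating.group(1)) if rating else 0.0)  # rating on the prompt's 1-4 scale
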
42 changes: 33 additions & 9 deletions community/knowledge_graph_rag/frontend/pages/evaluation.py
@@ -67,7 +67,8 @@ def has_pdf_files(directory):
return False

def app():
cwd = os.getcwd()
# cwd = os.getcwd()
cwd = os.path.abspath('../backend/')
directories = [d for d in os.listdir(cwd) if os.path.isdir(os.path.join(cwd, d)) and not d.startswith('.') and '__' not in d]
selected_dir = st.selectbox("Select a directory:", directories, index=0)
directory = os.path.join(cwd, selected_dir)
@@ -203,6 +204,8 @@ def app():
if os.path.exists(COMBINED_RESULTS_PATH):
with st.container():
st.markdown("### 4. Run comparative evals for saved Q&A data")
eval_select = st.selectbox("Choose evaluation type ? ", ("Nemotron", "RAGAS", "LLM-AS-A-JUDGE"),)
st.write("You selected: ", eval_select)
if st.button("Run Scoring"):
combined_results = pd.read_csv(COMBINED_RESULTS_PATH).to_dict(orient="records")
score_response = None
@@ -211,12 +214,33 @@ def app():
total_items = len(combined_results)
progress_bar = st.progress(0)


score_response = requests.post(
f"{BACKEND_URL}/evaluation/run-scoring/",
json={"combined_results": combined_results},
stream=True
)
if eval_select == "Nemotron":
st.write("evaluating with " + eval_select)
st.write("Evaluations will be done using: nemotron-4-340b-reward")
SCORE_FILE = "combined_results_with_scores_nemotron.csv"
score_response = requests.post(
f"{BACKEND_URL}/evaluation/run-scoring/",
json={"combined_results": combined_results},
stream=True
)
elif eval_select == "RAGAS":
st.write("evaluating with " + eval_select)
st.write("Evaluations will be done using: Llama3-70b-instruct LLM and nv-embed-v1 embedding model")
SCORE_FILE = "combined_results_with_scores_RAGAS.csv"
score_response = requests.post(
f"{BACKEND_URL}/evaluation/run-scoring-RAGAS/",
json={"combined_results": combined_results},
stream=True
)
elif eval_select == "LLM-AS-A-JUDGE":
st.write("evaluating with " + eval_select)
st.write("llama3-70b-instruct will be used as Judge")
SCORE_FILE = "combined_results_with_scores_LLM-AS-A-JUDGE.csv"
score_response = requests.post(
f"{BACKEND_URL}/evaluation/run-scoring_llm_as_a_judge/",
json={"combined_results": combined_results},
stream=True
)
if score_response.status_code == 200:
for index,line in enumerate(score_response.iter_lines()):
if line:
@@ -234,9 +258,9 @@ def app():
except json.JSONDecodeError:
st.error("Error decoding JSON response.")
# Success message displayed after processing all lines
st.success("Scoring completed and results saved to 'combined_results_with_scores.csv.")
st.success("Scoring completed and results saved to " + SCORE_FILE)
# Save the final results to a CSV file
COMBINED_RESULTS_PATH_WITH_SCORES=os.path.join(DATA_DIR, "combined_results_with_scores.csv")
COMBINED_RESULTS_PATH_WITH_SCORES=os.path.join(DATA_DIR, SCORE_FILE)
pd.DataFrame(results).to_csv(COMBINED_RESULTS_PATH_WITH_SCORES, index=False)
progress_bar.progress(100)
else: