Fciannella fixed no system prompt nvidia vlm (#267)
* multimodal retrieval by fciannella

* Fixed the images path in the README.md file

* Changed the location of the repository

* Removed non essential requirements

* Added env Variable for NVIDIA Text model
Removed comments from Gradio Interface
Removed stats collection from Gradio interface.

* fixed the system prompt issue with nv vlm
fciannella authored Jan 24, 2025
1 parent 12063d7 commit 5e2655f
Showing 11 changed files with 282 additions and 112 deletions.
43 changes: 34 additions & 9 deletions community/multimodal_retrieval/README.md
@@ -30,7 +30,9 @@ You can also launch langgraph with the containers with `langgraph up`, in that c

Run this command from the root of the repository (the one with the `langgraph.json` and `docker-compose.yml` files)

Install a venv (Python >= 3.11 is required):

```shell
python3 -m venv lg-venv
```

@@ -41,30 +43,38 @@
pip install -r requirements.txt

## Create the env files

You need to create two .env files (one for the docker compose and one for the langgraph agent).

The setup below gives you the option of using an NVIDIA text model for the purely text-based tasks.

For the Langgraph agent we keep the LLM set to OpenAI, as it currently provides better results with tool binding.
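
As a rough sketch (not code from this repository), the provider switch controlled by `TEXT_MODEL_PROVIDER` could look like the following, assuming the `langchain-nvidia-ai-endpoints` and `langchain-openai` packages are installed; the OpenAI model name is a placeholder.

```python
import os

from langchain_nvidia_ai_endpoints import ChatNVIDIA
from langchain_openai import ChatOpenAI


def get_text_llm():
    """Pick the text LLM from the env vars defined below (illustrative helper)."""
    if os.environ.get("TEXT_MODEL_PROVIDER") == "nvidia":
        # Uses NVIDIA_API_KEY and NVIDIA_TEXT_MODEL from the .env files.
        return ChatNVIDIA(model=os.environ["NVIDIA_TEXT_MODEL"])
    # The agent itself stays on OpenAI for tool binding; the model name here is a placeholder.
    return ChatOpenAI(model="gpt-4o")
```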


### .env

Create a .env file in the root directory of this repository (the one with the `langgraph.json` and `docker-compose.yml` files).

```shell
# .env
MONGO_INITDB_ROOT_USERNAME=admin
MONGO_INITDB_ROOT_PASSWORD=secret
MONGO_HOST=localhost
MONGO_PORT=27017
OPENAI_API_KEY=
LANGCHAIN_API_KEY=
LANGSMITH_API_KEY=
LANGGRAPH_CLOUD_LICENSE_KEY=
NVIDIA_API_KEY=
IMAGES_HOST=localhost
AGENTS_HOST=
AGENTS_PORT=2024
NVIDIA_VISION_MODEL=meta/llama-3.2-90b-vision-instruct
NVIDIA_TEXT_MODEL=meta/llama-3.3-70b-instruct
TEXT_MODEL_PROVIDER=nvidia
```

Normally LANGCHAIN_API_KEY and LANGSMITH_API_KEY have the same value.

AGENTS_HOST is the IP address of the host where you are running Docker, for instance the IP address of your PC.
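
If you are not sure which address to use, the following generic Linux command (not something from this repository) prints the host's primary IP:

```shell
hostname -I | awk '{print $1}'
```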

### .env.lg

@@ -77,14 +87,20 @@
```shell
MONGO_INITDB_ROOT_USERNAME=admin
MONGO_INITDB_ROOT_PASSWORD=secret
MONGO_HOST=localhost
MONGO_PORT=27017


OPENAI_API_KEY=
LANGCHAIN_API_KEY=
LANGSMITH_API_KEY=
LANGGRAPH_CLOUD_LICENSE_KEY=
NVIDIA_API_KEY=
IMAGES_HOST=localhost
AGENTS_HOST=localhost
AGENTS_PORT=2024

NVIDIA_VISION_MODEL=meta/llama-3.2-90b-vision-instruct
NVIDIA_TEXT_MODEL=meta/llama-3.3-70b-instruct
TEXT_MODEL_PROVIDER=nvidia

```
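
For a quick check outside of Docker, one way to load these values is with `python-dotenv`; this is only an illustrative, optional step, and the package is not necessarily listed in `requirements.txt`.

```python
import os

from dotenv import load_dotenv  # pip install python-dotenv

load_dotenv(".env.lg")  # make the agent-side settings visible to the current process

print(os.environ["TEXT_MODEL_PROVIDER"])  # nvidia
print(os.environ["NVIDIA_TEXT_MODEL"])    # meta/llama-3.3-70b-instruct
```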

# Launch the mongodb and gradio services
@@ -136,4 +152,13 @@ curl --request POST \
```


## Scaling the services

This solution can be scaled with a hierarchical approach that uses multiple long-context LLM calls.

![Scaling the Retrieval Solution](assets/hierarchical_approach.png)

The picture above illustrates the hierarchical approach with an example corpus of 1000 documents. The documents are divided into 10 groups of 100 documents each. In the first stage, the LLM generates the top 10 summaries for each group, yielding 100 candidate summaries in total. In the second stage, the LLM selects the top 10 of those 100 summaries. These 10 summaries lead to the 10 most relevant documents, from which the LLM retrieves an answer to the query. If the answer has to be read from an image, a VLM is used in this stage to process the visual content. A rough sketch of this flow is shown below.
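
The following is an illustrative sketch of that two-stage selection; the three callables passed in are hypothetical placeholders for the long-context LLM / VLM calls and are not functions from this repository.

```python
def hierarchical_retrieve(documents, question, summarize_and_rank, select_top_k,
                          answer_over_documents, group_size=100, top_k=10):
    """Two-stage selection over a large corpus (e.g. 1000 documents).

    The three callables are placeholders for the long-context LLM / VLM calls.
    """
    # Stage 1: split the corpus into groups and keep the top-k summaries per group.
    groups = [documents[i:i + group_size] for i in range(0, len(documents), group_size)]
    candidate_summaries = []
    for group in groups:
        candidate_summaries += summarize_and_rank(group, question, top_k)  # 10 per group

    # Stage 2: pick the overall top-k summaries from the pooled candidates (~100).
    best_summaries = select_top_k(candidate_summaries, question, top_k)

    # Resolve the winning summaries back to their source documents and answer the
    # query; a VLM handles this last step when the answer has to be read from an image.
    best_documents = [summary["source_document"] for summary in best_summaries]
    return answer_over_documents(best_documents, question)
```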



72 changes: 52 additions & 20 deletions community/multimodal_retrieval/agent.py
@@ -1,4 +1,6 @@
import os, json
import logging

from pathlib import Path
from datetime import datetime
from langchain_core.tools import tool
@@ -30,6 +32,17 @@
from nv_mm_document_qa.chain_full_collection import chain_document_expert



logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[
        logging.FileHandler("../app.log"),
        logging.StreamHandler()
    ]
)


# from fciannella_tme_ingest_docs.openai_parse_image import call_openai_api_for_image
nvidia_ngc_api_key = os.environ["NVIDIA_API_KEY"]

@@ -61,30 +74,49 @@ def call_image_processing_api(backend_llm, image_base64, system_template, questi
        presence_penalty=0,
    )

    messages = []

    if backend_llm == "nvidia":
        # NVIDIA VLM path: this commit's fix is to send only the human message
        # (question text plus image) with no system prompt.
        llm = llm_nvidia
        _question = f"Can you answer this question from the provided image: {question}"

        # print(image_base64)

        human_message = HumanMessage(
            content=[
                {"type": "text", "text": _question},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                },
            ]
        )

        messages = [human_message]

    elif backend_llm == "openai":
        # OpenAI path: keeps the system prompt ahead of the human message.
        llm = llm_openai
        system_message = SystemMessage(content=system_template)

        _question = f"Can you answer this question from the provided image: {question}"

        # print(image_base64)

        human_message = HumanMessage(
            content=[
                {"type": "text", "text": _question},
                {
                    "type": "image_url",
                    "image_url": {"url": f"data:image/png;base64,{image_base64}"},
                },
            ]
        )

        messages = [system_message, human_message]

    else:
        llm = None

    response = llm.invoke(
        messages
@@ -262,7 +294,9 @@ def __call__(self, state: State, config: RunnableConfig):
if result.tool_calls:
if result.tool_calls[0]["name"] == "query_document":
doc_id = result.tool_calls[0]["args"]["document_id"]
print(f"This is the doc id after querying the document: {doc_id}")

logging.info(f"This is the doc id after querying the document: {doc_id}")

state = {**state, "document_id": doc_id, "collection_name": collection_name, "images_host": images_host}
break
return {"messages": result, "document_id": doc_id}
@@ -284,7 +318,6 @@ def __call__(self, state: State, config: RunnableConfig):
]
)


_tools = [
query_document,
query_image,
Expand All @@ -295,8 +328,6 @@ def __call__(self, state: State, config: RunnableConfig):

part_1_assistant_runnable = primary_assistant_prompt | llm.bind_tools(_tools)



builder = StateGraph(State)

# Define nodes: these do the work
@@ -330,7 +361,8 @@ def main():

config = {
"configurable": {
"collection_name": "nvidia_eval_blogs_llama32",

"collection_name": "nvidia-docs",
"vision_model": "nvidia"
}
}
10 changes: 9 additions & 1 deletion community/multimodal_retrieval/docker-compose.yml
@@ -29,7 +29,11 @@ services:
MONGO_HOST: mongodb
MONGO_PORT: 27017
IMAGES_HOST: ${IMAGES_HOST}

NVIDIA_VISION_MODEL: ${NVIDIA_VISION_MODEL}
NVIDIA_TEXT_MODEL: ${NVIDIA_TEXT_MODEL}
TEXT_MODEL_PROVIDER: ${TEXT_MODEL_PROVIDER}
AGENTS_PORT: ${AGENTS_PORT}
AGENTS_HOST: ${AGENTS_HOST}

gradio-service:
build:
@@ -52,6 +56,10 @@
- MONGO_INITDB_ROOT_PASSWORD=${MONGO_INITDB_ROOT_PASSWORD}
- AGENTS_PORT=${AGENTS_PORT}
- AGENTS_HOST=${AGENTS_HOST}
- NVIDIA_VISION_MODEL=${NVIDIA_VISION_MODEL}
- NVIDIA_TEXT_MODEL=${NVIDIA_TEXT_MODEL}
- TEXT_MODEL_PROVIDER=${TEXT_MODEL_PROVIDER}


volumes:
mongodb-volume-nv-mm:
3 changes: 3 additions & 0 deletions community/multimodal_retrieval/gradio/Dockerfile
@@ -9,5 +9,8 @@ RUN pip install -r requirements.txt

COPY . .

RUN apt-get update
RUN apt-get install -y vim iputils-ping telnet net-tools

# Specify the Gradio script file
ENTRYPOINT python gradio/gradio_interface.py
69 changes: 28 additions & 41 deletions community/multimodal_retrieval/gradio/gradio_interface.py
@@ -4,12 +4,12 @@
from pymongo import MongoClient
from langgraph_sdk import get_client
from datetime import datetime
import subprocess

import asyncio # Import asyncio to run the async function


# MongoDB connection setup
images_host = os.environ["IMAGES_HOST"]
mongodb_user = os.environ["MONGO_INITDB_ROOT_USERNAME"]
mongodb_password = os.environ["MONGO_INITDB_ROOT_PASSWORD"]
@@ -18,6 +18,28 @@
db = client['tme_urls_db'] # Replace with your database name




def get_default_gateway():
    try:
        # Run the 'route -n' command
        result = subprocess.run(['route', '-n'], stdout=subprocess.PIPE, text=True)
        # Parse the output
        for line in result.stdout.splitlines():
            parts = line.split()
            # Look for the line with Destination as '0.0.0.0'
            if len(parts) > 1 and parts[0] == '0.0.0.0':
                gateway_ip = parts[1]  # The Gateway IP is the second column
                return gateway_ip
        return None
    except Exception as e:
        print(f"Error occurred: {e}")
        return None


agents_host = get_default_gateway()
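# The container's default gateway (typically the Docker host on a bridge network)
# is used as the agent host here, replacing the AGENTS_HOST environment variable.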


questions_list = [
"What is the Max-scale time-to-train records set on the NVIDIA platform and H100 Tensor Core GPUs in the case of Recommendation (DLRMv2) benchmark?",
"What is the Max-scale time-to-train records set on the NVIDIA platform and H100 Tensor Core GPUs in the case of Object detection, heavyweight (Mask R-CNN)?",
@@ -280,7 +302,10 @@ def format_statistics(statistics):

async def run_stream_service(collection_name, document_id, question, vision_model):
# Get the client from langgraph_sdk
# agents_host = os.environ["AGENTS_HOST"]
agents_host = get_default_gateway()

agents_port = os.environ["AGENTS_PORT"]
client = get_client(url=f"http://{agents_host}:{agents_port}")
thread = await client.threads.create()
@@ -464,23 +489,12 @@ async def run_and_compute_statistics(collection_name, document_id, question, vis

stream_button = gr.Button("Run Stream Service")
stream_output_answer = gr.Markdown() # Remove the label argument
# gr.Markdown("---")
# stream_output_stats = gr.Markdown() # Remove the label argument


def update_collection_dropdown_stream():
return gr.update(choices=get_collection_names()) # Ensure get_collection_names() returns a list


# async def run_and_compute_statistics(collection_name, question, vision_model):
# # Return the vision model to the output field for testing
# return f"Vision Model: {vision_model}", ""


async def run_and_compute_statistics(collection_name, question, vision_model):
# print(f"Collection Name: {collection_name}")
# print(f"Question: {question}")
# print(f"Vision Model: {vision_model}")

final_answer, run_id = await run_stream_service(collection_name, None, question, vision_model)

@@ -521,10 +535,6 @@ async def run_and_compute_statistics(collection_name, question, vision_model):

stream_button = gr.Button("Run QA Agent")
stream_output_answer = gr.Markdown(value="") # Remove the label argument
# gr.Markdown("---")
# stream_output_stats = gr.Markdown(value="") # Remove the label argument

# Function to update the dropdown when clicked

def update_collection_dropdown_stream():
return gr.update(choices=get_collection_names()) # Ensure get_collection_names() returns a list
@@ -534,12 +544,6 @@ async def run_and_compute_statistics(collection_name, question, vision_model):
print(collection_name, question, vision_model)
final_answer, run_id = await run_stream_service(collection_name, None, question, vision_model)

# Introduce a delay of 3 seconds (adjust as needed)
await asyncio.sleep(3)

# Now compute the statistics after the delay
statistics = compute_statistics(run_id)

# return final_answer, statistics
return final_answer
# When a question is selected, automatically fill the answer box
@@ -550,21 +554,6 @@ async def run_and_compute_statistics(collection_name, question, vision_model):
outputs=[answer_box, url_box]
)

# question_dropdown_stream.change(
# fn=lambda question: update_answer_dropdown(questions_list.index(question)),
# inputs=[question_dropdown_stream],
# outputs=[answer_box]
# )

def update_placeholders():
return "Processing your request...", "Computing statistics..."

stream_button.click(
fn=update_placeholders,
inputs=[],
outputs=[stream_output_answer],
queue=False # Ensure this update happens immediately without waiting
)
# Trigger both the streaming service and stats calculation when the button is clicked
stream_button.click(
fn=run_and_compute_statistics,
@@ -582,8 +571,6 @@ def update_placeholders():
collection_dropdown = gr.Dropdown(label="Select Collection", choices=[], interactive=True)
document_dropdown = gr.Dropdown(label="Select Document URL", choices=[])

# Add a dropdown for vision model selection
# vision_model_dropdown = gr.Dropdown(label="Select Vision Model", choices=["openai", "nvidia"])

vision_model_dropdown = gr.Dropdown(
label="Select Vision Model",
@@ -594,7 +581,7 @@

generate_button = gr.Button("Generate SDG QA Pair")

# _qa_output = gr.Textbox(label="Response") # Removed the 'label' argument

qa_output = gr.Markdown(label="Response") # Removed the 'label' argument

