From cc332f8595f32b6df987f6e49bd474951b0bb1a6 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 3 Sep 2025 19:22:47 +0000 Subject: [PATCH 1/6] First draft of RAG library mode integration --- examples/rag_library_mode/pyproject.toml | 28 +++ .../src/rag_library_mode/__init__.py | 0 .../src/rag_library_mode/configs/config.yml | 29 +++ .../docker-compose-ingestor-server.yaml | 211 ++++++++++++++++++ .../deploy/docker-compose-rag-server.yaml | 186 +++++++++++++++ .../src/rag_library_mode/deploy/vectordb.yaml | 102 +++++++++ 6 files changed, 556 insertions(+) create mode 100644 examples/rag_library_mode/pyproject.toml create mode 100644 examples/rag_library_mode/src/rag_library_mode/__init__.py create mode 100644 examples/rag_library_mode/src/rag_library_mode/configs/config.yml create mode 100644 examples/rag_library_mode/src/rag_library_mode/deploy/docker-compose-ingestor-server.yaml create mode 100644 examples/rag_library_mode/src/rag_library_mode/deploy/docker-compose-rag-server.yaml create mode 100644 examples/rag_library_mode/src/rag_library_mode/deploy/vectordb.yaml diff --git a/examples/rag_library_mode/pyproject.toml b/examples/rag_library_mode/pyproject.toml new file mode 100644 index 000000000..cab960dea --- /dev/null +++ b/examples/rag_library_mode/pyproject.toml @@ -0,0 +1,28 @@ +[build-system] +build-backend = "setuptools.build_meta" +requires = ["setuptools >= 64", "setuptools-scm>=8"] + +[tool.setuptools_scm] +# NAT uses the --first-parent flag to avoid tags from previous releases which have been merged into the develop branch +# from causing an unexpected version change. This can be safely removed if developing outside of the NAT repository. +git_describe_command = "git describe --long --first-parent" +root = "../.." + +[project] +name = "rag_library_mode" +dynamic = ["version"] +dependencies = [ + "nvidia-nat[langchain]~=1.3", +] +requires-python = ">=3.11,<3.13" +description = "Custom NeMo Agent Toolkit Workflow" +classifiers = ["Programming Language :: Python"] + +[tool.uv.sources] +nvidia-nat = { path = "../..", editable = true } + +[project.entry-points.'nat.plugins'] +rag_library_mode = "rag_library_mode.register" + +[project.entry-points.'nat.components'] +rag_library_mode = "rag_library_mode.register" \ No newline at end of file diff --git a/examples/rag_library_mode/src/rag_library_mode/__init__.py b/examples/rag_library_mode/src/rag_library_mode/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/rag_library_mode/src/rag_library_mode/configs/config.yml b/examples/rag_library_mode/src/rag_library_mode/configs/config.yml new file mode 100644 index 000000000..768c99859 --- /dev/null +++ b/examples/rag_library_mode/src/rag_library_mode/configs/config.yml @@ -0,0 +1,29 @@ +general: + use_uvloop: true + + +functions: + rag_tool: + _type: rag_library_mode + base_url: "http://localhost:19530" + vdb_top_k: 10 + reranker_top_k: 100 + collection_names: ["cuda_docs"] + topic: Retrieve relevant documents from the database relevant to the query + + +llms: + nim_llm: + _type: nim + model_name: meta/llama-3.3-70b-instruct + temperature: 0 + max_tokens: 4096 + top_p: 1 + + +workflow: + _type: tool_calling_agent + tool_names: + - rag_tool + verbose: true + llm_name: nim_llm diff --git a/examples/rag_library_mode/src/rag_library_mode/deploy/docker-compose-ingestor-server.yaml b/examples/rag_library_mode/src/rag_library_mode/deploy/docker-compose-ingestor-server.yaml new file mode 100644 index 000000000..f13ab4a83 --- /dev/null +++ 
b/examples/rag_library_mode/src/rag_library_mode/deploy/docker-compose-ingestor-server.yaml @@ -0,0 +1,211 @@ +services: + + # Main ingestor server which is responsible for ingestion + ingestor-server: + container_name: ingestor-server + image: nvcr.io/nvstaging/blueprint/ingestor-server:${TAG:-2.3.0.rc0} + build: + # Set context to repo's root directory + context: ../../ + dockerfile: ./src/nvidia_rag/ingestor_server/Dockerfile + # start the server on port 8082 with 4 workers for improved latency on concurrent requests. + command: --port 8082 --host 0.0.0.0 --workers 1 + + volumes: + # Mount the prompt.yaml file to the container, path should be absolute + - ${PROMPT_CONFIG_FILE}:${PROMPT_CONFIG_FILE} + + # Common customizations to the pipeline can be controlled using env variables + environment: + # Path to example directory relative to root + EXAMPLE_PATH: 'src/nvidia_rag/ingestor_server' + + # Absolute path to custom prompt.yaml file + PROMPT_CONFIG_FILE: ${PROMPT_CONFIG_FILE:-/prompt.yaml} + + ##===Vector DB specific configurations=== + # URL on which vectorstore is hosted + # For custom operators, point to your service (e.g., http://your-custom-vdb:1234) + APP_VECTORSTORE_URL: ${APP_VECTORSTORE_URL:-http://milvus:19530} + # Type of vectordb used to store embedding. Supported built-ins: "milvus", "elasticsearch". + # You can also provide your custom value (e.g., "your_custom_vdb") when you register it in `_get_vdb_op`. + APP_VECTORSTORE_NAME: ${APP_VECTORSTORE_NAME:-"milvus"} + # Type of vectordb search to be used + APP_VECTORSTORE_SEARCHTYPE: ${APP_VECTORSTORE_SEARCHTYPE:-"dense"} # Can be dense or hybrid + # Boolean to enable GPU index for milvus vectorstore specific to nvingest + APP_VECTORSTORE_ENABLEGPUINDEX: ${APP_VECTORSTORE_ENABLEGPUINDEX:-True} + # Boolean to control GPU search for milvus vectorstore specific to nvingest + APP_VECTORSTORE_ENABLEGPUSEARCH: ${APP_VECTORSTORE_ENABLEGPUSEARCH:-True} + # vectorstore collection name to store embeddings + COLLECTION_NAME: ${COLLECTION_NAME:-multimodal_data} + + ##===MINIO specific configurations=== + MINIO_ENDPOINT: "minio:9010" + MINIO_ACCESSKEY: "minioadmin" + MINIO_SECRETKEY: "minioadmin" + + NGC_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"} + NVIDIA_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"} + + ##===Embedding Model specific configurations=== + # url on which embedding model is hosted. 
If "", Nvidia hosted API is used + APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000"} + APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2} + # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1) + # APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-vlm-embedding-ms:8000"} + # APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1} + APP_EMBEDDINGS_DIMENSIONS: ${APP_EMBEDDINGS_DIMENSIONS:-2048} + + ##===NV-Ingest Connection Configurations======= + APP_NVINGEST_MESSAGECLIENTHOSTNAME: ${APP_NVINGEST_MESSAGECLIENTHOSTNAME:-"nv-ingest-ms-runtime"} + APP_NVINGEST_MESSAGECLIENTPORT: ${APP_NVINGEST_MESSAGECLIENTPORT:-7670} + + ##===NV-Ingest Extract Configurations========== + APP_NVINGEST_EXTRACTTEXT: ${APP_NVINGEST_EXTRACTTEXT:-True} + APP_NVINGEST_EXTRACTINFOGRAPHICS: ${APP_NVINGEST_EXTRACTINFOGRAPHICS:-False} + APP_NVINGEST_EXTRACTTABLES: ${APP_NVINGEST_EXTRACTTABLES:-True} + APP_NVINGEST_EXTRACTCHARTS: ${APP_NVINGEST_EXTRACTCHARTS:-True} + APP_NVINGEST_EXTRACTIMAGES: ${APP_NVINGEST_EXTRACTIMAGES:-False} + APP_NVINGEST_EXTRACTPAGEASIMAGE: ${APP_NVINGEST_EXTRACTPAGEASIMAGE:-False} + APP_NVINGEST_STRUCTURED_ELEMENTS_MODALITY: ${APP_NVINGEST_STRUCTURED_ELEMENTS_MODALITY:-""} # Select from "image", "text_image" + APP_NVINGEST_IMAGE_ELEMENTS_MODALITY: ${APP_NVINGEST_IMAGE_ELEMENTS_MODALITY:-""} # Select from "image" + APP_NVINGEST_PDFEXTRACTMETHOD: ${APP_NVINGEST_PDFEXTRACTMETHOD:-None} # Select from pdfium, nemoretriever_parse, None + # Extract text by "page" only recommended for documents with pages like .pdf, .docx, etc. + APP_NVINGEST_TEXTDEPTH: ${APP_NVINGEST_TEXTDEPTH:-page} # extract by "page" or "document" + + ##===NV-Ingest Splitting Configurations======== + APP_NVINGEST_CHUNKSIZE: ${APP_NVINGEST_CHUNKSIZE:-512} + APP_NVINGEST_CHUNKOVERLAP: ${APP_NVINGEST_CHUNKOVERLAP:-150} + APP_NVINGEST_ENABLEPDFSPLITTER: ${APP_NVINGEST_ENABLEPDFSPLITTER:-True} + APP_NVINGEST_SEGMENTAUDIO: ${APP_NVINGEST_SEGMENTAUDIO:-False} # Enable audio segmentation for NV Ingest + + ##===NV-Ingest Caption Model configurations==== + APP_NVINGEST_CAPTIONMODELNAME: ${APP_NVINGEST_CAPTIONMODELNAME:-"nvidia/llama-3.1-nemotron-nano-vl-8b-v1"} + # Incase of nvidia-hosted caption model, use the endpoint url as - https://integrate.api.nvidia.com/v1 + APP_NVINGEST_CAPTIONENDPOINTURL: ${APP_NVINGEST_CAPTIONENDPOINTURL:-"http://vlm-ms:8000/v1/chat/completions"} + + # Choose whether to store the extracted content in the vector store for citation support + ENABLE_CITATIONS: ${ENABLE_CITATIONS:-True} + + # Choose the summary model to use for document summary + SUMMARY_LLM: ${SUMMARY_LLM:-nvidia/llama-3_3-nemotron-super-49b-v1_5} + SUMMARY_LLM_SERVERURL: ${SUMMARY_LLM_SERVERURL-"nim-llm:8000"} + SUMMARY_LLM_MAX_CHUNK_LENGTH: ${SUMMARY_LLM_MAX_CHUNK_LENGTH:-50000} + SUMMARY_CHUNK_OVERLAP: ${SUMMARY_CHUNK_OVERLAP:-200} + # Log level for server, supported level NOTSET, DEBUG, INFO, WARN, ERROR, CRITICAL + LOGLEVEL: ${LOGLEVEL:-INFO} + + # [Optional] Redis configuration for task status and result storage + REDIS_HOST: ${REDIS_HOST:-redis} + REDIS_PORT: ${REDIS_PORT:-6379} + REDIS_DB: ${REDIS_DB:-0} + + # Bulk upload to MinIO + ENABLE_MINIO_BULK_UPLOAD: ${ENABLE_MINIO_BULK_UPLOAD:-True} + TEMP_DIR: ${TEMP_DIR:-/tmp-data} + + # NV-Ingest Batch Mode Configurations + NV_INGEST_FILES_PER_BATCH: ${NV_INGEST_FILES_PER_BATCH:-16} + NV_INGEST_CONCURRENT_BATCHES: ${NV_INGEST_CONCURRENT_BATCHES:-4} + + 
ports: + - "8082:8082" + expose: + - "8082" + shm_size: 5gb + + redis: + image: "redis/redis-stack:7.2.0-v18" + ports: + - "6379:6379" + + nv-ingest-ms-runtime: + image: nvcr.io/nvstaging/nim/nv-ingest:25.8.0-RC6 + cpuset: "0-15" + volumes: + - ${DATASET_ROOT:-./data}:/workspace/data + ports: + # HTTP API + - "7670:7670" + # Simple Broker + - "7671:7671" + cap_add: + - sys_nice + environment: + # Audio model not used in this RAG version + - AUDIO_GRPC_ENDPOINT=audio:50051 + - AUDIO_INFER_PROTOCOL=grpc + - CUDA_VISIBLE_DEVICES=0 + - MAX_INGEST_PROCESS_WORKERS=${MAX_INGEST_PROCESS_WORKERS:-16} + - EMBEDDING_NIM_MODEL_NAME=${EMBEDDING_NIM_MODEL_NAME:-${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2}} + # Incase of self-hosted embedding model, use the endpoint url as - https://integrate.api.nvidia.com/v1 + - EMBEDDING_NIM_ENDPOINT=${EMBEDDING_NIM_ENDPOINT:-${APP_EMBEDDINGS_SERVERURL-http://nemoretriever-embedding-ms:8000/v1}} + # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1) + # - EMBEDDING_NIM_MODEL_NAME=${EMBEDDING_NIM_MODEL_NAME:-${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1}} + # - EMBEDDING_NIM_ENDPOINT=${EMBEDDING_NIM_ENDPOINT:-${APP_EMBEDDINGS_SERVERURL-http://nemoretriever-vlm-embedding-ms:8000/v1}} + - INGEST_LOG_LEVEL=WARNING + - INGEST_RAY_LOG_LEVEL=PRODUCTION + - INGEST_EDGE_BUFFER_SIZE=64 + - INGEST_DYNAMIC_MEMORY_THRESHOLD=0.8 + - INGEST_DISABLE_DYNAMIC_SCALING=${INGEST_DISABLE_DYNAMIC_SCALING:-True} + - INSTALL_AUDIO_EXTRACTION_DEPS=true + # Message client for development + #- MESSAGE_CLIENT_HOST=0.0.0.0 + #- MESSAGE_CLIENT_PORT=7671 + #- MESSAGE_CLIENT_TYPE=simple # Configure the ingest service to use the simple broker + # Message client for production + - MESSAGE_CLIENT_HOST=redis + - MESSAGE_CLIENT_PORT=6379 + - MESSAGE_CLIENT_TYPE=redis + - MINIO_BUCKET=${MINIO_BUCKET:-nv-ingest} + - MRC_IGNORE_NUMA_CHECK=1 + - NEMORETRIEVER_PARSE_HTTP_ENDPOINT=${NEMORETRIEVER_PARSE_HTTP_ENDPOINT:-http://nemoretriever-parse:8000/v1/chat/completions} + - NEMORETRIEVER_PARSE_INFER_PROTOCOL=${NEMORETRIEVER_PARSE_INFER_PROTOCOL:-http} + - NEMORETRIEVER_PARSE_MODEL_NAME=${NEMORETRIEVER_PARSE_MODEL_NAME:-nvidia/nemoretriever-parse} + - NVIDIA_API_KEY=${NVIDIA_API_KEY:-nvidiaapikey} + - NGC_API_KEY=${NGC_API_KEY:-nvidiaapikey} + - NVIDIA_BUILD_API_KEY=${NGC_API_KEY:-nvidiaapikey} + - NV_INGEST_MAX_UTIL=${NV_INGEST_MAX_UTIL:-48} + - OTEL_EXPORTER_OTLP_ENDPOINT=otel-collector:4317 + # Self-hosted ocr endpoints. + - OCR_GRPC_ENDPOINT=${OCR_GRPC_ENDPOINT:-${PADDLE_GRPC_ENDPOINT:-paddle:8001}} + - OCR_HTTP_ENDPOINT=${OCR_HTTP_ENDPOINT:-${PADDLE_HTTP_ENDPOINT:-http://paddle:8000/v1/infer}} + - OCR_INFER_PROTOCOL=${OCR_INFER_PROTOCOL:-${PADDLE_INFER_PROTOCOL:-grpc}} + - OCR_MODEL_NAME=${OCR_MODEL_NAME:-paddle} + # build.nvidia.com hosted ocr endpoints. + #- OCR_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/baidu/paddleocr + #- OCR_INFER_PROTOCOL=http + - READY_CHECK_ALL_COMPONENTS=False + - REDIS_MORPHEUS_TASK_QUEUE=morpheus_task_queue + # Self-hosted redis endpoints. + # build.nvidia.com hosted yolox endpoints. + #- YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2 + #- YOLOX_INFER_PROTOCOL=http + - YOLOX_GRPC_ENDPOINT=${YOLOX_GRPC_ENDPOINT:-page-elements:8001} + - YOLOX_HTTP_ENDPOINT=${YOLOX_HTTP_ENDPOINT:-http://page-elements:8000/v1/infer} + - YOLOX_INFER_PROTOCOL=${YOLOX_INFER_PROTOCOL:-grpc} + # build.nvidia.com hosted yolox-graphics-elements endpoints. 
+ #- YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1 + #- YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=http + - YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT=${YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT:-graphic-elements:8001} + - YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=${YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT:-http://graphic-elements:8000/v1/infer} + - YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=${YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL:-grpc} + # build.nvidia.com hosted yolox-table-elements endpoints. + #- YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1 + #- YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=http + - YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT=${YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT:-table-structure:8001} + - YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=${YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT:-http://table-structure:8000/v1/infer} + - YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=${YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL:-grpc} + # Incase of nvidia-hosted caption model, use the endpoint url as - https://integrate.api.nvidia.com/v1/chat/completions + - VLM_CAPTION_ENDPOINT=${VLM_CAPTION_ENDPOINT:-http://vlm-ms:8000/v1/chat/completions} + - VLM_CAPTION_MODEL_NAME=${VLM_CAPTION_MODEL_NAME:-nvidia/llama-3.1-nemotron-nano-vl-8b-v1} + - MODEL_PREDOWNLOAD_PATH=${MODEL_PREDOWNLOAD_PATH:-/workspace/models/} + healthcheck: + test: curl --fail http://nv-ingest-ms-runtime:7670/v1/health/ready || exit 1 + interval: 10s + timeout: 5s + retries: 20 + +networks: + default: + name: nvidia-rag diff --git a/examples/rag_library_mode/src/rag_library_mode/deploy/docker-compose-rag-server.yaml b/examples/rag_library_mode/src/rag_library_mode/deploy/docker-compose-rag-server.yaml new file mode 100644 index 000000000..334623c04 --- /dev/null +++ b/examples/rag_library_mode/src/rag_library_mode/deploy/docker-compose-rag-server.yaml @@ -0,0 +1,186 @@ +services: + + # Main orchestrator server which stiches together all calls to different services to fulfill the user request + rag-server: + container_name: rag-server + image: nvcr.io/nvstaging/blueprint/rag-server:${TAG:-2.3.0.rc0} + build: + # Set context to repo's root directory + context: ../../ + dockerfile: src/nvidia_rag/rag_server/Dockerfile + # start the server on port 8081 with 8 workers for improved latency on concurrent requests. + command: --port 8081 --host 0.0.0.0 --workers 8 + volumes: + # Mount the prompt.yaml file to the container, path should be absolute + - ${PROMPT_CONFIG_FILE}:${PROMPT_CONFIG_FILE} + # Common customizations to the pipeline can be controlled using env variables + environment: + # Path to example directory relative to root + EXAMPLE_PATH: './nvidia_rag/rag_server' + + # Absolute path to custom prompt.yaml file + PROMPT_CONFIG_FILE: ${PROMPT_CONFIG_FILE:-/prompt.yaml} + + ##===MINIO specific configurations which is used to store the multimodal base64 content=== + MINIO_ENDPOINT: "minio:9010" + MINIO_ACCESSKEY: "minioadmin" + MINIO_SECRETKEY: "minioadmin" + + ##===Vector DB specific configurations=== + # URL on which vectorstore is hosted + # For custom operators, point to your service (e.g., http://your-custom-vdb:1234) + APP_VECTORSTORE_URL: ${APP_VECTORSTORE_URL:-http://milvus:19530} + # Type of vectordb used to store embedding. Supported built-ins: "milvus", "elasticsearch". + # You can also provide your custom value (e.g., "your_custom_vdb") when you register it in `_get_vdb_op`. 
+ APP_VECTORSTORE_NAME: ${APP_VECTORSTORE_NAME:-"milvus"} + # Type of index to be used for vectorstore + APP_VECTORSTORE_INDEXTYPE: ${APP_VECTORSTORE_INDEXTYPE:-"GPU_CAGRA"} + # Type of vectordb search to be used + APP_VECTORSTORE_SEARCHTYPE: ${APP_VECTORSTORE_SEARCHTYPE:-"dense"} # Can be dense or hybrid + # vectorstore collection name to store embeddings + COLLECTION_NAME: ${COLLECTION_NAME:-multimodal_data} + APP_RETRIEVER_SCORETHRESHOLD: 0.25 + # Top K from vector DB, which goes as input to reranker model if enabled, else goes to LLM prompt + VECTOR_DB_TOPK: ${VECTOR_DB_TOPK:-100} + + ##===LLM Model specific configurations=== + APP_LLM_MODELNAME: ${APP_LLM_MODELNAME:-"nvidia/llama-3_3-nemotron-super-49b-v1_5"} + # url on which llm model is hosted. If "", Nvidia hosted API is used + APP_LLM_SERVERURL: ${APP_LLM_SERVERURL-"nim-llm:8000"} + + ##===Query Rewriter Model specific configurations=== + APP_QUERYREWRITER_MODELNAME: ${APP_QUERYREWRITER_MODELNAME:-"meta/llama-3.1-8b-instruct"} + # url on which query rewriter model is hosted. If "", Nvidia hosted API is used + APP_QUERYREWRITER_SERVERURL: ${APP_QUERYREWRITER_SERVERURL-"nim-llm-llama-8b:8000"} + + ##===Filter Expression Generator Model specific configurations=== + APP_FILTEREXPRESSIONGENERATOR_MODELNAME: ${APP_FILTEREXPRESSIONGENERATOR_MODELNAME:-"nvidia/llama-3_3-nemotron-super-49b-v1_5"} + # url on which filter expression generator model is hosted. If "", Nvidia hosted API is used + APP_FILTEREXPRESSIONGENERATOR_SERVERURL: ${APP_FILTEREXPRESSIONGENERATOR_SERVERURL-"nim-llm:8000"} + # enable filter expression generator for natural language to filter expression conversion + ENABLE_FILTER_GENERATOR: ${ENABLE_FILTER_GENERATOR:-False} + + ##===Embedding Model specific configurations=== + # url on which embedding model is hosted. If "", Nvidia hosted API is used + APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000"} + APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2} + # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1) + # APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-vlm-embedding-ms:8000"} + # APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1} + + ##===Reranking Model specific configurations=== + # url on which ranking model is hosted. 
If "", Nvidia hosted API is used + APP_RANKING_SERVERURL: ${APP_RANKING_SERVERURL-"nemoretriever-ranking-ms:8000"} + APP_RANKING_MODELNAME: ${APP_RANKING_MODELNAME:-"nvidia/llama-3.2-nv-rerankqa-1b-v2"} + ENABLE_RERANKER: ${ENABLE_RERANKER:-True} + # Default confidence threshold for filtering documents by reranker relevance scores (0.0 to 1.0) + RERANKER_CONFIDENCE_THRESHOLD: ${RERANKER_CONFIDENCE_THRESHOLD:-0.0} + + ##===VLM Model specific configurations=== + ENABLE_VLM_INFERENCE: ${ENABLE_VLM_INFERENCE:-false} + # Reasoning gate on VLM response: off by default; enable to mitigate incorrect VLM outputs + ENABLE_VLM_RESPONSE_REASONING: ${ENABLE_VLM_RESPONSE_REASONING:-false} + # Max images sent to VLM per request (query + context) + APP_VLM_MAX_TOTAL_IMAGES: ${APP_VLM_MAX_TOTAL_IMAGES:-4} + # Max number of query images to include in VLM input + APP_VLM_MAX_QUERY_IMAGES: ${APP_VLM_MAX_QUERY_IMAGES:-1} + # Max number of context images to include in VLM input + APP_VLM_MAX_CONTEXT_IMAGES: ${APP_VLM_MAX_CONTEXT_IMAGES:-1} + APP_VLM_SERVERURL: ${APP_VLM_SERVERURL-"http://vlm-ms:8000/v1"} + APP_VLM_MODELNAME: ${APP_VLM_MODELNAME:-"nvidia/llama-3.1-nemotron-nano-vl-8b-v1"} + + NVIDIA_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"} + + # Number of document chunks to insert in LLM prompt, used only when ENABLE_RERANKER is set to True + APP_RETRIEVER_TOPK: ${APP_RETRIEVER_TOPK:-10} + + # Log level for server, supported level NOTSET, DEBUG, INFO, WARN, ERROR, CRITICAL + LOGLEVEL: ${LOGLEVEL:-INFO} + + # enable multi-turn conversation in the rag chain - this controls conversation history usage + # while doing query rewriting and in LLM prompt + ENABLE_MULTITURN: ${ENABLE_MULTITURN:-True} + + # enable query rewriting for multiturn conversation in the rag chain. 
+ # This will improve accuracy of the retrieiver pipeline but increase latency due to an additional LLM call + ENABLE_QUERYREWRITER: ${ENABLE_QUERYREWRITER:-False} + + # Choose whether to enable citations in the response + ENABLE_CITATIONS: ${ENABLE_CITATIONS:-True} + + # Choose whether to enable/disable guardrails + ENABLE_GUARDRAILS: ${ENABLE_GUARDRAILS:-False} + + # NeMo Guardrails URL when ENABLE_GUARDRAILS is true + NEMO_GUARDRAILS_URL: ${NEMO_GUARDRAILS_URL:-nemo-guardrails-microservice:7331} + + # number of last n chat messages to consider from the provided conversation history + CONVERSATION_HISTORY: 5 + + # Tracing + APP_TRACING_ENABLED: "False" + # HTTP endpoint + APP_TRACING_OTLPHTTPENDPOINT: http://otel-collector:4318/v1/traces + # GRPC endpoint + APP_TRACING_OTLPGRPCENDPOINT: grpc://otel-collector:4317 + + # Choose whether to enable source metadata in document content during generation + ENABLE_SOURCE_METADATA: ${ENABLE_SOURCE_METADATA:-true} + + # Whether to filter content within tags in model responses + FILTER_THINK_TOKENS: ${FILTER_THINK_TOKENS:-true} + + # Whether to enable thinking in the rag chain for llama-3.3-nemotron-super-49b model + ENABLE_NEMOTRON_THINKING: ${ENABLE_NEMOTRON_THINKING:-false} + + # enable reflection (context relevance and response groundedness checking) in the rag chain + ENABLE_REFLECTION: ${ENABLE_REFLECTION:-false} + # Maximum number of context relevance loop iterations + MAX_REFLECTION_LOOP: ${MAX_REFLECTION_LOOP:-3} + # Minimum relevance score threshold (0-2) + CONTEXT_RELEVANCE_THRESHOLD: ${CONTEXT_RELEVANCE_THRESHOLD:-1} + # Minimum groundedness score threshold (0-2) + RESPONSE_GROUNDEDNESS_THRESHOLD: ${RESPONSE_GROUNDEDNESS_THRESHOLD:-1} + # reflection llm + REFLECTION_LLM: ${REFLECTION_LLM:-"nvidia/llama-3_3-nemotron-super-49b-v1_5"} + # reflection llm server url. 
If "", Nvidia hosted API is used + REFLECTION_LLM_SERVERURL: ${REFLECTION_LLM_SERVERURL-"nim-llm:8000"} + # enable iterative query decomposition + ENABLE_QUERY_DECOMPOSITION: ${ENABLE_QUERY_DECOMPOSITION:-false} + # maximum recursion depth for iterative query decomposition + MAX_RECURSION_DEPTH: ${MAX_RECURSION_DEPTH:-3} + + ports: + - "8081:8081" + expose: + - "8081" + shm_size: 5gb + + # Sample UI container which interacts with APIs exposed by rag-server container + rag-playground: + container_name: rag-playground + image: nvcr.io/nvstaging/blueprint/rag-playground:${TAG:-2.3.0.rc0} + build: + # Set context to repo's root directory + context: ../../frontend + dockerfile: ./Dockerfile + args: + # Environment variables for Vite build + VITE_API_CHAT_URL: ${VITE_API_CHAT_URL:-http://rag-server:8081/v1} + VITE_API_VDB_URL: ${VITE_API_VDB_URL:-http://ingestor-server:8082/v1} + VITE_MILVUS_URL: http://milvus:19530 + ports: + - "8090:3000" + expose: + - "3000" + environment: + # Runtime environment variables for Vite + VITE_API_CHAT_URL: ${VITE_API_CHAT_URL:-http://rag-server:8081/v1} + VITE_API_VDB_URL: ${VITE_API_VDB_URL:-http://ingestor-server:8082/v1} + VITE_MILVUS_URL: http://milvus:19530 + depends_on: + - rag-server + +networks: + default: + name: nvidia-rag diff --git a/examples/rag_library_mode/src/rag_library_mode/deploy/vectordb.yaml b/examples/rag_library_mode/src/rag_library_mode/deploy/vectordb.yaml new file mode 100644 index 000000000..ed9bf8403 --- /dev/null +++ b/examples/rag_library_mode/src/rag_library_mode/deploy/vectordb.yaml @@ -0,0 +1,102 @@ +services: + + # Milvus can be made GPU accelerated by uncommenting the lines as specified below + milvus: + container_name: milvus-standalone + image: milvusdb/milvus:${MILVUS_VERSION:-v2.6.0-gpu} # milvusdb/milvus:v2.6.0 for CPU + command: ["milvus", "run", "standalone"] + environment: + ETCD_ENDPOINTS: etcd:2379 + MINIO_ADDRESS: minio:9010 + KNOWHERE_GPU_MEM_POOL_SIZE: 2048;4096 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus + # healthcheck: + # test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"] + # interval: 30s + # start_period: 90s + # timeout: 20s + # retries: 3 + ports: + - "19530:19530" + - "9091:9091" + depends_on: + - "etcd" + - "minio" + # Comment out this section if CPU based image is used and set below env variables to False + # export APP_VECTORSTORE_ENABLEGPUSEARCH=False + # export APP_VECTORSTORE_ENABLEGPUINDEX=False + deploy: + resources: + reservations: + devices: + - driver: nvidia + capabilities: ["gpu"] + # count: ${INFERENCE_GPU_COUNT:-all} + device_ids: ['${VECTORSTORE_GPU_DEVICE_ID:-0}'] + profiles: ["", "milvus"] + + etcd: + container_name: milvus-etcd + image: quay.io/coreos/etcd:v3.6.4 + environment: + - ETCD_AUTO_COMPACTION_MODE=revision + - ETCD_AUTO_COMPACTION_RETENTION=1000 + - ETCD_QUOTA_BACKEND_BYTES=4294967296 + - ETCD_SNAPSHOT_COUNT=50000 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd + command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd + healthcheck: + test: ["CMD", "etcdctl", "endpoint", "health"] + interval: 30s + timeout: 20s + retries: 3 + profiles: ["", "milvus"] + + minio: + container_name: milvus-minio + image: minio/minio:RELEASE.2025-07-23T15-54-02Z + environment: + MINIO_ACCESS_KEY: minioadmin + MINIO_SECRET_KEY: minioadmin + ports: + - "9011:9011" + - "9010:9010" + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data + command: minio server /minio_data 
--console-address ":9011" --address ":9010" + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9010/minio/health/live"] + interval: 30s + timeout: 20s + retries: 3 + profiles: ["", "milvus", "elasticsearch", "minio"] + + elasticsearch: + container_name: elasticsearch + image: "docker.elastic.co/elasticsearch/elasticsearch:9.0.3" + ports: + - 9200:9200 + volumes: + # Run "sudo chown -R 1000:1000 deploy/compose/volumes/elasticsearch/" to fix permissions + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/elasticsearch:/usr/share/elasticsearch/data + restart: on-failure + environment: + - discovery.type=single-node + - "ES_JAVA_OPTS=-Xms1024m -Xmx1024m" + - xpack.security.enabled=false + - xpack.license.self_generated.type=basic + - network.host=0.0.0.0 + - cluster.routing.allocation.disk.threshold_enabled=false + hostname: elasticsearch + healthcheck: + test: ["CMD", "curl", "-s", "-f", "http://localhost:9200/_cat/health"] + interval: 10s + timeout: 1s + retries: 10 + profiles: ["elasticsearch"] + +networks: + default: + name: nvidia-rag \ No newline at end of file From b0f9197d76f3c0786d37320993f382aced9974a1 Mon Sep 17 00:00:00 2001 From: root Date: Wed, 3 Sep 2025 19:28:43 +0000 Subject: [PATCH 2/6] First draft of RAG library mode integration --- .gitignore | 1 + .../rag_library_mode/pyproject.toml | 28 +++ .../src/rag_library_mode/__init__.py | 0 .../src/rag_library_mode/configs/config.yml | 29 +++ .../docker-compose-ingestor-server.yaml | 211 ++++++++++++++++++ .../deploy/docker-compose-rag-server.yaml | 186 +++++++++++++++ .../src/rag_library_mode/deploy/vectordb.yaml | 102 +++++++++ .../rag_library_mode_function.py | 78 +++++++ .../src/rag_library_mode/register.py | 4 + .../rag_library_mode_function.py | 78 +++++++ .../src/rag_library_mode/register.py | 4 + 11 files changed, 721 insertions(+) create mode 100644 examples/rag_library_mode/rag_library_mode/pyproject.toml create mode 100644 examples/rag_library_mode/rag_library_mode/src/rag_library_mode/__init__.py create mode 100644 examples/rag_library_mode/rag_library_mode/src/rag_library_mode/configs/config.yml create mode 100644 examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/docker-compose-ingestor-server.yaml create mode 100644 examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/docker-compose-rag-server.yaml create mode 100644 examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/vectordb.yaml create mode 100644 examples/rag_library_mode/rag_library_mode/src/rag_library_mode/rag_library_mode_function.py create mode 100644 examples/rag_library_mode/rag_library_mode/src/rag_library_mode/register.py create mode 100644 examples/rag_library_mode/src/rag_library_mode/rag_library_mode_function.py create mode 100644 examples/rag_library_mode/src/rag_library_mode/register.py diff --git a/.gitignore b/.gitignore index 76edf0916..60f358611 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ ###### Place new entries directly below this line! 
###### +examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/volumes/* # auto-generated chainlit stuff chainlit.md diff --git a/examples/rag_library_mode/rag_library_mode/pyproject.toml b/examples/rag_library_mode/rag_library_mode/pyproject.toml new file mode 100644 index 000000000..cab960dea --- /dev/null +++ b/examples/rag_library_mode/rag_library_mode/pyproject.toml @@ -0,0 +1,28 @@ +[build-system] +build-backend = "setuptools.build_meta" +requires = ["setuptools >= 64", "setuptools-scm>=8"] + +[tool.setuptools_scm] +# NAT uses the --first-parent flag to avoid tags from previous releases which have been merged into the develop branch +# from causing an unexpected version change. This can be safely removed if developing outside of the NAT repository. +git_describe_command = "git describe --long --first-parent" +root = "../.." + +[project] +name = "rag_library_mode" +dynamic = ["version"] +dependencies = [ + "nvidia-nat[langchain]~=1.3", +] +requires-python = ">=3.11,<3.13" +description = "Custom NeMo Agent Toolkit Workflow" +classifiers = ["Programming Language :: Python"] + +[tool.uv.sources] +nvidia-nat = { path = "../..", editable = true } + +[project.entry-points.'nat.plugins'] +rag_library_mode = "rag_library_mode.register" + +[project.entry-points.'nat.components'] +rag_library_mode = "rag_library_mode.register" \ No newline at end of file diff --git a/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/__init__.py b/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/configs/config.yml b/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/configs/config.yml new file mode 100644 index 000000000..768c99859 --- /dev/null +++ b/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/configs/config.yml @@ -0,0 +1,29 @@ +general: + use_uvloop: true + + +functions: + rag_tool: + _type: rag_library_mode + base_url: "http://localhost:19530" + vdb_top_k: 10 + reranker_top_k: 100 + collection_names: ["cuda_docs"] + topic: Retrieve relevant documents from the database relevant to the query + + +llms: + nim_llm: + _type: nim + model_name: meta/llama-3.3-70b-instruct + temperature: 0 + max_tokens: 4096 + top_p: 1 + + +workflow: + _type: tool_calling_agent + tool_names: + - rag_tool + verbose: true + llm_name: nim_llm diff --git a/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/docker-compose-ingestor-server.yaml b/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/docker-compose-ingestor-server.yaml new file mode 100644 index 000000000..f13ab4a83 --- /dev/null +++ b/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/docker-compose-ingestor-server.yaml @@ -0,0 +1,211 @@ +services: + + # Main ingestor server which is responsible for ingestion + ingestor-server: + container_name: ingestor-server + image: nvcr.io/nvstaging/blueprint/ingestor-server:${TAG:-2.3.0.rc0} + build: + # Set context to repo's root directory + context: ../../ + dockerfile: ./src/nvidia_rag/ingestor_server/Dockerfile + # start the server on port 8082 with 4 workers for improved latency on concurrent requests. 
+ command: --port 8082 --host 0.0.0.0 --workers 1 + + volumes: + # Mount the prompt.yaml file to the container, path should be absolute + - ${PROMPT_CONFIG_FILE}:${PROMPT_CONFIG_FILE} + + # Common customizations to the pipeline can be controlled using env variables + environment: + # Path to example directory relative to root + EXAMPLE_PATH: 'src/nvidia_rag/ingestor_server' + + # Absolute path to custom prompt.yaml file + PROMPT_CONFIG_FILE: ${PROMPT_CONFIG_FILE:-/prompt.yaml} + + ##===Vector DB specific configurations=== + # URL on which vectorstore is hosted + # For custom operators, point to your service (e.g., http://your-custom-vdb:1234) + APP_VECTORSTORE_URL: ${APP_VECTORSTORE_URL:-http://milvus:19530} + # Type of vectordb used to store embedding. Supported built-ins: "milvus", "elasticsearch". + # You can also provide your custom value (e.g., "your_custom_vdb") when you register it in `_get_vdb_op`. + APP_VECTORSTORE_NAME: ${APP_VECTORSTORE_NAME:-"milvus"} + # Type of vectordb search to be used + APP_VECTORSTORE_SEARCHTYPE: ${APP_VECTORSTORE_SEARCHTYPE:-"dense"} # Can be dense or hybrid + # Boolean to enable GPU index for milvus vectorstore specific to nvingest + APP_VECTORSTORE_ENABLEGPUINDEX: ${APP_VECTORSTORE_ENABLEGPUINDEX:-True} + # Boolean to control GPU search for milvus vectorstore specific to nvingest + APP_VECTORSTORE_ENABLEGPUSEARCH: ${APP_VECTORSTORE_ENABLEGPUSEARCH:-True} + # vectorstore collection name to store embeddings + COLLECTION_NAME: ${COLLECTION_NAME:-multimodal_data} + + ##===MINIO specific configurations=== + MINIO_ENDPOINT: "minio:9010" + MINIO_ACCESSKEY: "minioadmin" + MINIO_SECRETKEY: "minioadmin" + + NGC_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"} + NVIDIA_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"} + + ##===Embedding Model specific configurations=== + # url on which embedding model is hosted. 
If "", Nvidia hosted API is used + APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000"} + APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2} + # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1) + # APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-vlm-embedding-ms:8000"} + # APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1} + APP_EMBEDDINGS_DIMENSIONS: ${APP_EMBEDDINGS_DIMENSIONS:-2048} + + ##===NV-Ingest Connection Configurations======= + APP_NVINGEST_MESSAGECLIENTHOSTNAME: ${APP_NVINGEST_MESSAGECLIENTHOSTNAME:-"nv-ingest-ms-runtime"} + APP_NVINGEST_MESSAGECLIENTPORT: ${APP_NVINGEST_MESSAGECLIENTPORT:-7670} + + ##===NV-Ingest Extract Configurations========== + APP_NVINGEST_EXTRACTTEXT: ${APP_NVINGEST_EXTRACTTEXT:-True} + APP_NVINGEST_EXTRACTINFOGRAPHICS: ${APP_NVINGEST_EXTRACTINFOGRAPHICS:-False} + APP_NVINGEST_EXTRACTTABLES: ${APP_NVINGEST_EXTRACTTABLES:-True} + APP_NVINGEST_EXTRACTCHARTS: ${APP_NVINGEST_EXTRACTCHARTS:-True} + APP_NVINGEST_EXTRACTIMAGES: ${APP_NVINGEST_EXTRACTIMAGES:-False} + APP_NVINGEST_EXTRACTPAGEASIMAGE: ${APP_NVINGEST_EXTRACTPAGEASIMAGE:-False} + APP_NVINGEST_STRUCTURED_ELEMENTS_MODALITY: ${APP_NVINGEST_STRUCTURED_ELEMENTS_MODALITY:-""} # Select from "image", "text_image" + APP_NVINGEST_IMAGE_ELEMENTS_MODALITY: ${APP_NVINGEST_IMAGE_ELEMENTS_MODALITY:-""} # Select from "image" + APP_NVINGEST_PDFEXTRACTMETHOD: ${APP_NVINGEST_PDFEXTRACTMETHOD:-None} # Select from pdfium, nemoretriever_parse, None + # Extract text by "page" only recommended for documents with pages like .pdf, .docx, etc. + APP_NVINGEST_TEXTDEPTH: ${APP_NVINGEST_TEXTDEPTH:-page} # extract by "page" or "document" + + ##===NV-Ingest Splitting Configurations======== + APP_NVINGEST_CHUNKSIZE: ${APP_NVINGEST_CHUNKSIZE:-512} + APP_NVINGEST_CHUNKOVERLAP: ${APP_NVINGEST_CHUNKOVERLAP:-150} + APP_NVINGEST_ENABLEPDFSPLITTER: ${APP_NVINGEST_ENABLEPDFSPLITTER:-True} + APP_NVINGEST_SEGMENTAUDIO: ${APP_NVINGEST_SEGMENTAUDIO:-False} # Enable audio segmentation for NV Ingest + + ##===NV-Ingest Caption Model configurations==== + APP_NVINGEST_CAPTIONMODELNAME: ${APP_NVINGEST_CAPTIONMODELNAME:-"nvidia/llama-3.1-nemotron-nano-vl-8b-v1"} + # Incase of nvidia-hosted caption model, use the endpoint url as - https://integrate.api.nvidia.com/v1 + APP_NVINGEST_CAPTIONENDPOINTURL: ${APP_NVINGEST_CAPTIONENDPOINTURL:-"http://vlm-ms:8000/v1/chat/completions"} + + # Choose whether to store the extracted content in the vector store for citation support + ENABLE_CITATIONS: ${ENABLE_CITATIONS:-True} + + # Choose the summary model to use for document summary + SUMMARY_LLM: ${SUMMARY_LLM:-nvidia/llama-3_3-nemotron-super-49b-v1_5} + SUMMARY_LLM_SERVERURL: ${SUMMARY_LLM_SERVERURL-"nim-llm:8000"} + SUMMARY_LLM_MAX_CHUNK_LENGTH: ${SUMMARY_LLM_MAX_CHUNK_LENGTH:-50000} + SUMMARY_CHUNK_OVERLAP: ${SUMMARY_CHUNK_OVERLAP:-200} + # Log level for server, supported level NOTSET, DEBUG, INFO, WARN, ERROR, CRITICAL + LOGLEVEL: ${LOGLEVEL:-INFO} + + # [Optional] Redis configuration for task status and result storage + REDIS_HOST: ${REDIS_HOST:-redis} + REDIS_PORT: ${REDIS_PORT:-6379} + REDIS_DB: ${REDIS_DB:-0} + + # Bulk upload to MinIO + ENABLE_MINIO_BULK_UPLOAD: ${ENABLE_MINIO_BULK_UPLOAD:-True} + TEMP_DIR: ${TEMP_DIR:-/tmp-data} + + # NV-Ingest Batch Mode Configurations + NV_INGEST_FILES_PER_BATCH: ${NV_INGEST_FILES_PER_BATCH:-16} + NV_INGEST_CONCURRENT_BATCHES: ${NV_INGEST_CONCURRENT_BATCHES:-4} + + 
ports: + - "8082:8082" + expose: + - "8082" + shm_size: 5gb + + redis: + image: "redis/redis-stack:7.2.0-v18" + ports: + - "6379:6379" + + nv-ingest-ms-runtime: + image: nvcr.io/nvstaging/nim/nv-ingest:25.8.0-RC6 + cpuset: "0-15" + volumes: + - ${DATASET_ROOT:-./data}:/workspace/data + ports: + # HTTP API + - "7670:7670" + # Simple Broker + - "7671:7671" + cap_add: + - sys_nice + environment: + # Audio model not used in this RAG version + - AUDIO_GRPC_ENDPOINT=audio:50051 + - AUDIO_INFER_PROTOCOL=grpc + - CUDA_VISIBLE_DEVICES=0 + - MAX_INGEST_PROCESS_WORKERS=${MAX_INGEST_PROCESS_WORKERS:-16} + - EMBEDDING_NIM_MODEL_NAME=${EMBEDDING_NIM_MODEL_NAME:-${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2}} + # Incase of self-hosted embedding model, use the endpoint url as - https://integrate.api.nvidia.com/v1 + - EMBEDDING_NIM_ENDPOINT=${EMBEDDING_NIM_ENDPOINT:-${APP_EMBEDDINGS_SERVERURL-http://nemoretriever-embedding-ms:8000/v1}} + # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1) + # - EMBEDDING_NIM_MODEL_NAME=${EMBEDDING_NIM_MODEL_NAME:-${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1}} + # - EMBEDDING_NIM_ENDPOINT=${EMBEDDING_NIM_ENDPOINT:-${APP_EMBEDDINGS_SERVERURL-http://nemoretriever-vlm-embedding-ms:8000/v1}} + - INGEST_LOG_LEVEL=WARNING + - INGEST_RAY_LOG_LEVEL=PRODUCTION + - INGEST_EDGE_BUFFER_SIZE=64 + - INGEST_DYNAMIC_MEMORY_THRESHOLD=0.8 + - INGEST_DISABLE_DYNAMIC_SCALING=${INGEST_DISABLE_DYNAMIC_SCALING:-True} + - INSTALL_AUDIO_EXTRACTION_DEPS=true + # Message client for development + #- MESSAGE_CLIENT_HOST=0.0.0.0 + #- MESSAGE_CLIENT_PORT=7671 + #- MESSAGE_CLIENT_TYPE=simple # Configure the ingest service to use the simple broker + # Message client for production + - MESSAGE_CLIENT_HOST=redis + - MESSAGE_CLIENT_PORT=6379 + - MESSAGE_CLIENT_TYPE=redis + - MINIO_BUCKET=${MINIO_BUCKET:-nv-ingest} + - MRC_IGNORE_NUMA_CHECK=1 + - NEMORETRIEVER_PARSE_HTTP_ENDPOINT=${NEMORETRIEVER_PARSE_HTTP_ENDPOINT:-http://nemoretriever-parse:8000/v1/chat/completions} + - NEMORETRIEVER_PARSE_INFER_PROTOCOL=${NEMORETRIEVER_PARSE_INFER_PROTOCOL:-http} + - NEMORETRIEVER_PARSE_MODEL_NAME=${NEMORETRIEVER_PARSE_MODEL_NAME:-nvidia/nemoretriever-parse} + - NVIDIA_API_KEY=${NVIDIA_API_KEY:-nvidiaapikey} + - NGC_API_KEY=${NGC_API_KEY:-nvidiaapikey} + - NVIDIA_BUILD_API_KEY=${NGC_API_KEY:-nvidiaapikey} + - NV_INGEST_MAX_UTIL=${NV_INGEST_MAX_UTIL:-48} + - OTEL_EXPORTER_OTLP_ENDPOINT=otel-collector:4317 + # Self-hosted ocr endpoints. + - OCR_GRPC_ENDPOINT=${OCR_GRPC_ENDPOINT:-${PADDLE_GRPC_ENDPOINT:-paddle:8001}} + - OCR_HTTP_ENDPOINT=${OCR_HTTP_ENDPOINT:-${PADDLE_HTTP_ENDPOINT:-http://paddle:8000/v1/infer}} + - OCR_INFER_PROTOCOL=${OCR_INFER_PROTOCOL:-${PADDLE_INFER_PROTOCOL:-grpc}} + - OCR_MODEL_NAME=${OCR_MODEL_NAME:-paddle} + # build.nvidia.com hosted ocr endpoints. + #- OCR_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/baidu/paddleocr + #- OCR_INFER_PROTOCOL=http + - READY_CHECK_ALL_COMPONENTS=False + - REDIS_MORPHEUS_TASK_QUEUE=morpheus_task_queue + # Self-hosted redis endpoints. + # build.nvidia.com hosted yolox endpoints. + #- YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2 + #- YOLOX_INFER_PROTOCOL=http + - YOLOX_GRPC_ENDPOINT=${YOLOX_GRPC_ENDPOINT:-page-elements:8001} + - YOLOX_HTTP_ENDPOINT=${YOLOX_HTTP_ENDPOINT:-http://page-elements:8000/v1/infer} + - YOLOX_INFER_PROTOCOL=${YOLOX_INFER_PROTOCOL:-grpc} + # build.nvidia.com hosted yolox-graphics-elements endpoints. 
+ #- YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1 + #- YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=http + - YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT=${YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT:-graphic-elements:8001} + - YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=${YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT:-http://graphic-elements:8000/v1/infer} + - YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=${YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL:-grpc} + # build.nvidia.com hosted yolox-table-elements endpoints. + #- YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1 + #- YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=http + - YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT=${YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT:-table-structure:8001} + - YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=${YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT:-http://table-structure:8000/v1/infer} + - YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=${YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL:-grpc} + # Incase of nvidia-hosted caption model, use the endpoint url as - https://integrate.api.nvidia.com/v1/chat/completions + - VLM_CAPTION_ENDPOINT=${VLM_CAPTION_ENDPOINT:-http://vlm-ms:8000/v1/chat/completions} + - VLM_CAPTION_MODEL_NAME=${VLM_CAPTION_MODEL_NAME:-nvidia/llama-3.1-nemotron-nano-vl-8b-v1} + - MODEL_PREDOWNLOAD_PATH=${MODEL_PREDOWNLOAD_PATH:-/workspace/models/} + healthcheck: + test: curl --fail http://nv-ingest-ms-runtime:7670/v1/health/ready || exit 1 + interval: 10s + timeout: 5s + retries: 20 + +networks: + default: + name: nvidia-rag diff --git a/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/docker-compose-rag-server.yaml b/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/docker-compose-rag-server.yaml new file mode 100644 index 000000000..334623c04 --- /dev/null +++ b/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/docker-compose-rag-server.yaml @@ -0,0 +1,186 @@ +services: + + # Main orchestrator server which stiches together all calls to different services to fulfill the user request + rag-server: + container_name: rag-server + image: nvcr.io/nvstaging/blueprint/rag-server:${TAG:-2.3.0.rc0} + build: + # Set context to repo's root directory + context: ../../ + dockerfile: src/nvidia_rag/rag_server/Dockerfile + # start the server on port 8081 with 8 workers for improved latency on concurrent requests. + command: --port 8081 --host 0.0.0.0 --workers 8 + volumes: + # Mount the prompt.yaml file to the container, path should be absolute + - ${PROMPT_CONFIG_FILE}:${PROMPT_CONFIG_FILE} + # Common customizations to the pipeline can be controlled using env variables + environment: + # Path to example directory relative to root + EXAMPLE_PATH: './nvidia_rag/rag_server' + + # Absolute path to custom prompt.yaml file + PROMPT_CONFIG_FILE: ${PROMPT_CONFIG_FILE:-/prompt.yaml} + + ##===MINIO specific configurations which is used to store the multimodal base64 content=== + MINIO_ENDPOINT: "minio:9010" + MINIO_ACCESSKEY: "minioadmin" + MINIO_SECRETKEY: "minioadmin" + + ##===Vector DB specific configurations=== + # URL on which vectorstore is hosted + # For custom operators, point to your service (e.g., http://your-custom-vdb:1234) + APP_VECTORSTORE_URL: ${APP_VECTORSTORE_URL:-http://milvus:19530} + # Type of vectordb used to store embedding. Supported built-ins: "milvus", "elasticsearch". + # You can also provide your custom value (e.g., "your_custom_vdb") when you register it in `_get_vdb_op`. 
+ APP_VECTORSTORE_NAME: ${APP_VECTORSTORE_NAME:-"milvus"} + # Type of index to be used for vectorstore + APP_VECTORSTORE_INDEXTYPE: ${APP_VECTORSTORE_INDEXTYPE:-"GPU_CAGRA"} + # Type of vectordb search to be used + APP_VECTORSTORE_SEARCHTYPE: ${APP_VECTORSTORE_SEARCHTYPE:-"dense"} # Can be dense or hybrid + # vectorstore collection name to store embeddings + COLLECTION_NAME: ${COLLECTION_NAME:-multimodal_data} + APP_RETRIEVER_SCORETHRESHOLD: 0.25 + # Top K from vector DB, which goes as input to reranker model if enabled, else goes to LLM prompt + VECTOR_DB_TOPK: ${VECTOR_DB_TOPK:-100} + + ##===LLM Model specific configurations=== + APP_LLM_MODELNAME: ${APP_LLM_MODELNAME:-"nvidia/llama-3_3-nemotron-super-49b-v1_5"} + # url on which llm model is hosted. If "", Nvidia hosted API is used + APP_LLM_SERVERURL: ${APP_LLM_SERVERURL-"nim-llm:8000"} + + ##===Query Rewriter Model specific configurations=== + APP_QUERYREWRITER_MODELNAME: ${APP_QUERYREWRITER_MODELNAME:-"meta/llama-3.1-8b-instruct"} + # url on which query rewriter model is hosted. If "", Nvidia hosted API is used + APP_QUERYREWRITER_SERVERURL: ${APP_QUERYREWRITER_SERVERURL-"nim-llm-llama-8b:8000"} + + ##===Filter Expression Generator Model specific configurations=== + APP_FILTEREXPRESSIONGENERATOR_MODELNAME: ${APP_FILTEREXPRESSIONGENERATOR_MODELNAME:-"nvidia/llama-3_3-nemotron-super-49b-v1_5"} + # url on which filter expression generator model is hosted. If "", Nvidia hosted API is used + APP_FILTEREXPRESSIONGENERATOR_SERVERURL: ${APP_FILTEREXPRESSIONGENERATOR_SERVERURL-"nim-llm:8000"} + # enable filter expression generator for natural language to filter expression conversion + ENABLE_FILTER_GENERATOR: ${ENABLE_FILTER_GENERATOR:-False} + + ##===Embedding Model specific configurations=== + # url on which embedding model is hosted. If "", Nvidia hosted API is used + APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000"} + APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2} + # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1) + # APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-vlm-embedding-ms:8000"} + # APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1} + + ##===Reranking Model specific configurations=== + # url on which ranking model is hosted. 
If "", Nvidia hosted API is used + APP_RANKING_SERVERURL: ${APP_RANKING_SERVERURL-"nemoretriever-ranking-ms:8000"} + APP_RANKING_MODELNAME: ${APP_RANKING_MODELNAME:-"nvidia/llama-3.2-nv-rerankqa-1b-v2"} + ENABLE_RERANKER: ${ENABLE_RERANKER:-True} + # Default confidence threshold for filtering documents by reranker relevance scores (0.0 to 1.0) + RERANKER_CONFIDENCE_THRESHOLD: ${RERANKER_CONFIDENCE_THRESHOLD:-0.0} + + ##===VLM Model specific configurations=== + ENABLE_VLM_INFERENCE: ${ENABLE_VLM_INFERENCE:-false} + # Reasoning gate on VLM response: off by default; enable to mitigate incorrect VLM outputs + ENABLE_VLM_RESPONSE_REASONING: ${ENABLE_VLM_RESPONSE_REASONING:-false} + # Max images sent to VLM per request (query + context) + APP_VLM_MAX_TOTAL_IMAGES: ${APP_VLM_MAX_TOTAL_IMAGES:-4} + # Max number of query images to include in VLM input + APP_VLM_MAX_QUERY_IMAGES: ${APP_VLM_MAX_QUERY_IMAGES:-1} + # Max number of context images to include in VLM input + APP_VLM_MAX_CONTEXT_IMAGES: ${APP_VLM_MAX_CONTEXT_IMAGES:-1} + APP_VLM_SERVERURL: ${APP_VLM_SERVERURL-"http://vlm-ms:8000/v1"} + APP_VLM_MODELNAME: ${APP_VLM_MODELNAME:-"nvidia/llama-3.1-nemotron-nano-vl-8b-v1"} + + NVIDIA_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"} + + # Number of document chunks to insert in LLM prompt, used only when ENABLE_RERANKER is set to True + APP_RETRIEVER_TOPK: ${APP_RETRIEVER_TOPK:-10} + + # Log level for server, supported level NOTSET, DEBUG, INFO, WARN, ERROR, CRITICAL + LOGLEVEL: ${LOGLEVEL:-INFO} + + # enable multi-turn conversation in the rag chain - this controls conversation history usage + # while doing query rewriting and in LLM prompt + ENABLE_MULTITURN: ${ENABLE_MULTITURN:-True} + + # enable query rewriting for multiturn conversation in the rag chain. 
+ # This will improve accuracy of the retrieiver pipeline but increase latency due to an additional LLM call + ENABLE_QUERYREWRITER: ${ENABLE_QUERYREWRITER:-False} + + # Choose whether to enable citations in the response + ENABLE_CITATIONS: ${ENABLE_CITATIONS:-True} + + # Choose whether to enable/disable guardrails + ENABLE_GUARDRAILS: ${ENABLE_GUARDRAILS:-False} + + # NeMo Guardrails URL when ENABLE_GUARDRAILS is true + NEMO_GUARDRAILS_URL: ${NEMO_GUARDRAILS_URL:-nemo-guardrails-microservice:7331} + + # number of last n chat messages to consider from the provided conversation history + CONVERSATION_HISTORY: 5 + + # Tracing + APP_TRACING_ENABLED: "False" + # HTTP endpoint + APP_TRACING_OTLPHTTPENDPOINT: http://otel-collector:4318/v1/traces + # GRPC endpoint + APP_TRACING_OTLPGRPCENDPOINT: grpc://otel-collector:4317 + + # Choose whether to enable source metadata in document content during generation + ENABLE_SOURCE_METADATA: ${ENABLE_SOURCE_METADATA:-true} + + # Whether to filter content within tags in model responses + FILTER_THINK_TOKENS: ${FILTER_THINK_TOKENS:-true} + + # Whether to enable thinking in the rag chain for llama-3.3-nemotron-super-49b model + ENABLE_NEMOTRON_THINKING: ${ENABLE_NEMOTRON_THINKING:-false} + + # enable reflection (context relevance and response groundedness checking) in the rag chain + ENABLE_REFLECTION: ${ENABLE_REFLECTION:-false} + # Maximum number of context relevance loop iterations + MAX_REFLECTION_LOOP: ${MAX_REFLECTION_LOOP:-3} + # Minimum relevance score threshold (0-2) + CONTEXT_RELEVANCE_THRESHOLD: ${CONTEXT_RELEVANCE_THRESHOLD:-1} + # Minimum groundedness score threshold (0-2) + RESPONSE_GROUNDEDNESS_THRESHOLD: ${RESPONSE_GROUNDEDNESS_THRESHOLD:-1} + # reflection llm + REFLECTION_LLM: ${REFLECTION_LLM:-"nvidia/llama-3_3-nemotron-super-49b-v1_5"} + # reflection llm server url. 
If "", Nvidia hosted API is used + REFLECTION_LLM_SERVERURL: ${REFLECTION_LLM_SERVERURL-"nim-llm:8000"} + # enable iterative query decomposition + ENABLE_QUERY_DECOMPOSITION: ${ENABLE_QUERY_DECOMPOSITION:-false} + # maximum recursion depth for iterative query decomposition + MAX_RECURSION_DEPTH: ${MAX_RECURSION_DEPTH:-3} + + ports: + - "8081:8081" + expose: + - "8081" + shm_size: 5gb + + # Sample UI container which interacts with APIs exposed by rag-server container + rag-playground: + container_name: rag-playground + image: nvcr.io/nvstaging/blueprint/rag-playground:${TAG:-2.3.0.rc0} + build: + # Set context to repo's root directory + context: ../../frontend + dockerfile: ./Dockerfile + args: + # Environment variables for Vite build + VITE_API_CHAT_URL: ${VITE_API_CHAT_URL:-http://rag-server:8081/v1} + VITE_API_VDB_URL: ${VITE_API_VDB_URL:-http://ingestor-server:8082/v1} + VITE_MILVUS_URL: http://milvus:19530 + ports: + - "8090:3000" + expose: + - "3000" + environment: + # Runtime environment variables for Vite + VITE_API_CHAT_URL: ${VITE_API_CHAT_URL:-http://rag-server:8081/v1} + VITE_API_VDB_URL: ${VITE_API_VDB_URL:-http://ingestor-server:8082/v1} + VITE_MILVUS_URL: http://milvus:19530 + depends_on: + - rag-server + +networks: + default: + name: nvidia-rag diff --git a/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/vectordb.yaml b/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/vectordb.yaml new file mode 100644 index 000000000..ed9bf8403 --- /dev/null +++ b/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/vectordb.yaml @@ -0,0 +1,102 @@ +services: + + # Milvus can be made GPU accelerated by uncommenting the lines as specified below + milvus: + container_name: milvus-standalone + image: milvusdb/milvus:${MILVUS_VERSION:-v2.6.0-gpu} # milvusdb/milvus:v2.6.0 for CPU + command: ["milvus", "run", "standalone"] + environment: + ETCD_ENDPOINTS: etcd:2379 + MINIO_ADDRESS: minio:9010 + KNOWHERE_GPU_MEM_POOL_SIZE: 2048;4096 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus + # healthcheck: + # test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"] + # interval: 30s + # start_period: 90s + # timeout: 20s + # retries: 3 + ports: + - "19530:19530" + - "9091:9091" + depends_on: + - "etcd" + - "minio" + # Comment out this section if CPU based image is used and set below env variables to False + # export APP_VECTORSTORE_ENABLEGPUSEARCH=False + # export APP_VECTORSTORE_ENABLEGPUINDEX=False + deploy: + resources: + reservations: + devices: + - driver: nvidia + capabilities: ["gpu"] + # count: ${INFERENCE_GPU_COUNT:-all} + device_ids: ['${VECTORSTORE_GPU_DEVICE_ID:-0}'] + profiles: ["", "milvus"] + + etcd: + container_name: milvus-etcd + image: quay.io/coreos/etcd:v3.6.4 + environment: + - ETCD_AUTO_COMPACTION_MODE=revision + - ETCD_AUTO_COMPACTION_RETENTION=1000 + - ETCD_QUOTA_BACKEND_BYTES=4294967296 + - ETCD_SNAPSHOT_COUNT=50000 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd + command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd + healthcheck: + test: ["CMD", "etcdctl", "endpoint", "health"] + interval: 30s + timeout: 20s + retries: 3 + profiles: ["", "milvus"] + + minio: + container_name: milvus-minio + image: minio/minio:RELEASE.2025-07-23T15-54-02Z + environment: + MINIO_ACCESS_KEY: minioadmin + MINIO_SECRET_KEY: minioadmin + ports: + - "9011:9011" + - "9010:9010" + volumes: + - 
${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data + command: minio server /minio_data --console-address ":9011" --address ":9010" + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9010/minio/health/live"] + interval: 30s + timeout: 20s + retries: 3 + profiles: ["", "milvus", "elasticsearch", "minio"] + + elasticsearch: + container_name: elasticsearch + image: "docker.elastic.co/elasticsearch/elasticsearch:9.0.3" + ports: + - 9200:9200 + volumes: + # Run "sudo chown -R 1000:1000 deploy/compose/volumes/elasticsearch/" to fix permissions + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/elasticsearch:/usr/share/elasticsearch/data + restart: on-failure + environment: + - discovery.type=single-node + - "ES_JAVA_OPTS=-Xms1024m -Xmx1024m" + - xpack.security.enabled=false + - xpack.license.self_generated.type=basic + - network.host=0.0.0.0 + - cluster.routing.allocation.disk.threshold_enabled=false + hostname: elasticsearch + healthcheck: + test: ["CMD", "curl", "-s", "-f", "http://localhost:9200/_cat/health"] + interval: 10s + timeout: 1s + retries: 10 + profiles: ["elasticsearch"] + +networks: + default: + name: nvidia-rag \ No newline at end of file diff --git a/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/rag_library_mode_function.py b/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/rag_library_mode_function.py new file mode 100644 index 000000000..5f374f4ab --- /dev/null +++ b/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/rag_library_mode_function.py @@ -0,0 +1,78 @@ +import logging + +from pydantic import Field + +from nat.builder.builder import Builder +from nat.builder.function_info import FunctionInfo +from nat.cli.register_workflow import register_function +from nat.data_models.function import FunctionBaseConfig + +from nvidia_rag import NvidiaRAG, NvidiaRAGIngestor + +import json +import base64 +from IPython.display import display, Image, Markdown + + +logger = logging.getLogger(__name__) + + +class RagLibraryModeFunctionConfig(FunctionBaseConfig, name="rag_library_mode"): + """ + This tool retrieves relevant documents for a given user query. The input query is mapped to the most appropriate + Milvus collection database. This will return relevant documents from the selected collection. 
+ """ + base_url: str = Field(description="The base url used to connect to the milvus database.") + reranker_top_k: int = Field(default=100, description="The number of results to return from the milvus database.") + vdb_top_k: int = Field(default=10, description="The number of results to return from the milvus database.") + collection_names: list = Field(default=["cuda_docs"], + description="The list of available collection names.") + + + +@register_function(config_type=RagLibraryModeFunctionConfig) +async def rag_library_mode_function( + config: RagLibraryModeFunctionConfig, builder: Builder +): + + def parse_search_citations(citations): + + parsed_docs = [] + + for idx, citation in enumerate(citations.results): + # If using pydantic models, citation fields may be attributes, not dict keys + content = getattr(citation, 'content', '') + doc_name = getattr(citation, 'document_name', f'Citation {idx+1}') + parsed_document = f'\n{content}\n' + parsed_docs.append(parsed_document) + + # combine parsed documents into a single string + internal_search_docs = "\n\n---\n\n".join(parsed_docs) + return internal_search_docs + + async def _response_fn(query: str) -> str: + # Process the input_message and generate output + + rag = NvidiaRAG() + ingestor = NvidiaRAGIngestor() + + # Just to debug + response = ingestor.get_documents( + collection_name=config.collection_names, + vdb_endpoint=config.base_url, + ) + logger.info(f"***** {response}") + + return parse_search_citations(rag.search( + query=f"{query}", + collection_names=config.collection_names, + reranker_top_k=config.reranker_top_k, + vdb_top_k=config.vdb_top_k, + )) + + try: + yield FunctionInfo.create(single_fn=_response_fn) + except GeneratorExit: + logger.warning("Function exited early!") + finally: + logger.info("Cleaning up rag_library_mode workflow.") \ No newline at end of file diff --git a/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/register.py b/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/register.py new file mode 100644 index 000000000..cbdb8f3aa --- /dev/null +++ b/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/register.py @@ -0,0 +1,4 @@ +# flake8: noqa + +# Import any tools which need to be automatically registered here +from rag_library_mode import rag_library_mode_function \ No newline at end of file diff --git a/examples/rag_library_mode/src/rag_library_mode/rag_library_mode_function.py b/examples/rag_library_mode/src/rag_library_mode/rag_library_mode_function.py new file mode 100644 index 000000000..5f374f4ab --- /dev/null +++ b/examples/rag_library_mode/src/rag_library_mode/rag_library_mode_function.py @@ -0,0 +1,78 @@ +import logging + +from pydantic import Field + +from nat.builder.builder import Builder +from nat.builder.function_info import FunctionInfo +from nat.cli.register_workflow import register_function +from nat.data_models.function import FunctionBaseConfig + +from nvidia_rag import NvidiaRAG, NvidiaRAGIngestor + +import json +import base64 +from IPython.display import display, Image, Markdown + + +logger = logging.getLogger(__name__) + + +class RagLibraryModeFunctionConfig(FunctionBaseConfig, name="rag_library_mode"): + """ + This tool retrieves relevant documents for a given user query. The input query is mapped to the most appropriate + Milvus collection database. This will return relevant documents from the selected collection. 
+ """ + base_url: str = Field(description="The base url used to connect to the milvus database.") + reranker_top_k: int = Field(default=100, description="The number of results to return from the milvus database.") + vdb_top_k: int = Field(default=10, description="The number of results to return from the milvus database.") + collection_names: list = Field(default=["cuda_docs"], + description="The list of available collection names.") + + + +@register_function(config_type=RagLibraryModeFunctionConfig) +async def rag_library_mode_function( + config: RagLibraryModeFunctionConfig, builder: Builder +): + + def parse_search_citations(citations): + + parsed_docs = [] + + for idx, citation in enumerate(citations.results): + # If using pydantic models, citation fields may be attributes, not dict keys + content = getattr(citation, 'content', '') + doc_name = getattr(citation, 'document_name', f'Citation {idx+1}') + parsed_document = f'\n{content}\n' + parsed_docs.append(parsed_document) + + # combine parsed documents into a single string + internal_search_docs = "\n\n---\n\n".join(parsed_docs) + return internal_search_docs + + async def _response_fn(query: str) -> str: + # Process the input_message and generate output + + rag = NvidiaRAG() + ingestor = NvidiaRAGIngestor() + + # Just to debug + response = ingestor.get_documents( + collection_name=config.collection_names, + vdb_endpoint=config.base_url, + ) + logger.info(f"***** {response}") + + return parse_search_citations(rag.search( + query=f"{query}", + collection_names=config.collection_names, + reranker_top_k=config.reranker_top_k, + vdb_top_k=config.vdb_top_k, + )) + + try: + yield FunctionInfo.create(single_fn=_response_fn) + except GeneratorExit: + logger.warning("Function exited early!") + finally: + logger.info("Cleaning up rag_library_mode workflow.") \ No newline at end of file diff --git a/examples/rag_library_mode/src/rag_library_mode/register.py b/examples/rag_library_mode/src/rag_library_mode/register.py new file mode 100644 index 000000000..cbdb8f3aa --- /dev/null +++ b/examples/rag_library_mode/src/rag_library_mode/register.py @@ -0,0 +1,4 @@ +# flake8: noqa + +# Import any tools which need to be automatically registered here +from rag_library_mode import rag_library_mode_function \ No newline at end of file From 3375c41236b2b8b60c7f5ad7ca3397b5e692e357 Mon Sep 17 00:00:00 2001 From: Narimane Hennouni Date: Wed, 24 Sep 2025 14:40:36 -0700 Subject: [PATCH 3/6] rag library mode with unit tests and readme --- examples/RAG/library_rag/README.md | 372 ++++++++++++++++++ examples/RAG/library_rag/configs/config.yml | 31 ++ examples/RAG/library_rag/data/cuda.txt | 3 + .../docker-compose-ingestor-server.yaml | 211 ++++++++++ .../deploy/docker-compose-rag-server.yaml | 186 +++++++++ examples/RAG/library_rag/deploy/nims.yaml | 348 ++++++++++++++++ examples/RAG/library_rag/deploy/vectordb.yaml | 102 +++++ examples/RAG/library_rag/pyproject.toml | 25 ++ .../library_rag/src/library_rag/__init__.py | 0 .../src/library_rag/configs/config.yml | 31 ++ .../src/library_rag/library_rag_function.py | 82 ++++ .../library_rag/src/library_rag/register.py | 4 + examples/RAG/library_rag/tests/__init__.py | 0 examples/RAG/library_rag/tests/conftest.py | 78 ++++ .../tests/test_configs/test_config.yml | 28 ++ .../RAG/library_rag/tests/test_integration.py | 112 ++++++ examples/local_rag/configs | 1 + examples/local_rag/data | 1 + examples/local_rag/pyproject.toml | 25 ++ examples/local_rag/src/local_rag/__init__.py | 0 .../src/local_rag/configs/config.yml | 25 ++ 
.../src/local_rag/local_rag_function.py | 87 ++++ examples/local_rag/src/local_rag/register.py | 4 + .../local_rag/src/local_rag/response.json | 55 +++ 24 files changed, 1811 insertions(+) create mode 100644 examples/RAG/library_rag/README.md create mode 100644 examples/RAG/library_rag/configs/config.yml create mode 100644 examples/RAG/library_rag/data/cuda.txt create mode 100644 examples/RAG/library_rag/deploy/docker-compose-ingestor-server.yaml create mode 100644 examples/RAG/library_rag/deploy/docker-compose-rag-server.yaml create mode 100644 examples/RAG/library_rag/deploy/nims.yaml create mode 100644 examples/RAG/library_rag/deploy/vectordb.yaml create mode 100644 examples/RAG/library_rag/pyproject.toml create mode 100644 examples/RAG/library_rag/src/library_rag/__init__.py create mode 100644 examples/RAG/library_rag/src/library_rag/configs/config.yml create mode 100644 examples/RAG/library_rag/src/library_rag/library_rag_function.py create mode 100644 examples/RAG/library_rag/src/library_rag/register.py create mode 100644 examples/RAG/library_rag/tests/__init__.py create mode 100644 examples/RAG/library_rag/tests/conftest.py create mode 100644 examples/RAG/library_rag/tests/test_configs/test_config.yml create mode 100644 examples/RAG/library_rag/tests/test_integration.py create mode 120000 examples/local_rag/configs create mode 120000 examples/local_rag/data create mode 100644 examples/local_rag/pyproject.toml create mode 100644 examples/local_rag/src/local_rag/__init__.py create mode 100644 examples/local_rag/src/local_rag/configs/config.yml create mode 100644 examples/local_rag/src/local_rag/local_rag_function.py create mode 100644 examples/local_rag/src/local_rag/register.py create mode 100644 examples/local_rag/src/local_rag/response.json diff --git a/examples/RAG/library_rag/README.md b/examples/RAG/library_rag/README.md new file mode 100644 index 000000000..c961f9652 --- /dev/null +++ b/examples/RAG/library_rag/README.md @@ -0,0 +1,372 @@ +# NVIDIA RAG Python Package Usage Guide + +This guide demonstrates how to use a NAT agent with the NVIDIA RAG Python client as a tool. +## Table of Contents + +- [Installation](#installation) +- [Setup Dependencies](#setup-dependencies) +- [API Usage Examples](#api-usage-examples) +- [Collection Management](#collection-management) +- [Document Operations](#document-operations) +- [RAG Queries](#rag-queries) +- [Search Operations](#search-operations) +- [Advanced Features](#advanced-features) +- [Cleanup Operations](#cleanup-operations) + +## Installation + +> **Note**: Python version **3.12 or higher** is supported. + +### Prerequisites + +1. **Install Python >= 3.12 and development headers:** + ```bash + sudo add-apt-repository ppa:deadsnakes/ppa + sudo apt update + sudo apt install python3.12 + sudo apt-get install python3.12-dev + ``` + +2. **Install uv:** + Follow instructions from [https://docs.astral.sh/uv/getting-started/installation/](https://docs.astral.sh/uv/getting-started/installation/) + +3. 
**Create and activate virtual environment:** + ```bash + # Create virtual environment + uv venv --python=python3.12 + + # Activate virtual environment + source .venv/bin/activate + ``` + +### Installation + +```bash +uv pip install nvidia-rag[all] +``` + +### Verify Installation + +Check that the package is installed in your virtual environment: + +```bash +uv pip show nvidia_rag | grep Location +``` + +The location should be inside your virtual environment at: `/rag/.venv/lib/python3.12/site-packages` + + +## Setup Dependencies + +### Prerequisites + +Fulfill the [prerequisites](../docs/quickstart.md#prerequisites) to setup Docker on your system. + +### 1. Configure API Key + +First, obtain an NGC API key by following the steps [here](../docs/quickstart.md#obtain-an-api-key). + +```python +import os +from getpass import getpass +from dotenv import load_dotenv + +# Set your NGC API key +if not os.environ.get("NGC_API_KEY", "").startswith("nvapi-"): + candidate_api_key = getpass("NVAPI Key (starts with nvapi-): ") + assert candidate_api_key.startswith("nvapi-"), f"{candidate_api_key[:5]}... is not a valid key" + os.environ["NGC_API_KEY"] = candidate_api_key +``` + +### 2. Docker Login + +```bash +echo "${NGC_API_KEY}" | docker login nvcr.io -u '$oauthtoken' --password-stdin +``` + +### 3. Load Default Configuration + +```python +load_dotenv(dotenv_path=".env_library", override=True) +``` + +> **💡 Tip:** Override default configurations using `os.environ` in your code. Reimport the `nvidia_rag` package and restart the Nvidia Ingest runtime for changes to take effect. + +### 4. Setup Milvus Vector Database + +Configure GPU device (default uses GPU indexing): + +```python +os.environ["VECTORSTORE_GPU_DEVICE_ID"] = "0" +``` + +> **Note:** For CPU-only Milvus, follow instructions in [milvus-configuration.md](../docs/milvus-configuration.md). + +Start Milvus: +```bash +docker compose -f ../deploy/compose/vectordb.yaml up -d +``` + +### 5. Setup NIMs (Neural Inference Microservices) + +Choose either on-premises or cloud-hosted models: + +#### Option 1: On-Premises Models + +Ensure you meet the [hardware requirements](../README.md#hardware-requirements). Default configuration requires 2xH100. 
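+
+Optionally, you can check which GPUs (and how much memory) are visible on the host before deploying, for example with `nvidia-smi`:
+
+```bash
+# List the GPUs and their total memory to confirm the hardware requirements are met
+nvidia-smi --query-gpu=index,name,memory.total --format=csv
+```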
+ +```bash +# Create model cache directory +mkdir -p ~/.cache/model-cache +``` + +```python +# Configure model directory +os.environ["MODEL_DIRECTORY"] = os.path.expanduser("~/.cache/model-cache") + +# Configure GPU IDs for microservices +os.environ["EMBEDDING_MS_GPU_ID"] = "0" +os.environ["RANKING_MS_GPU_ID"] = "0" +os.environ["YOLOX_MS_GPU_ID"] = "0" +os.environ["YOLOX_GRAPHICS_MS_GPU_ID"] = "0" +os.environ["YOLOX_TABLE_MS_GPU_ID"] = "0" +os.environ["OCR_MS_GPU_ID"] = "0" +os.environ["LLM_MS_GPU_ID"] = "1" +``` + +Deploy NIMs (may take time for model downloads): +```bash +USERID=$(id -u) docker compose -f ../deploy/compose/nims.yaml up -d +``` + +Monitor container status: +```bash +docker ps +``` + +Ensure all containers are running and healthy: +- nemoretriever-ranking-ms (healthy) +- compose-page-elements-1 +- compose-paddle-1 +- compose-graphic-elements-1 +- compose-table-structure-1 +- nemoretriever-embedding-ms (healthy) +- nim-llm-ms (healthy) + +#### Option 2: NVIDIA Cloud Models + +```python +os.environ["APP_LLM_MODELNAME"] = "nvidia/llama-3_3-nemotron-super-49b-v1_5" +os.environ["APP_EMBEDDINGS_MODELNAME"] = "nvidia/llama-3.2-nv-embedqa-1b-v2" +os.environ["APP_RANKING_MODELNAME"] = "nvidia/llama-3.2-nv-rerankqa-1b-v2" +os.environ["APP_EMBEDDINGS_SERVERURL"] = "" +os.environ["APP_LLM_SERVERURL"] = "" +os.environ["APP_RANKING_SERVERURL"] = "https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking/v1" +os.environ["EMBEDDING_NIM_ENDPOINT"] = "https://integrate.api.nvidia.com/v1" +os.environ["OCR_HTTP_ENDPOINT"] = "https://ai.api.nvidia.com/v1/cv/baidu/paddleocr" +os.environ["OCR_INFER_PROTOCOL"] = "http" +os.environ["YOLOX_HTTP_ENDPOINT"] = "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2" +os.environ["YOLOX_INFER_PROTOCOL"] = "http" +os.environ["YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT"] = "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1" +os.environ["YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL"] = "http" +os.environ["YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT"] = "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1" +os.environ["YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL"] = "http" +``` + +### 6. Setup NVIDIA Ingest Runtime + +```bash +docker compose -f ../deploy/compose/docker-compose-ingestor-server.yaml up nv-ingest-ms-runtime redis -d +``` + +Open the RAG Playground at localhost:3080, create a new collection and save it. 
or you can use the API for that, see API Usage examples below: + +## API Usage Examples + +### Setup Logging + +```python +import logging + +LOGLEVEL = logging.WARNING # Set to INFO, DEBUG, WARNING or ERROR +logging.basicConfig(level=LOGLEVEL) + +for name in logging.root.manager.loggerDict: + if name == "nvidia_rag" or name.startswith("nvidia_rag."): + logging.getLogger(name).setLevel(LOGLEVEL) + if name == "nv_ingest_client" or name.startswith("nv_ingest_client."): + logging.getLogger(name).setLevel(LOGLEVEL) +``` + +### Import Packages + +```python +from nvidia_rag import NvidiaRAG, NvidiaRAGIngestor + +rag = NvidiaRAG() +ingestor = NvidiaRAGIngestor() +``` + +## Collection Management + +### Create a New Collection + +```python +response = ingestor.create_collection( + collection_name="test_library", + vdb_endpoint="http://localhost:19530", + # Optional: Create collection with metadata schema + # metadata_schema = [ + # { + # "name": "meta_field_1", + # "type": "string", + # "description": "Description field for the document" + # } + # ] +) +print(response) +``` + +### List All Collections + +```python +response = ingestor.get_collections(vdb_endpoint="http://localhost:19530") +print(response) +``` + +### Delete Collections + +```python +response = ingestor.delete_collections( + vdb_endpoint="http://localhost:19530", + collection_names=["test_library"] +) +print(response) +``` + +## Document Operations + +### Upload Documents + +```python +response = await ingestor.upload_documents( + collection_name="test_library", + vdb_endpoint="http://localhost:19530", + blocking=False, + split_options={"chunk_size": 512, "chunk_overlap": 150}, + filepaths=[ + "../data/multimodal/woods_frost.docx", + "../data/multimodal/multimodal_test.pdf", + ], + generate_summary=False, + # Optional: Add custom metadata + # custom_metadata=[ + # { + # "filename": "multimodal_test.pdf", + # "metadata": {"meta_field_1": "multimodal document 1"} + # }, + # { + # "filename": "woods_frost.docx", + # "metadata": {"meta_field_1": "multimodal document 2"} + # } + # ] +) +print(response) +``` + +### Check Upload Status + +```python +response = await ingestor.status(task_id="YOUR_TASK_ID_HERE") +print(response) +``` + +### Update Documents + +```python +response = await ingestor.update_documents( + collection_name="test_library", + vdb_endpoint="http://localhost:19530", + blocking=False, + filepaths=["../data/multimodal/woods_frost.docx"], + generate_summary=False, +) +print(response) +``` + +### List Documents in Collection + +```python +response = ingestor.get_documents( + collection_name="test_library", + vdb_endpoint="http://localhost:19530", +) +print(response) +``` + +### Delete Documents + +```python +response = ingestor.delete_documents( + collection_name="test_library", + document_names=["../data/multimodal/multimodal_test.pdf"], + vdb_endpoint="http://localhost:19530", +) +print(response) +``` + + +#### Configure Your Agent + +Configure your Agent to use the Milvus collections for RAG. We have pre-configured a configuration file for you in `examples/RAG/simple_rag/configs/milvus_rag_config.yml`. You can modify this file to point to your Milvus instance and collections or add tools to your agent. The agent, by default, is a `tool_calling` agent that can be used to interact with the retriever component. The configuration file is shown below. 
You can also modify your agent to use another of the NeMo Agent toolkit's pre-built agent implementations, such as the `react_agent`.
+
+```yaml
+general:
+  use_uvloop: true
+
+
+functions:
+  library_rag_tool:
+    _type: library_rag
+    base_url: "http://localhost:8081"
+    reranker_top_k: 2
+    vdb_top_k: 10
+    vdb_endpoint: "http://milvus:19530"
+    collection_names: ["cuda"]
+    enable_query_rewriting: True
+    enable_reranker: True
+
+    #description: Retrieve documents given the input query
+
+llms:
+  nim_llm:
+    _type: nim
+    model_name: meta/llama-3.3-70b-instruct
+    temperature: 0
+    max_tokens: 4096
+    top_p: 1
+
+workflow:
+  _type: tool_calling_agent
+  tool_names:
+    - library_rag_tool
+  llm_name: nim_llm
+  verbose: true
+```
+
+If you have a different Milvus instance or collection names, modify `vdb_endpoint` and `collection_names` in the config file to point to your instance and collections.
+You can also adjust the retrieval parameters, such as `vdb_top_k` and `reranker_top_k`.
+You can also add additional functions as tools for your agent in the `functions` section.
+
+#### Run the Workflow
+
+```bash
+nat run --config_file examples/RAG/library_rag/configs/config.yml --input "How do I install CUDA"
+```
+
+The expected workflow result of running the above command is:
+```console
+['To install CUDA, you typically need to: \n1. Verify you have a CUDA-capable GPU and a supported version of your operating system.\n2. Download the NVIDIA CUDA Toolkit from the official NVIDIA website.\n3. Choose an installation method, such as a local repository installation or a network repository installation, depending on your system.\n4. Follow the specific instructions for your operating system, which may include installing local repository packages, enabling network repositories, or running installer scripts.\n5. Reboot your system and perform post-installation actions, such as setting up your environment and verifying the installation by running sample projects.
\n\nPlease refer to the official NVIDIA CUDA documentation for detailed instructions tailored to your specific operating system and distribution.'] + + + diff --git a/examples/RAG/library_rag/configs/config.yml b/examples/RAG/library_rag/configs/config.yml new file mode 100644 index 000000000..d4bd4351b --- /dev/null +++ b/examples/RAG/library_rag/configs/config.yml @@ -0,0 +1,31 @@ +general: + use_uvloop: true + + +functions: + library_rag_tool: + _type: library_rag + base_url: "http://localhost:8081" + reranker_top_k: 2 + vdb_top_k: 10 + vdb_endpoint: "http://milvus:19530" + collection_names: ["cuda"] + enable_query_rewriting: True + enable_reranker: True + + #description: Retrieve documents given the input query + +llms: + nim_llm: + _type: nim + model_name: meta/llama-3.3-70b-instruct + temperature: 0 + max_tokens: 4096 + top_p: 1 + +workflow: + _type: tool_calling_agent + tool_names: + - library_rag_tool + llm_name: nim_llm + verbose: true \ No newline at end of file diff --git a/examples/RAG/library_rag/data/cuda.txt b/examples/RAG/library_rag/data/cuda.txt new file mode 100644 index 000000000..e2b285f57 --- /dev/null +++ b/examples/RAG/library_rag/data/cuda.txt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c75e2fd240e1419b246d89b2fa4e586273c42e96f1a42550a8b06a0abd79024 +size 1653 diff --git a/examples/RAG/library_rag/deploy/docker-compose-ingestor-server.yaml b/examples/RAG/library_rag/deploy/docker-compose-ingestor-server.yaml new file mode 100644 index 000000000..f13ab4a83 --- /dev/null +++ b/examples/RAG/library_rag/deploy/docker-compose-ingestor-server.yaml @@ -0,0 +1,211 @@ +services: + + # Main ingestor server which is responsible for ingestion + ingestor-server: + container_name: ingestor-server + image: nvcr.io/nvstaging/blueprint/ingestor-server:${TAG:-2.3.0.rc0} + build: + # Set context to repo's root directory + context: ../../ + dockerfile: ./src/nvidia_rag/ingestor_server/Dockerfile + # start the server on port 8082 with 4 workers for improved latency on concurrent requests. + command: --port 8082 --host 0.0.0.0 --workers 1 + + volumes: + # Mount the prompt.yaml file to the container, path should be absolute + - ${PROMPT_CONFIG_FILE}:${PROMPT_CONFIG_FILE} + + # Common customizations to the pipeline can be controlled using env variables + environment: + # Path to example directory relative to root + EXAMPLE_PATH: 'src/nvidia_rag/ingestor_server' + + # Absolute path to custom prompt.yaml file + PROMPT_CONFIG_FILE: ${PROMPT_CONFIG_FILE:-/prompt.yaml} + + ##===Vector DB specific configurations=== + # URL on which vectorstore is hosted + # For custom operators, point to your service (e.g., http://your-custom-vdb:1234) + APP_VECTORSTORE_URL: ${APP_VECTORSTORE_URL:-http://milvus:19530} + # Type of vectordb used to store embedding. Supported built-ins: "milvus", "elasticsearch". + # You can also provide your custom value (e.g., "your_custom_vdb") when you register it in `_get_vdb_op`. 
+ APP_VECTORSTORE_NAME: ${APP_VECTORSTORE_NAME:-"milvus"} + # Type of vectordb search to be used + APP_VECTORSTORE_SEARCHTYPE: ${APP_VECTORSTORE_SEARCHTYPE:-"dense"} # Can be dense or hybrid + # Boolean to enable GPU index for milvus vectorstore specific to nvingest + APP_VECTORSTORE_ENABLEGPUINDEX: ${APP_VECTORSTORE_ENABLEGPUINDEX:-True} + # Boolean to control GPU search for milvus vectorstore specific to nvingest + APP_VECTORSTORE_ENABLEGPUSEARCH: ${APP_VECTORSTORE_ENABLEGPUSEARCH:-True} + # vectorstore collection name to store embeddings + COLLECTION_NAME: ${COLLECTION_NAME:-multimodal_data} + + ##===MINIO specific configurations=== + MINIO_ENDPOINT: "minio:9010" + MINIO_ACCESSKEY: "minioadmin" + MINIO_SECRETKEY: "minioadmin" + + NGC_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"} + NVIDIA_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"} + + ##===Embedding Model specific configurations=== + # url on which embedding model is hosted. If "", Nvidia hosted API is used + APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000"} + APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2} + # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1) + # APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-vlm-embedding-ms:8000"} + # APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1} + APP_EMBEDDINGS_DIMENSIONS: ${APP_EMBEDDINGS_DIMENSIONS:-2048} + + ##===NV-Ingest Connection Configurations======= + APP_NVINGEST_MESSAGECLIENTHOSTNAME: ${APP_NVINGEST_MESSAGECLIENTHOSTNAME:-"nv-ingest-ms-runtime"} + APP_NVINGEST_MESSAGECLIENTPORT: ${APP_NVINGEST_MESSAGECLIENTPORT:-7670} + + ##===NV-Ingest Extract Configurations========== + APP_NVINGEST_EXTRACTTEXT: ${APP_NVINGEST_EXTRACTTEXT:-True} + APP_NVINGEST_EXTRACTINFOGRAPHICS: ${APP_NVINGEST_EXTRACTINFOGRAPHICS:-False} + APP_NVINGEST_EXTRACTTABLES: ${APP_NVINGEST_EXTRACTTABLES:-True} + APP_NVINGEST_EXTRACTCHARTS: ${APP_NVINGEST_EXTRACTCHARTS:-True} + APP_NVINGEST_EXTRACTIMAGES: ${APP_NVINGEST_EXTRACTIMAGES:-False} + APP_NVINGEST_EXTRACTPAGEASIMAGE: ${APP_NVINGEST_EXTRACTPAGEASIMAGE:-False} + APP_NVINGEST_STRUCTURED_ELEMENTS_MODALITY: ${APP_NVINGEST_STRUCTURED_ELEMENTS_MODALITY:-""} # Select from "image", "text_image" + APP_NVINGEST_IMAGE_ELEMENTS_MODALITY: ${APP_NVINGEST_IMAGE_ELEMENTS_MODALITY:-""} # Select from "image" + APP_NVINGEST_PDFEXTRACTMETHOD: ${APP_NVINGEST_PDFEXTRACTMETHOD:-None} # Select from pdfium, nemoretriever_parse, None + # Extract text by "page" only recommended for documents with pages like .pdf, .docx, etc. 
+ APP_NVINGEST_TEXTDEPTH: ${APP_NVINGEST_TEXTDEPTH:-page} # extract by "page" or "document" + + ##===NV-Ingest Splitting Configurations======== + APP_NVINGEST_CHUNKSIZE: ${APP_NVINGEST_CHUNKSIZE:-512} + APP_NVINGEST_CHUNKOVERLAP: ${APP_NVINGEST_CHUNKOVERLAP:-150} + APP_NVINGEST_ENABLEPDFSPLITTER: ${APP_NVINGEST_ENABLEPDFSPLITTER:-True} + APP_NVINGEST_SEGMENTAUDIO: ${APP_NVINGEST_SEGMENTAUDIO:-False} # Enable audio segmentation for NV Ingest + + ##===NV-Ingest Caption Model configurations==== + APP_NVINGEST_CAPTIONMODELNAME: ${APP_NVINGEST_CAPTIONMODELNAME:-"nvidia/llama-3.1-nemotron-nano-vl-8b-v1"} + # Incase of nvidia-hosted caption model, use the endpoint url as - https://integrate.api.nvidia.com/v1 + APP_NVINGEST_CAPTIONENDPOINTURL: ${APP_NVINGEST_CAPTIONENDPOINTURL:-"http://vlm-ms:8000/v1/chat/completions"} + + # Choose whether to store the extracted content in the vector store for citation support + ENABLE_CITATIONS: ${ENABLE_CITATIONS:-True} + + # Choose the summary model to use for document summary + SUMMARY_LLM: ${SUMMARY_LLM:-nvidia/llama-3_3-nemotron-super-49b-v1_5} + SUMMARY_LLM_SERVERURL: ${SUMMARY_LLM_SERVERURL-"nim-llm:8000"} + SUMMARY_LLM_MAX_CHUNK_LENGTH: ${SUMMARY_LLM_MAX_CHUNK_LENGTH:-50000} + SUMMARY_CHUNK_OVERLAP: ${SUMMARY_CHUNK_OVERLAP:-200} + # Log level for server, supported level NOTSET, DEBUG, INFO, WARN, ERROR, CRITICAL + LOGLEVEL: ${LOGLEVEL:-INFO} + + # [Optional] Redis configuration for task status and result storage + REDIS_HOST: ${REDIS_HOST:-redis} + REDIS_PORT: ${REDIS_PORT:-6379} + REDIS_DB: ${REDIS_DB:-0} + + # Bulk upload to MinIO + ENABLE_MINIO_BULK_UPLOAD: ${ENABLE_MINIO_BULK_UPLOAD:-True} + TEMP_DIR: ${TEMP_DIR:-/tmp-data} + + # NV-Ingest Batch Mode Configurations + NV_INGEST_FILES_PER_BATCH: ${NV_INGEST_FILES_PER_BATCH:-16} + NV_INGEST_CONCURRENT_BATCHES: ${NV_INGEST_CONCURRENT_BATCHES:-4} + + ports: + - "8082:8082" + expose: + - "8082" + shm_size: 5gb + + redis: + image: "redis/redis-stack:7.2.0-v18" + ports: + - "6379:6379" + + nv-ingest-ms-runtime: + image: nvcr.io/nvstaging/nim/nv-ingest:25.8.0-RC6 + cpuset: "0-15" + volumes: + - ${DATASET_ROOT:-./data}:/workspace/data + ports: + # HTTP API + - "7670:7670" + # Simple Broker + - "7671:7671" + cap_add: + - sys_nice + environment: + # Audio model not used in this RAG version + - AUDIO_GRPC_ENDPOINT=audio:50051 + - AUDIO_INFER_PROTOCOL=grpc + - CUDA_VISIBLE_DEVICES=0 + - MAX_INGEST_PROCESS_WORKERS=${MAX_INGEST_PROCESS_WORKERS:-16} + - EMBEDDING_NIM_MODEL_NAME=${EMBEDDING_NIM_MODEL_NAME:-${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2}} + # Incase of self-hosted embedding model, use the endpoint url as - https://integrate.api.nvidia.com/v1 + - EMBEDDING_NIM_ENDPOINT=${EMBEDDING_NIM_ENDPOINT:-${APP_EMBEDDINGS_SERVERURL-http://nemoretriever-embedding-ms:8000/v1}} + # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1) + # - EMBEDDING_NIM_MODEL_NAME=${EMBEDDING_NIM_MODEL_NAME:-${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1}} + # - EMBEDDING_NIM_ENDPOINT=${EMBEDDING_NIM_ENDPOINT:-${APP_EMBEDDINGS_SERVERURL-http://nemoretriever-vlm-embedding-ms:8000/v1}} + - INGEST_LOG_LEVEL=WARNING + - INGEST_RAY_LOG_LEVEL=PRODUCTION + - INGEST_EDGE_BUFFER_SIZE=64 + - INGEST_DYNAMIC_MEMORY_THRESHOLD=0.8 + - INGEST_DISABLE_DYNAMIC_SCALING=${INGEST_DISABLE_DYNAMIC_SCALING:-True} + - INSTALL_AUDIO_EXTRACTION_DEPS=true + # Message client for development + #- MESSAGE_CLIENT_HOST=0.0.0.0 + #- MESSAGE_CLIENT_PORT=7671 + #- MESSAGE_CLIENT_TYPE=simple # Configure the ingest service to 
use the simple broker + # Message client for production + - MESSAGE_CLIENT_HOST=redis + - MESSAGE_CLIENT_PORT=6379 + - MESSAGE_CLIENT_TYPE=redis + - MINIO_BUCKET=${MINIO_BUCKET:-nv-ingest} + - MRC_IGNORE_NUMA_CHECK=1 + - NEMORETRIEVER_PARSE_HTTP_ENDPOINT=${NEMORETRIEVER_PARSE_HTTP_ENDPOINT:-http://nemoretriever-parse:8000/v1/chat/completions} + - NEMORETRIEVER_PARSE_INFER_PROTOCOL=${NEMORETRIEVER_PARSE_INFER_PROTOCOL:-http} + - NEMORETRIEVER_PARSE_MODEL_NAME=${NEMORETRIEVER_PARSE_MODEL_NAME:-nvidia/nemoretriever-parse} + - NVIDIA_API_KEY=${NVIDIA_API_KEY:-nvidiaapikey} + - NGC_API_KEY=${NGC_API_KEY:-nvidiaapikey} + - NVIDIA_BUILD_API_KEY=${NGC_API_KEY:-nvidiaapikey} + - NV_INGEST_MAX_UTIL=${NV_INGEST_MAX_UTIL:-48} + - OTEL_EXPORTER_OTLP_ENDPOINT=otel-collector:4317 + # Self-hosted ocr endpoints. + - OCR_GRPC_ENDPOINT=${OCR_GRPC_ENDPOINT:-${PADDLE_GRPC_ENDPOINT:-paddle:8001}} + - OCR_HTTP_ENDPOINT=${OCR_HTTP_ENDPOINT:-${PADDLE_HTTP_ENDPOINT:-http://paddle:8000/v1/infer}} + - OCR_INFER_PROTOCOL=${OCR_INFER_PROTOCOL:-${PADDLE_INFER_PROTOCOL:-grpc}} + - OCR_MODEL_NAME=${OCR_MODEL_NAME:-paddle} + # build.nvidia.com hosted ocr endpoints. + #- OCR_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/baidu/paddleocr + #- OCR_INFER_PROTOCOL=http + - READY_CHECK_ALL_COMPONENTS=False + - REDIS_MORPHEUS_TASK_QUEUE=morpheus_task_queue + # Self-hosted redis endpoints. + # build.nvidia.com hosted yolox endpoints. + #- YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2 + #- YOLOX_INFER_PROTOCOL=http + - YOLOX_GRPC_ENDPOINT=${YOLOX_GRPC_ENDPOINT:-page-elements:8001} + - YOLOX_HTTP_ENDPOINT=${YOLOX_HTTP_ENDPOINT:-http://page-elements:8000/v1/infer} + - YOLOX_INFER_PROTOCOL=${YOLOX_INFER_PROTOCOL:-grpc} + # build.nvidia.com hosted yolox-graphics-elements endpoints. + #- YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1 + #- YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=http + - YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT=${YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT:-graphic-elements:8001} + - YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=${YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT:-http://graphic-elements:8000/v1/infer} + - YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=${YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL:-grpc} + # build.nvidia.com hosted yolox-table-elements endpoints. 
+ #- YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1 + #- YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=http + - YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT=${YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT:-table-structure:8001} + - YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=${YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT:-http://table-structure:8000/v1/infer} + - YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=${YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL:-grpc} + # Incase of nvidia-hosted caption model, use the endpoint url as - https://integrate.api.nvidia.com/v1/chat/completions + - VLM_CAPTION_ENDPOINT=${VLM_CAPTION_ENDPOINT:-http://vlm-ms:8000/v1/chat/completions} + - VLM_CAPTION_MODEL_NAME=${VLM_CAPTION_MODEL_NAME:-nvidia/llama-3.1-nemotron-nano-vl-8b-v1} + - MODEL_PREDOWNLOAD_PATH=${MODEL_PREDOWNLOAD_PATH:-/workspace/models/} + healthcheck: + test: curl --fail http://nv-ingest-ms-runtime:7670/v1/health/ready || exit 1 + interval: 10s + timeout: 5s + retries: 20 + +networks: + default: + name: nvidia-rag diff --git a/examples/RAG/library_rag/deploy/docker-compose-rag-server.yaml b/examples/RAG/library_rag/deploy/docker-compose-rag-server.yaml new file mode 100644 index 000000000..334623c04 --- /dev/null +++ b/examples/RAG/library_rag/deploy/docker-compose-rag-server.yaml @@ -0,0 +1,186 @@ +services: + + # Main orchestrator server which stiches together all calls to different services to fulfill the user request + rag-server: + container_name: rag-server + image: nvcr.io/nvstaging/blueprint/rag-server:${TAG:-2.3.0.rc0} + build: + # Set context to repo's root directory + context: ../../ + dockerfile: src/nvidia_rag/rag_server/Dockerfile + # start the server on port 8081 with 8 workers for improved latency on concurrent requests. + command: --port 8081 --host 0.0.0.0 --workers 8 + volumes: + # Mount the prompt.yaml file to the container, path should be absolute + - ${PROMPT_CONFIG_FILE}:${PROMPT_CONFIG_FILE} + # Common customizations to the pipeline can be controlled using env variables + environment: + # Path to example directory relative to root + EXAMPLE_PATH: './nvidia_rag/rag_server' + + # Absolute path to custom prompt.yaml file + PROMPT_CONFIG_FILE: ${PROMPT_CONFIG_FILE:-/prompt.yaml} + + ##===MINIO specific configurations which is used to store the multimodal base64 content=== + MINIO_ENDPOINT: "minio:9010" + MINIO_ACCESSKEY: "minioadmin" + MINIO_SECRETKEY: "minioadmin" + + ##===Vector DB specific configurations=== + # URL on which vectorstore is hosted + # For custom operators, point to your service (e.g., http://your-custom-vdb:1234) + APP_VECTORSTORE_URL: ${APP_VECTORSTORE_URL:-http://milvus:19530} + # Type of vectordb used to store embedding. Supported built-ins: "milvus", "elasticsearch". + # You can also provide your custom value (e.g., "your_custom_vdb") when you register it in `_get_vdb_op`. 
+ APP_VECTORSTORE_NAME: ${APP_VECTORSTORE_NAME:-"milvus"} + # Type of index to be used for vectorstore + APP_VECTORSTORE_INDEXTYPE: ${APP_VECTORSTORE_INDEXTYPE:-"GPU_CAGRA"} + # Type of vectordb search to be used + APP_VECTORSTORE_SEARCHTYPE: ${APP_VECTORSTORE_SEARCHTYPE:-"dense"} # Can be dense or hybrid + # vectorstore collection name to store embeddings + COLLECTION_NAME: ${COLLECTION_NAME:-multimodal_data} + APP_RETRIEVER_SCORETHRESHOLD: 0.25 + # Top K from vector DB, which goes as input to reranker model if enabled, else goes to LLM prompt + VECTOR_DB_TOPK: ${VECTOR_DB_TOPK:-100} + + ##===LLM Model specific configurations=== + APP_LLM_MODELNAME: ${APP_LLM_MODELNAME:-"nvidia/llama-3_3-nemotron-super-49b-v1_5"} + # url on which llm model is hosted. If "", Nvidia hosted API is used + APP_LLM_SERVERURL: ${APP_LLM_SERVERURL-"nim-llm:8000"} + + ##===Query Rewriter Model specific configurations=== + APP_QUERYREWRITER_MODELNAME: ${APP_QUERYREWRITER_MODELNAME:-"meta/llama-3.1-8b-instruct"} + # url on which query rewriter model is hosted. If "", Nvidia hosted API is used + APP_QUERYREWRITER_SERVERURL: ${APP_QUERYREWRITER_SERVERURL-"nim-llm-llama-8b:8000"} + + ##===Filter Expression Generator Model specific configurations=== + APP_FILTEREXPRESSIONGENERATOR_MODELNAME: ${APP_FILTEREXPRESSIONGENERATOR_MODELNAME:-"nvidia/llama-3_3-nemotron-super-49b-v1_5"} + # url on which filter expression generator model is hosted. If "", Nvidia hosted API is used + APP_FILTEREXPRESSIONGENERATOR_SERVERURL: ${APP_FILTEREXPRESSIONGENERATOR_SERVERURL-"nim-llm:8000"} + # enable filter expression generator for natural language to filter expression conversion + ENABLE_FILTER_GENERATOR: ${ENABLE_FILTER_GENERATOR:-False} + + ##===Embedding Model specific configurations=== + # url on which embedding model is hosted. If "", Nvidia hosted API is used + APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000"} + APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2} + # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1) + # APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-vlm-embedding-ms:8000"} + # APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1} + + ##===Reranking Model specific configurations=== + # url on which ranking model is hosted. 
If "", Nvidia hosted API is used + APP_RANKING_SERVERURL: ${APP_RANKING_SERVERURL-"nemoretriever-ranking-ms:8000"} + APP_RANKING_MODELNAME: ${APP_RANKING_MODELNAME:-"nvidia/llama-3.2-nv-rerankqa-1b-v2"} + ENABLE_RERANKER: ${ENABLE_RERANKER:-True} + # Default confidence threshold for filtering documents by reranker relevance scores (0.0 to 1.0) + RERANKER_CONFIDENCE_THRESHOLD: ${RERANKER_CONFIDENCE_THRESHOLD:-0.0} + + ##===VLM Model specific configurations=== + ENABLE_VLM_INFERENCE: ${ENABLE_VLM_INFERENCE:-false} + # Reasoning gate on VLM response: off by default; enable to mitigate incorrect VLM outputs + ENABLE_VLM_RESPONSE_REASONING: ${ENABLE_VLM_RESPONSE_REASONING:-false} + # Max images sent to VLM per request (query + context) + APP_VLM_MAX_TOTAL_IMAGES: ${APP_VLM_MAX_TOTAL_IMAGES:-4} + # Max number of query images to include in VLM input + APP_VLM_MAX_QUERY_IMAGES: ${APP_VLM_MAX_QUERY_IMAGES:-1} + # Max number of context images to include in VLM input + APP_VLM_MAX_CONTEXT_IMAGES: ${APP_VLM_MAX_CONTEXT_IMAGES:-1} + APP_VLM_SERVERURL: ${APP_VLM_SERVERURL-"http://vlm-ms:8000/v1"} + APP_VLM_MODELNAME: ${APP_VLM_MODELNAME:-"nvidia/llama-3.1-nemotron-nano-vl-8b-v1"} + + NVIDIA_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"} + + # Number of document chunks to insert in LLM prompt, used only when ENABLE_RERANKER is set to True + APP_RETRIEVER_TOPK: ${APP_RETRIEVER_TOPK:-10} + + # Log level for server, supported level NOTSET, DEBUG, INFO, WARN, ERROR, CRITICAL + LOGLEVEL: ${LOGLEVEL:-INFO} + + # enable multi-turn conversation in the rag chain - this controls conversation history usage + # while doing query rewriting and in LLM prompt + ENABLE_MULTITURN: ${ENABLE_MULTITURN:-True} + + # enable query rewriting for multiturn conversation in the rag chain. 
+ # This will improve accuracy of the retrieiver pipeline but increase latency due to an additional LLM call + ENABLE_QUERYREWRITER: ${ENABLE_QUERYREWRITER:-False} + + # Choose whether to enable citations in the response + ENABLE_CITATIONS: ${ENABLE_CITATIONS:-True} + + # Choose whether to enable/disable guardrails + ENABLE_GUARDRAILS: ${ENABLE_GUARDRAILS:-False} + + # NeMo Guardrails URL when ENABLE_GUARDRAILS is true + NEMO_GUARDRAILS_URL: ${NEMO_GUARDRAILS_URL:-nemo-guardrails-microservice:7331} + + # number of last n chat messages to consider from the provided conversation history + CONVERSATION_HISTORY: 5 + + # Tracing + APP_TRACING_ENABLED: "False" + # HTTP endpoint + APP_TRACING_OTLPHTTPENDPOINT: http://otel-collector:4318/v1/traces + # GRPC endpoint + APP_TRACING_OTLPGRPCENDPOINT: grpc://otel-collector:4317 + + # Choose whether to enable source metadata in document content during generation + ENABLE_SOURCE_METADATA: ${ENABLE_SOURCE_METADATA:-true} + + # Whether to filter content within tags in model responses + FILTER_THINK_TOKENS: ${FILTER_THINK_TOKENS:-true} + + # Whether to enable thinking in the rag chain for llama-3.3-nemotron-super-49b model + ENABLE_NEMOTRON_THINKING: ${ENABLE_NEMOTRON_THINKING:-false} + + # enable reflection (context relevance and response groundedness checking) in the rag chain + ENABLE_REFLECTION: ${ENABLE_REFLECTION:-false} + # Maximum number of context relevance loop iterations + MAX_REFLECTION_LOOP: ${MAX_REFLECTION_LOOP:-3} + # Minimum relevance score threshold (0-2) + CONTEXT_RELEVANCE_THRESHOLD: ${CONTEXT_RELEVANCE_THRESHOLD:-1} + # Minimum groundedness score threshold (0-2) + RESPONSE_GROUNDEDNESS_THRESHOLD: ${RESPONSE_GROUNDEDNESS_THRESHOLD:-1} + # reflection llm + REFLECTION_LLM: ${REFLECTION_LLM:-"nvidia/llama-3_3-nemotron-super-49b-v1_5"} + # reflection llm server url. 
If "", Nvidia hosted API is used + REFLECTION_LLM_SERVERURL: ${REFLECTION_LLM_SERVERURL-"nim-llm:8000"} + # enable iterative query decomposition + ENABLE_QUERY_DECOMPOSITION: ${ENABLE_QUERY_DECOMPOSITION:-false} + # maximum recursion depth for iterative query decomposition + MAX_RECURSION_DEPTH: ${MAX_RECURSION_DEPTH:-3} + + ports: + - "8081:8081" + expose: + - "8081" + shm_size: 5gb + + # Sample UI container which interacts with APIs exposed by rag-server container + rag-playground: + container_name: rag-playground + image: nvcr.io/nvstaging/blueprint/rag-playground:${TAG:-2.3.0.rc0} + build: + # Set context to repo's root directory + context: ../../frontend + dockerfile: ./Dockerfile + args: + # Environment variables for Vite build + VITE_API_CHAT_URL: ${VITE_API_CHAT_URL:-http://rag-server:8081/v1} + VITE_API_VDB_URL: ${VITE_API_VDB_URL:-http://ingestor-server:8082/v1} + VITE_MILVUS_URL: http://milvus:19530 + ports: + - "8090:3000" + expose: + - "3000" + environment: + # Runtime environment variables for Vite + VITE_API_CHAT_URL: ${VITE_API_CHAT_URL:-http://rag-server:8081/v1} + VITE_API_VDB_URL: ${VITE_API_VDB_URL:-http://ingestor-server:8082/v1} + VITE_MILVUS_URL: http://milvus:19530 + depends_on: + - rag-server + +networks: + default: + name: nvidia-rag diff --git a/examples/RAG/library_rag/deploy/nims.yaml b/examples/RAG/library_rag/deploy/nims.yaml new file mode 100644 index 000000000..9fafe77a3 --- /dev/null +++ b/examples/RAG/library_rag/deploy/nims.yaml @@ -0,0 +1,348 @@ +services: + nim-llm: + container_name: nim-llm-ms + image: nvcr.io/nim/nvidia/llama-3_3-nemotron-super-49b-v1_5:1.12.0 + volumes: + - ${MODEL_DIRECTORY:-./}:/opt/nim/.cache + user: "${USERID}" + ports: + - "8999:8000" + expose: + - "8000" + environment: + NGC_API_KEY: ${NGC_API_KEY} + shm_size: 20gb + deploy: + resources: + reservations: + devices: + - driver: nvidia + #count: ${INFERENCE_GPU_COUNT:-all} + device_ids: ['${LLM_MS_GPU_ID:-1}'] + capabilities: [gpu] + healthcheck: + test: ["CMD", "python3", "-c", "import requests; requests.get('http://localhost:8000/v1/health/ready')"] + interval: 10s + timeout: 20s + retries: 100 + profiles: ["", "rag"] + + nemoretriever-embedding-ms: + container_name: nemoretriever-embedding-ms + image: nvcr.io/nim/nvidia/llama-3.2-nv-embedqa-1b-v2:1.9.0 + volumes: + - ${MODEL_DIRECTORY:-./}:/opt/nim/.cache + ports: + - "9080:8000" + expose: + - "8000" + environment: + NGC_API_KEY: ${NGC_API_KEY} + NIM_TRT_ENGINE_HOST_CODE_ALLOWED: 1 + user: "${USERID}" + shm_size: 16GB + deploy: + resources: + reservations: + devices: + - driver: nvidia + # count: ${INFERENCE_GPU_COUNT:-all} + device_ids: ['${EMBEDDING_MS_GPU_ID:-0}'] + capabilities: [gpu] + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/v1/health/ready"] + interval: 30s + timeout: 20s + retries: 3 + start_period: 10m + profiles: ["", "rag", "ingest", "text-embed"] + + nemoretriever-vlm-embedding-ms: + container_name: nemoretriever-vlm-embedding-ms + image: nvcr.io/nvidia/nemo-microservices/llama-3.2-nemoretriever-1b-vlm-embed-v1:1.7.0 + volumes: + - ${MODEL_DIRECTORY:-./}:/opt/nim/.cache + ports: + - "9081:8000" + expose: + - "8000" + environment: + NGC_API_KEY: ${NGC_API_KEY} + NIM_TRT_ENGINE_HOST_CODE_ALLOWED: 1 + user: "${USERID}" + shm_size: 16GB + deploy: + resources: + reservations: + devices: + - driver: nvidia + # count: ${INFERENCE_GPU_COUNT:-all} + device_ids: ['${VLM_EMBEDDING_MS_GPU_ID:-0}'] + capabilities: [gpu] + healthcheck: + test: ["CMD", "curl", "-f", 
"http://localhost:8000/v1/health/ready"] + interval: 30s + timeout: 20s + retries: 3 + start_period: 10m + profiles: ["vlm-embed"] + + nemoretriever-ranking-ms: + container_name: nemoretriever-ranking-ms + image: nvcr.io/nim/nvidia/llama-3.2-nv-rerankqa-1b-v2:1.7.0 + volumes: + - ${MODEL_DIRECTORY:-./}:/opt/nim/.cache + ports: + - "1976:8000" + expose: + - "8000" + environment: + NGC_API_KEY: ${NGC_API_KEY} + user: "${USERID}" + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8000/v1/health/ready"] + interval: 10s + timeout: 20s + retries: 100 + deploy: + resources: + reservations: + devices: + - driver: nvidia + # count: ${INFERENCE_GPU_COUNT:-all} + device_ids: ['${RANKING_MS_GPU_ID:-0}'] + capabilities: [gpu] + profiles: ["", "rag"] + + page-elements: + image: ${YOLOX_IMAGE:-nvcr.io/nim/nvidia/nemoretriever-page-elements-v2}:${YOLOX_TAG:-1.4.0} + ports: + - "8000:8000" + - "8001:8001" + - "8002:8002" + user: root + environment: + - NIM_HTTP_API_PORT=8000 + - NIM_TRITON_LOG_VERBOSE=1 + - NVIDIA_API_KEY=${NGC_API_KEY:-nvidiaapikey} + - NGC_API_KEY=${NGC_API_KEY:-nvidiaapikey} + - CUDA_VISIBLE_DEVICES=0 + - NIM_TRITON_MODEL_BATCH_SIZE=${PAGE_ELEMENTS_BATCH_SIZE:-1} + # NIM OpenTelemetry Settings + - NIM_OTEL_SERVICE_NAME=page-elements + - NIM_OTEL_TRACES_EXPORTER=otlp + - NIM_OTEL_METRICS_EXPORTER=console + - NIM_OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 + - NIM_ENABLE_OTEL=true + # Triton OpenTelemetry Settings + - TRITON_OTEL_URL=http://otel-collector:4318/v1/traces + - TRITON_OTEL_RATE=1 + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ['${YOLOX_MS_GPU_ID:-0}'] + capabilities: [gpu] + runtime: nvidia + profiles: ["", "ingest"] + + graphic-elements: + image: ${YOLOX_GRAPHIC_ELEMENTS_IMAGE:-nvcr.io/nim/nvidia/nemoretriever-graphic-elements-v1}:${YOLOX_GRAPHIC_ELEMENTS_TAG:-1.4.0} + ports: + - "8003:8000" + - "8004:8001" + - "8005:8002" + user: root + environment: + - NIM_HTTP_API_PORT=8000 + - NIM_TRITON_LOG_VERBOSE=1 + - NVIDIA_API_KEY=${NGC_API_KEY:-nvidiaapikey} + - NGC_API_KEY=${NGC_API_KEY:-nvidiaapikey} + - CUDA_VISIBLE_DEVICES=0 + - NIM_TRITON_MODEL_BATCH_SIZE=${GRAPHIC_ELEMENTS_BATCH_SIZE:-1} + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ['${YOLOX_GRAPHICS_MS_GPU_ID:-0}'] + capabilities: [gpu] + runtime: nvidia + profiles: ["", "ingest"] + + table-structure: + image: ${YOLOX_TABLE_STRUCTURE_IMAGE:-nvcr.io/nim/nvidia/nemoretriever-table-structure-v1}:${YOLOX_TABLE_STRUCTURE_TAG:-1.4.0} + ports: + - "8006:8000" + - "8007:8001" + - "8008:8002" + user: root + environment: + - NIM_HTTP_API_PORT=8000 + - NIM_TRITON_LOG_VERBOSE=1 + - NGC_API_KEY=${NGC_API_KEY:-nvidiaapikey} + - CUDA_VISIBLE_DEVICES=0 + - NIM_TRITON_MODEL_BATCH_SIZE=${TABLE_STRUCTURE_BATCH_SIZE:-1} + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ['${YOLOX_TABLE_MS_GPU_ID:-0}'] + capabilities: [gpu] + runtime: nvidia + profiles: ["", "ingest"] + + paddle: + image: ${PADDLE_IMAGE:-nvcr.io/nim/baidu/paddleocr}:${PADDLE_TAG:-1.4.0} + shm_size: 2gb + ports: + - "8009:8000" + - "8010:8001" + - "8011:8002" + user: root + environment: + - NIM_HTTP_API_PORT=8000 + - NIM_TRITON_LOG_VERBOSE=1 + - NVIDIA_API_KEY=${NGC_API_KEY:-nvidiaapikey} + - NGC_API_KEY=${NGC_API_KEY:-nvidiaapikey} + - CUDA_VISIBLE_DEVICES=0 + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ['${OCR_MS_GPU_ID:-${PADDLE_MS_GPU_ID:-0}}'] + capabilities: [gpu] + runtime: nvidia + profiles: ["", "ingest"] + 
+ nemoretriever-ocr: + image: ${NEMORETRIEVER_OCR_IMAGE:-nvcr.io/nvidia/nemo-microservices/nemoretriever-ocr-v1}:${NEMORETRIEVER_OCR_TAG:-1.0.0} + shm_size: 2gb + ports: + - "8012:8000" + - "8013:8001" + - "8014:8002" + user: root + environment: + - NIM_HTTP_API_PORT=8000 + - NIM_TRITON_LOG_VERBOSE=1 + - NVIDIA_API_KEY=${NGC_API_KEY:-nvidiaapikey} + - NGC_API_KEY=${NGC_API_KEY:-nvidiaapikey} + - CUDA_VISIBLE_DEVICES=0 + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ["${OCR_MS_GPU_ID:-${PADDLE_MS_GPU_ID:-0}}"] + capabilities: [gpu] + runtime: nvidia + profiles: ["nemoretriever-ocr"] + + # Optional NIM microservices + nemoretriever-parse: + image: ${NEMORETRIEVER_PARSE_IMAGE:-nvcr.io/nim/nvidia/nemoretriever-parse}:${NEMORETRIEVER_PARSE_TAG:-1.2} + ports: + - "8015:8000" + - "8016:8001" + - "8017:8002" + user: root + environment: + - NIM_HTTP_API_PORT=8000 + - NIM_TRITON_LOG_VERBOSE=1 + - NVIDIA_API_KEY=${NGC_API_KEY:-nvidiaapikey} + - NGC_API_KEY=${NGC_API_KEY:-nvidiaapikey} + - CUDA_VISIBLE_DEVICES=0 + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ['${NEMORETRIEVER_PARSE_MS_GPU_ID:-1}'] + capabilities: [gpu] + runtime: nvidia + profiles: ["nemoretriever-parse"] + + audio: + image: ${AUDIO_IMAGE:-nvcr.io/nim/nvidia/riva-asr}:${AUDIO_TAG:-1.3.0} + shm_size: 2gb + ports: + - "8021:50051" # grpc + - "8022:9000" # http + user: root + environment: + - NIM_TAGS_SELECTOR=name=parakeet-1-1b-ctc-riva-en-us,mode=ofl + - NIM_TRITON_LOG_VERBOSE=1 + - NGC_API_KEY=${NIM_NGC_API_KEY:-${NGC_API_KEY:-ngcapikey}} + - CUDA_VISIBLE_DEVICES=0 + deploy: + resources: + reservations: + devices: + - driver: nvidia + device_ids: ["${AUDIO_MS_GPU_ID:-0}"] + capabilities: [gpu] + runtime: nvidia + profiles: ["audio"] + + vlm-ms: + container_name: nemo-vlm-microservice + image: nvcr.io/nim/nvidia/llama-3.1-nemotron-nano-vl-8b-v1:1.3.1 + volumes: + - ${MODEL_DIRECTORY:-./}:/opt/nim/.cache + ports: + - "1977:8000" + expose: + - "8000" + environment: + NGC_API_KEY: ${NGC_API_KEY} + user: "${USERID}" + healthcheck: + test: ["CMD", "python3", "-c", "import requests; requests.get('http://localhost:8000/v1/health/ready')"] + interval: 10s + timeout: 20s + retries: 100 + deploy: + resources: + reservations: + devices: + - driver: nvidia + # count: ${INFERENCE_GPU_COUNT:-all} + device_ids: ['${VLM_MS_GPU_ID:-5}'] + capabilities: [gpu] + profiles: ["vlm"] + + nim-llm-llama-8b: + container_name: nim-llm-llama-8b + image: nvcr.io/nim/meta/llama-3.1-8b-instruct:1.8.6 + volumes: + - ${MODEL_DIRECTORY:-./}:/opt/nim/.cache + user: "${USERID}" + ports: + - "8991:8000" + expose: + - "8000" + environment: + NGC_API_KEY: ${NGC_API_KEY} + shm_size: 20gb + deploy: + resources: + reservations: + devices: + - driver: nvidia + #count: ${INFERENCE_GPU_COUNT:-all} + device_ids: ['${LLM_8B_MS_GPU_ID:-6}'] + capabilities: [gpu] + healthcheck: + test: ["CMD", "python3", "-c", "import requests; requests.get('http://localhost:8000/v1/health/ready')"] + interval: 10s + timeout: 20s + retries: 100 + profiles: ["llama-8b"] + +networks: + default: + name: nvidia-rag diff --git a/examples/RAG/library_rag/deploy/vectordb.yaml b/examples/RAG/library_rag/deploy/vectordb.yaml new file mode 100644 index 000000000..ed9bf8403 --- /dev/null +++ b/examples/RAG/library_rag/deploy/vectordb.yaml @@ -0,0 +1,102 @@ +services: + + # Milvus can be made GPU accelerated by uncommenting the lines as specified below + milvus: + container_name: milvus-standalone + image: 
milvusdb/milvus:${MILVUS_VERSION:-v2.6.0-gpu} # milvusdb/milvus:v2.6.0 for CPU + command: ["milvus", "run", "standalone"] + environment: + ETCD_ENDPOINTS: etcd:2379 + MINIO_ADDRESS: minio:9010 + KNOWHERE_GPU_MEM_POOL_SIZE: 2048;4096 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus + # healthcheck: + # test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"] + # interval: 30s + # start_period: 90s + # timeout: 20s + # retries: 3 + ports: + - "19530:19530" + - "9091:9091" + depends_on: + - "etcd" + - "minio" + # Comment out this section if CPU based image is used and set below env variables to False + # export APP_VECTORSTORE_ENABLEGPUSEARCH=False + # export APP_VECTORSTORE_ENABLEGPUINDEX=False + deploy: + resources: + reservations: + devices: + - driver: nvidia + capabilities: ["gpu"] + # count: ${INFERENCE_GPU_COUNT:-all} + device_ids: ['${VECTORSTORE_GPU_DEVICE_ID:-0}'] + profiles: ["", "milvus"] + + etcd: + container_name: milvus-etcd + image: quay.io/coreos/etcd:v3.6.4 + environment: + - ETCD_AUTO_COMPACTION_MODE=revision + - ETCD_AUTO_COMPACTION_RETENTION=1000 + - ETCD_QUOTA_BACKEND_BYTES=4294967296 + - ETCD_SNAPSHOT_COUNT=50000 + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd + command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd + healthcheck: + test: ["CMD", "etcdctl", "endpoint", "health"] + interval: 30s + timeout: 20s + retries: 3 + profiles: ["", "milvus"] + + minio: + container_name: milvus-minio + image: minio/minio:RELEASE.2025-07-23T15-54-02Z + environment: + MINIO_ACCESS_KEY: minioadmin + MINIO_SECRET_KEY: minioadmin + ports: + - "9011:9011" + - "9010:9010" + volumes: + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data + command: minio server /minio_data --console-address ":9011" --address ":9010" + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:9010/minio/health/live"] + interval: 30s + timeout: 20s + retries: 3 + profiles: ["", "milvus", "elasticsearch", "minio"] + + elasticsearch: + container_name: elasticsearch + image: "docker.elastic.co/elasticsearch/elasticsearch:9.0.3" + ports: + - 9200:9200 + volumes: + # Run "sudo chown -R 1000:1000 deploy/compose/volumes/elasticsearch/" to fix permissions + - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/elasticsearch:/usr/share/elasticsearch/data + restart: on-failure + environment: + - discovery.type=single-node + - "ES_JAVA_OPTS=-Xms1024m -Xmx1024m" + - xpack.security.enabled=false + - xpack.license.self_generated.type=basic + - network.host=0.0.0.0 + - cluster.routing.allocation.disk.threshold_enabled=false + hostname: elasticsearch + healthcheck: + test: ["CMD", "curl", "-s", "-f", "http://localhost:9200/_cat/health"] + interval: 10s + timeout: 1s + retries: 10 + profiles: ["elasticsearch"] + +networks: + default: + name: nvidia-rag \ No newline at end of file diff --git a/examples/RAG/library_rag/pyproject.toml b/examples/RAG/library_rag/pyproject.toml new file mode 100644 index 000000000..3ebf38a33 --- /dev/null +++ b/examples/RAG/library_rag/pyproject.toml @@ -0,0 +1,25 @@ +[build-system] +build-backend = "setuptools.build_meta" +requires = ["setuptools >= 64", "setuptools-scm>=8"] + +[tool.setuptools_scm] +# NAT uses the --first-parent flag to avoid tags from previous releases which have been merged into the develop branch +# from causing an unexpected version change. This can be safely removed if developing outside of the NAT repository. 
+git_describe_command = "git describe --long --first-parent" +root = "../../.." + +[project] +name = "library_rag" +dynamic = ["version"] +dependencies = [ + "nvidia-nat[langchain]~=1.3", +] +requires-python = ">=3.11,<3.14" +description = "Custom NeMo Agent Toolkit Workflow" +classifiers = ["Programming Language :: Python"] + +[tool.uv.sources] +nvidia-nat = { path = "../../..", editable = true } + +[project.entry-points.'nat.components'] +library_rag = "library_rag.register" \ No newline at end of file diff --git a/examples/RAG/library_rag/src/library_rag/__init__.py b/examples/RAG/library_rag/src/library_rag/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/RAG/library_rag/src/library_rag/configs/config.yml b/examples/RAG/library_rag/src/library_rag/configs/config.yml new file mode 100644 index 000000000..d4bd4351b --- /dev/null +++ b/examples/RAG/library_rag/src/library_rag/configs/config.yml @@ -0,0 +1,31 @@ +general: + use_uvloop: true + + +functions: + library_rag_tool: + _type: library_rag + base_url: "http://localhost:8081" + reranker_top_k: 2 + vdb_top_k: 10 + vdb_endpoint: "http://milvus:19530" + collection_names: ["cuda"] + enable_query_rewriting: True + enable_reranker: True + + #description: Retrieve documents given the input query + +llms: + nim_llm: + _type: nim + model_name: meta/llama-3.3-70b-instruct + temperature: 0 + max_tokens: 4096 + top_p: 1 + +workflow: + _type: tool_calling_agent + tool_names: + - library_rag_tool + llm_name: nim_llm + verbose: true \ No newline at end of file diff --git a/examples/RAG/library_rag/src/library_rag/library_rag_function.py b/examples/RAG/library_rag/src/library_rag/library_rag_function.py new file mode 100644 index 000000000..b9c66576e --- /dev/null +++ b/examples/RAG/library_rag/src/library_rag/library_rag_function.py @@ -0,0 +1,82 @@ +import logging + +from pydantic import Field + +from nat.builder.builder import Builder +from nat.builder.function_info import FunctionInfo +from nat.cli.register_workflow import register_function +from nat.data_models.function import FunctionBaseConfig + +logger = logging.getLogger(__name__) + + +class LibraryRagFunctionConfig(FunctionBaseConfig, name="library_rag"): + """ + NAT function template. Please update the description. 
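+    This tool queries the RAG server's /v1/search endpoint and returns the retrieved document chunks for a given query as a single formatted string.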
+    """
+    base_url: str = Field(description="Base URL of the RAG server, e.g. http://localhost:8081.")
+    reranker_top_k: int = Field(default=2, description="Number of documents returned by the reranker.")
+    vdb_top_k: int = Field(default=10, description="Number of candidate documents retrieved from the vector database.")
+    vdb_endpoint: str = Field(default="", description="Endpoint URL of the vector database, e.g. http://milvus:19530.")
+    collection_names: list[str] = Field(default_factory=list, description="Names of the collections to search.")
+    enable_query_rewriting: bool = Field(default=True, description="Whether to rewrite the query before retrieval.")
+    enable_reranker: bool = Field(default=True, description="Whether to rerank the retrieved documents.")
+
+
+@register_function(config_type=LibraryRagFunctionConfig)
+async def library_rag_function(
+    config: LibraryRagFunctionConfig, builder: Builder
+):
+    import aiohttp
+
+    async def _response_fn(query: str):
+        # Query the RAG server's search endpoint and yield the retrieved documents as a single string
+        url = f"{config.base_url}/v1/search"
+        payload = {
+            "query": query,
+            "reranker_top_k": config.reranker_top_k,
+            "vdb_top_k": config.vdb_top_k,
+            "vdb_endpoint": config.vdb_endpoint,
+            # Multiple collections can be searched by passing several collection names
+            "collection_names": config.collection_names,
+            "enable_query_rewriting": config.enable_query_rewriting,
+            "enable_reranker": config.enable_reranker,
+        }
+
+        logger.info("Your query is %s", query)
+
+        async with aiohttp.ClientSession() as session:
+            try:
+                logger.debug("Sending request to the RAG endpoint %s", url)
+
+                async with session.post(url=url, json=payload) as response:
+                    response.raise_for_status()
+                    results = await response.json()
+
+                logger.info("The results are %s", results)
+
+                if results["total_results"] == 0:
+                    yield ""
+                    return
+
+                # parse each retrieved record into a plain-text block
+                parsed_docs = []
+
+                num_records = results["total_results"]
+                records = results["results"]
+                for i in range(num_records):
+                    document_id = records[i]["document_id"]
+                    content = records[i]["content"]
+                    parsed_document = f'document_id={document_id}\n{content}\n'
+                    parsed_docs.append(parsed_document)
+
+                # combine parsed documents into a single string
+                internal_search_docs = "\n\n---\n\n".join(parsed_docs)
+                yield internal_search_docs
+
+            except aiohttp.ClientError as e:
+                logger.error("Error while querying the RAG endpoint %s: %s", url, e)
+
+    yield FunctionInfo.from_fn(
+        _response_fn,
+        description=("This tool retrieves relevant documents for a given user query. "
+ "This will return relevant documents from the selected collection.")) diff --git a/examples/RAG/library_rag/src/library_rag/register.py b/examples/RAG/library_rag/src/library_rag/register.py new file mode 100644 index 000000000..cd7276447 --- /dev/null +++ b/examples/RAG/library_rag/src/library_rag/register.py @@ -0,0 +1,4 @@ +# flake8: noqa + +# Import any tools which need to be automatically registered here +from library_rag import library_rag_function \ No newline at end of file diff --git a/examples/RAG/library_rag/tests/__init__.py b/examples/RAG/library_rag/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/RAG/library_rag/tests/conftest.py b/examples/RAG/library_rag/tests/conftest.py new file mode 100644 index 000000000..3dab6d46c --- /dev/null +++ b/examples/RAG/library_rag/tests/conftest.py @@ -0,0 +1,78 @@ +import pytest +import asyncio +from unittest.mock import Mock, AsyncMock +from aiohttp import ClientSession +from aioresponses import aioresponses + +from library_rag.library_rag_function import LibraryRagFunctionConfig +from nat.builder.builder import Builder + + +@pytest.fixture +def event_loop(): + """Create an instance of the default event loop for the test session.""" + loop = asyncio.get_event_loop_policy().new_event_loop() + yield loop + loop.close() + + +@pytest.fixture +def mock_builder(): + """Mock Builder instance for testing.""" + return Mock(spec=Builder) + + +@pytest.fixture +def default_config(): + """Default configuration for testing.""" + return LibraryRagFunctionConfig( + base_url="http://localhost:8081", + reranker_top_k=2, + vdb_top_k=10, + vdb_endpoint="http://milvus:19530", + collection_names=["test_collection"], + enable_query_rewriting=True, + enable_reranker=True + ) + + +@pytest.fixture +def minimal_config(): + """Minimal configuration for testing.""" + return LibraryRagFunctionConfig( + base_url="http://localhost:8081" + ) + + +@pytest.fixture +def sample_rag_response(): + """Sample RAG API response.""" + return { + "total_results": 2, + "results": [ + { + "document_id": "doc_1", + "content": "This is the first document content about CUDA programming." + }, + { + "document_id": "doc_2", + "content": "This is the second document content about GPU acceleration." 
+ } + ] + } + + +@pytest.fixture +def empty_rag_response(): + """Empty RAG API response.""" + return { + "total_results": 0, + "results": [] + } + + +@pytest.fixture +def mock_aiohttp_session(): + """Mock aiohttp session for testing.""" + with aioresponses() as m: + yield m diff --git a/examples/RAG/library_rag/tests/test_configs/test_config.yml b/examples/RAG/library_rag/tests/test_configs/test_config.yml new file mode 100644 index 000000000..b76572d8c --- /dev/null +++ b/examples/RAG/library_rag/tests/test_configs/test_config.yml @@ -0,0 +1,28 @@ +general: + use_uvloop: true + +functions: + library_rag_tool: + _type: library_rag + base_url: "http://localhost:8081" + reranker_top_k: 3 + vdb_top_k: 15 + vdb_endpoint: "http://test-milvus:19530" + collection_names: ["test_collection_1", "test_collection_2"] + enable_query_rewriting: False + enable_reranker: True + +llms: + test_llm: + _type: nim + model_name: meta/llama-3.3-70b-instruct + temperature: 0.1 + max_tokens: 2048 + top_p: 0.9 + +workflow: + _type: tool_calling_agent + tool_names: + - library_rag_tool + llm_name: test_llm + verbose: false diff --git a/examples/RAG/library_rag/tests/test_integration.py b/examples/RAG/library_rag/tests/test_integration.py new file mode 100644 index 000000000..99e49f8f1 --- /dev/null +++ b/examples/RAG/library_rag/tests/test_integration.py @@ -0,0 +1,112 @@ +import pytest +import asyncio +import aiohttp +import os +import json +from unittest.mock import Mock, patch +import yaml +from pathlib import Path + +from library_rag.library_rag_function import library_rag_function, LibraryRagFunctionConfig +from nat.builder.builder import Builder + + +class TestIntegration: + """Integration tests for the library RAG function.""" + + async def fetch_health_status(self, rag_endpoint, check_dependencies=True) -> bool: + url = f"{rag_endpoint}/v1/health" + params = {"check_dependencies": str(check_dependencies)} + + async with aiohttp.ClientSession() as session: + async with session.get(url, params=params) as response: + response.raise_for_status() + result = await response.json() + + if result.get("message") == "Service is up": + return True + else: + print("Basic health check failed") + return False + + + @pytest.fixture + def config_from_yaml(self): + """Load configuration from YAML file for integration testing.""" + # Load the actual config file + config_path = Path(__file__).parent.parent / "src" / "library_rag" / "configs" / "config.yml" + + with open(config_path, 'r') as f: + yaml_config = yaml.safe_load(f) + + # Extract the function config + function_config = yaml_config['functions']['library_rag_tool'] + + # Remove the _type field as it's not part of our config model + function_config.pop('_type', None) + + return LibraryRagFunctionConfig(**function_config) + + + @pytest.mark.asyncio + async def test_end_to_end_workflow_mock(self, config_from_yaml): + """Test end-to-end workflow with mocked HTTP responses.""" + mock_builder = Mock(spec=Builder) + + # Mock a realistic RAG response + mock_response = { + "total_results": 2, + "results": [ + { + "document_id": "cuda_guide_1", + "content": "CUDA (Compute Unified Device Architecture) is a parallel computing platform and API model created by NVIDIA." + }, + { + "document_id": "cuda_guide_2", + "content": "CUDA allows software developers to use a CUDA-enabled graphics processing unit for general purpose processing." 
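+                    # The mocked payload mirrors the /v1/search response consumed by the
+                    # function: a "total_results" count plus a "results" list of records,
+                    # each carrying a "document_id" and its "content".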
+                }
+            ]
+        }
+
+        from aioresponses import aioresponses
+
+        url = f"{config_from_yaml.base_url}/v1/search"
+
+        with aioresponses() as mock_http:
+            mock_http.post(url, payload=mock_response)
+
+            # Initialize the function
+            result_generator = library_rag_function(config_from_yaml, mock_builder)
+
+            # Get the response function
+            response_function = None
+            async for item in result_generator:
+                if hasattr(item, 'fn'):
+                    response_function = item.fn
+                    break
+
+            assert response_function is not None
+
+            # Test a realistic query
+            query = "What is CUDA and how does it work?"
+
+            results = []
+            async for result in response_function(query):
+                results.append(result)
+
+            # Verify the response structure
+            assert len(results) >= 1
+            document_content = results[0]
+
+            # Check that both documents are included
+            assert "cuda_guide_1" in document_content
+            assert "cuda_guide_2" in document_content
+            assert "CUDA (Compute Unified Device Architecture)" in document_content
+            assert "graphics processing unit" in document_content
+
+            # Check proper formatting: two tagged documents separated by a divider
+            assert document_content.count("<Document") == 2
+            assert document_content.count("</Document>") == 2
+            assert "\n\n---\n\n" in document_content
\ No newline at end of file
diff --git a/examples/local_rag/configs b/examples/local_rag/configs
new file mode 120000
index 000000000..467f0fb12
--- /dev/null
+++ b/examples/local_rag/configs
@@ -0,0 +1 @@
+/Users/nhennouni/Desktop/NeMo-Agent-Toolkit/examples/local_rag/src/local_rag/configs
\ No newline at end of file
diff --git a/examples/local_rag/data b/examples/local_rag/data
new file mode 120000
index 000000000..32225d4f0
--- /dev/null
+++ b/examples/local_rag/data
@@ -0,0 +1 @@
+/Users/nhennouni/Desktop/NeMo-Agent-Toolkit/examples/local_rag/src/local_rag/data
\ No newline at end of file
diff --git a/examples/local_rag/pyproject.toml b/examples/local_rag/pyproject.toml
new file mode 100644
index 000000000..6177599d0
--- /dev/null
+++ b/examples/local_rag/pyproject.toml
@@ -0,0 +1,25 @@
+[build-system]
+build-backend = "setuptools.build_meta"
+requires = ["setuptools >= 64", "setuptools-scm>=8"]
+
+[tool.setuptools_scm]
+# NAT uses the --first-parent flag to avoid tags from previous releases which have been merged into the develop branch
+# from causing an unexpected version change. This can be safely removed if developing outside of the NAT repository.
+git_describe_command = "git describe --long --first-parent"
+root = "../.."
+ +[project] +name = "local_rag" +dynamic = ["version"] +dependencies = [ + "nvidia-nat[langchain]~=1.3", +] +requires-python = ">=3.11,<3.14" +description = "Custom NeMo Agent Toolkit Workflow" +classifiers = ["Programming Language :: Python"] + +[tool.uv.sources] +nvidia-nat = { path = "../..", editable = true } + +[project.entry-points.'nat.components'] +local_rag = "local_rag.register" \ No newline at end of file diff --git a/examples/local_rag/src/local_rag/__init__.py b/examples/local_rag/src/local_rag/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/examples/local_rag/src/local_rag/configs/config.yml b/examples/local_rag/src/local_rag/configs/config.yml new file mode 100644 index 000000000..cfef8a6f6 --- /dev/null +++ b/examples/local_rag/src/local_rag/configs/config.yml @@ -0,0 +1,25 @@ +general: + use_uvloop: true + + +functions: + local_rag_tool: + _type: local_rag + base_url: "https://10.184.173.138/api/data-engine/workspaces/f44f314c-2c9b-4a83-9050-3fc72d896ddf/data-collections/580bd688-9425-4140-87fc-586b89eefc99/search" + max_records: 5 + #description: Retrieve documents given the input query + +llms: + nim_llm: + _type: nim + model_name: meta/llama-3.3-70b-instruct + temperature: 0 + max_tokens: 4096 + top_p: 1 + +workflow: + _type: tool_calling_agent + tool_names: + - local_rag_tool + llm_name: nim_llm + verbose: true diff --git a/examples/local_rag/src/local_rag/local_rag_function.py b/examples/local_rag/src/local_rag/local_rag_function.py new file mode 100644 index 000000000..a5c8ce20a --- /dev/null +++ b/examples/local_rag/src/local_rag/local_rag_function.py @@ -0,0 +1,87 @@ +import logging + +import json +from urllib.parse import quote + +from pydantic import Field + +from nat.builder.builder import Builder +from nat.builder.function_info import FunctionInfo +from nat.cli.register_workflow import register_function +from nat.data_models.function import FunctionBaseConfig + + +logger = logging.getLogger(__name__) + + +class LocalRagFunctionConfig(FunctionBaseConfig, name="local_rag"): + """ + NAT function template. Please update the description. 
+ """ + + base_url: str = Field(description="Local / Custom RAG URL") + #prompt: str = Field(default="Hello", description="The prompt") + max_records: int = Field(default="1", description="Maximum number of records to be retrieved") + + +@register_function(config_type=LocalRagFunctionConfig) +async def local_rag_function( + config: LocalRagFunctionConfig, builder: Builder +): + import httpx + async with httpx.AsyncClient(verify=False, headers={ + "accept": "application/json", "Content-Type": "application/json" + }) as client: + + async def _response_fn(query: str) -> str: + """ + This tool retrieve relevant context for the given question + """ + logger.info("Your query is %s", query) + + # configure params for RAG endpoint and doc search + url = f"{config.base_url}" + # payload = {"prompt": quote(query, safe=""), "max_records": config.max_records} + payload = {"prompt": query, "max_records": config.max_records} + + # send configured payload to running chain server + logger.debug("Sending request to the RAG endpoint %s", url) + + + url_encoded_prompt = quote(query, safe="") + request = f"{url}?prompt={url_encoded_prompt}&max_records={config.max_records}" + + logger.info("Your URL is %s", request) + + # response = await client.get(url, params=payload) + response = await client.get(request) + + response.raise_for_status() + results = response.json() + + logger.info("The results are %s", results) + + if len(results["records"]) == 0: + return "" + + # parse docs from LangChain/LangGraph Document object to string + parsed_docs = [] + + # iterate over results and store parsed content + + num_records = results["num_records"] + records = results["records"] + for i in range(num_records): + link = records[i]["_links"]["self"]["href"] + content = records[i]["chunk"] + parsed_document = f' link={link}\n"{content}\n' + parsed_docs.append(parsed_document) + + # combine parsed documents into a single string + internal_search_docs = "\n\n---\n\n".join(parsed_docs) + return internal_search_docs + + yield FunctionInfo.from_fn( + _response_fn, + description=("This tool retrieves relevant documents for a given user query." 
+ "This will return relevant documents from the selected collection.")) \ No newline at end of file diff --git a/examples/local_rag/src/local_rag/register.py b/examples/local_rag/src/local_rag/register.py new file mode 100644 index 000000000..565b9d42e --- /dev/null +++ b/examples/local_rag/src/local_rag/register.py @@ -0,0 +1,4 @@ +# flake8: noqa + +# Import any tools which need to be automatically registered here +from local_rag import local_rag_function \ No newline at end of file diff --git a/examples/local_rag/src/local_rag/response.json b/examples/local_rag/src/local_rag/response.json new file mode 100644 index 000000000..3f5dc0239 --- /dev/null +++ b/examples/local_rag/src/local_rag/response.json @@ -0,0 +1,55 @@ +{ + "records": [ + { + "chunk": "were addressed promptly, often within a few hours of contacting support.\r\n**Support responsiveness:** 5/5 ⭐⭐⭐⭐⭐ – The team was always available and provided clear, step-by-step guidance.\r\n**Technical expertise:** 5/5 ⭐⭐⭐⭐⭐ – The support staff had in-depth knowledge of the product and could resolve issues efficiently.\r\n**Installation assistance:** 5/5 ⭐⭐⭐⭐⭐ – The initial setup was straightforward, but the support team was always available for additional help if needed.\r\nThere was one instance in early July 2024 where a minor configuration issue arose, but the support team resolved it within a day, minimizing any disruption to our operations. I would highly recommend the support and service team to anyone considering the DataMaster Pro_storage. Their level of dedication and professionalism has been a key factor in my overall satisfaction with the product.\r\nHow easy was it to use the product?\r\nThe DataMaster Pro_storage was remarkably easy to use right out of the box. The setup process was straightforward, and I was able to get it up and running within a few hours of unboxing.\r\n**Ease of installation:** 5/5 ⭐⭐⭐⭐⭐ – Took less than 2 hours with minimal configuration.\r\n**User interface clarity:** 4.5/5 ⭐⭐⭐⭐⭐ – Very user-friendly with clear instructions and helpful tooltips.\r\n**Learning curve:** Very low – I didn’t need any training or external resources to begin using it effectively.\r\nOne thing I particularly appreciated was the built-in help documentation and online support resources. These were extremely useful", + "_links": { + "self": { + "href": "/api/data-engine/workspaces/f44f314c-2c9b-4a83-9050-3fc72d896ddf/data-collections/580bd688-9425-4140-87fc-586b89eefc99/entities/17e8cb8e-dfba-4a1e-be85-7b260426e8d8" + } + }, + "index": 1, + "score": 0.45823032 + }, + { + "chunk": "**: Any inquiries or issues I had were resolved within minutes, often\nthrough live chat or phone support. The team was not only quick to respond but also provided clear\nand detailed solutions.\n**Training and Resources**: The company provided comprehensive documentation, video tutorials,\nand a dedicated support portal, which I found extremely helpful in maximizing the product's\npotential.\n**Follow-up and Maintenance**: I have not encountered any major issues with the **DataMaster\n\n\nPro_storage**, but when I did have a minor configuration query, the support team followed up to **Overall Satisfaction with Support**: Overall satisfaction with support: 5/5 ?????\nI have not experienced any significant delays or miscommunication with the support team, and their\nlevel of professionalism has been consistently high. 
I would not hesitate to recommend their service\nto others.\nHow does the product meet your expectations for usability and user interface?\nThe DataMaster Pro_storage has met and even exceeded my expectations in terms of usability and\nuser interface. The interface is intuitive and well-designed, making it easy to navigate even for\nsomeone who is not highly technical.\n**User Interface Design**: The interface is clean, with clearly labeled sections and a logical flow. I\nfound it easy to access the main features without any confusion.\n**Usability Features**: The product includes helpful tooltips, context-sensitive help, and a\nwell-organized dashboard that provides at-a-glance information on storage usage", + "_links": { + "self": { + "href": "/api/data-engine/workspaces/f44f314c-2c9b-4a83-9050-3fc72d896ddf/data-collections/580bd688-9425-4140-87fc-586b89eefc99/entities/c8b70ada-e564-4090-91a2-8ab87a32aca3" + } + }, + "index": 2, + "score": 0.47251424 + }, + { + "chunk": ", performance\nmetrics, and system health.\n**Customization Options**: I appreciate the ability to customize the interface according to my\npreferences, such as changing the layout or adjusting notification settings.\n**Overall Satisfaction with UI/UX**: Overall satisfaction with usability and user interface: 5/5 ?????\nI have been using the DataMaster Pro_storage since early 2024, and I have not encountered any\nsignificant usability issues. The product has been a pleasure to work with, and I believe it sets a high\nstandard for storage solutions in terms of user experience.\nAddress: Jalan Dr. Djunjunan No. 5, Palu, SU 68210\nAddress: Jalan Dr. Djunjunan No. 5, Palu, SU 68210\nDate: 19/07/2025 15:52:25\n\n\n", + "_links": { + "self": { + "href": "/api/data-engine/workspaces/f44f314c-2c9b-4a83-9050-3fc72d896ddf/data-collections/580bd688-9425-4140-87fc-586b89eefc99/entities/c8b70ada-e564-4090-91a2-8ab87a32aca3" + } + }, + "index": 3, + "score": 0.49985614 + }, + { + "chunk": "responsiveness has been\nextremely\nlow, scoring only 1.5/5 ??. It has taken over a week to receive a response on multiple\noccasions,\nwhich has significantly delayed resolution of critical issues.\n**Ease of installation:** The initial setup was straightforward and took less than 2\nhours, scoring 4/5\n????. However, this positive experience was overshadowed by the subsequent issues\nencountered\npost-deployment.\n**Overall satisfaction:** Overall satisfaction with the DataMaster X1_storage has been\nextremely\nlow, scoring 1.8/5 ??. The product has failed to deliver on its promises, especially in\n\n\nterms of\nperformance, reliability, and support.\n[Generated content for section: Customer Contact Info]\nDate: 20/07/2025 12:19:06\n\n\n", + "_links": { + "self": { + "href": "/api/data-engine/workspaces/f44f314c-2c9b-4a83-9050-3fc72d896ddf/data-collections/580bd688-9425-4140-87fc-586b89eefc99/entities/c043269b-c548-4f3e-a771-28b18fe5f810" + } + }, + "index": 4, + "score": 0.5283787 + }, + { + "chunk": "potential damage to the hardware.\r\n**Support During Implementation:** The support team was generally available during the implementation phase, but their response times were slow at times. This made it difficult to address certain issues promptly.\r\n**Overall Satisfaction with Deployment:** Overall satisfaction with the deployment and implementation: 3.8/5 ⭐⭐⭐⭐. 
While the system performed well in many aspects, there were areas where the process could have been more efficient and user-friendly.\r\nQuantitative feedback rating - on a scale of 1-5)\r\nThe DataMaster Pro_storage has met some of my expectations but has also shown several critical shortcomings that have impacted my overall satisfaction. While it performs well under moderate loads and has a user-friendly interface, the heat management issues and file corruption problems have been persistent and concerning. The system reached temperatures as high as 55°C during sustained use, which is outside the recommended operating range of 20°C to 40°C. This has raised concerns about long-term reliability and potential hardware damage. Additionally, occasional file corruption has occurred, requiring manual recovery efforts and increasing the risk of data loss. Despite these issues, the initial setup was relatively straightforward, and the support team is knowledgeable, though response times are often slow. Overall, I would rate the DataMaster Pro_storage a 3/5, as it has some redeeming qualities but also several critical issues that need to be addressed.\r\nQuantitative", + "_links": { + "self": { + "href": "/api/data-engine/workspaces/f44f314c-2c9b-4a83-9050-3fc72d896ddf/data-collections/580bd688-9425-4140-87fc-586b89eefc99/entities/2a641eaa-b395-4e05-b062-71c2a7307b19" + } + }, + "index": 5, + "score": 0.53004134 + } + ], + "num_records": 5 +} \ No newline at end of file From e262dd9ab50a2c87db481a03030dafbc992d2376 Mon Sep 17 00:00:00 2001 From: Narimane Hennouni Date: Wed, 24 Sep 2025 14:42:06 -0700 Subject: [PATCH 4/6] rag library mode with unit tests and readme --- examples/local_rag/configs | 1 - examples/local_rag/data | 1 - examples/local_rag/pyproject.toml | 25 ------ examples/local_rag/src/local_rag/__init__.py | 0 .../src/local_rag/configs/config.yml | 25 ------ .../src/local_rag/local_rag_function.py | 87 ------------------- examples/local_rag/src/local_rag/register.py | 4 - .../local_rag/src/local_rag/response.json | 55 ------------ 8 files changed, 198 deletions(-) delete mode 120000 examples/local_rag/configs delete mode 120000 examples/local_rag/data delete mode 100644 examples/local_rag/pyproject.toml delete mode 100644 examples/local_rag/src/local_rag/__init__.py delete mode 100644 examples/local_rag/src/local_rag/configs/config.yml delete mode 100644 examples/local_rag/src/local_rag/local_rag_function.py delete mode 100644 examples/local_rag/src/local_rag/register.py delete mode 100644 examples/local_rag/src/local_rag/response.json diff --git a/examples/local_rag/configs b/examples/local_rag/configs deleted file mode 120000 index 467f0fb12..000000000 --- a/examples/local_rag/configs +++ /dev/null @@ -1 +0,0 @@ -/Users/nhennouni/Desktop/NeMo-Agent-Toolkit/examples/local_rag/src/local_rag/configs \ No newline at end of file diff --git a/examples/local_rag/data b/examples/local_rag/data deleted file mode 120000 index 32225d4f0..000000000 --- a/examples/local_rag/data +++ /dev/null @@ -1 +0,0 @@ -/Users/nhennouni/Desktop/NeMo-Agent-Toolkit/examples/local_rag/src/local_rag/data \ No newline at end of file diff --git a/examples/local_rag/pyproject.toml b/examples/local_rag/pyproject.toml deleted file mode 100644 index 6177599d0..000000000 --- a/examples/local_rag/pyproject.toml +++ /dev/null @@ -1,25 +0,0 @@ -[build-system] -build-backend = "setuptools.build_meta" -requires = ["setuptools >= 64", "setuptools-scm>=8"] - -[tool.setuptools_scm] -# NAT uses the --first-parent flag to avoid 
tags from previous releases which have been merged into the develop branch -# from causing an unexpected version change. This can be safely removed if developing outside of the NAT repository. -git_describe_command = "git describe --long --first-parent" -root = "../.." - -[project] -name = "local_rag" -dynamic = ["version"] -dependencies = [ - "nvidia-nat[langchain]~=1.3", -] -requires-python = ">=3.11,<3.14" -description = "Custom NeMo Agent Toolkit Workflow" -classifiers = ["Programming Language :: Python"] - -[tool.uv.sources] -nvidia-nat = { path = "../..", editable = true } - -[project.entry-points.'nat.components'] -local_rag = "local_rag.register" \ No newline at end of file diff --git a/examples/local_rag/src/local_rag/__init__.py b/examples/local_rag/src/local_rag/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/examples/local_rag/src/local_rag/configs/config.yml b/examples/local_rag/src/local_rag/configs/config.yml deleted file mode 100644 index cfef8a6f6..000000000 --- a/examples/local_rag/src/local_rag/configs/config.yml +++ /dev/null @@ -1,25 +0,0 @@ -general: - use_uvloop: true - - -functions: - local_rag_tool: - _type: local_rag - base_url: "https://10.184.173.138/api/data-engine/workspaces/f44f314c-2c9b-4a83-9050-3fc72d896ddf/data-collections/580bd688-9425-4140-87fc-586b89eefc99/search" - max_records: 5 - #description: Retrieve documents given the input query - -llms: - nim_llm: - _type: nim - model_name: meta/llama-3.3-70b-instruct - temperature: 0 - max_tokens: 4096 - top_p: 1 - -workflow: - _type: tool_calling_agent - tool_names: - - local_rag_tool - llm_name: nim_llm - verbose: true diff --git a/examples/local_rag/src/local_rag/local_rag_function.py b/examples/local_rag/src/local_rag/local_rag_function.py deleted file mode 100644 index a5c8ce20a..000000000 --- a/examples/local_rag/src/local_rag/local_rag_function.py +++ /dev/null @@ -1,87 +0,0 @@ -import logging - -import json -from urllib.parse import quote - -from pydantic import Field - -from nat.builder.builder import Builder -from nat.builder.function_info import FunctionInfo -from nat.cli.register_workflow import register_function -from nat.data_models.function import FunctionBaseConfig - - -logger = logging.getLogger(__name__) - - -class LocalRagFunctionConfig(FunctionBaseConfig, name="local_rag"): - """ - NAT function template. Please update the description. 
- """ - - base_url: str = Field(description="Local / Custom RAG URL") - #prompt: str = Field(default="Hello", description="The prompt") - max_records: int = Field(default="1", description="Maximum number of records to be retrieved") - - -@register_function(config_type=LocalRagFunctionConfig) -async def local_rag_function( - config: LocalRagFunctionConfig, builder: Builder -): - import httpx - async with httpx.AsyncClient(verify=False, headers={ - "accept": "application/json", "Content-Type": "application/json" - }) as client: - - async def _response_fn(query: str) -> str: - """ - This tool retrieve relevant context for the given question - """ - logger.info("Your query is %s", query) - - # configure params for RAG endpoint and doc search - url = f"{config.base_url}" - # payload = {"prompt": quote(query, safe=""), "max_records": config.max_records} - payload = {"prompt": query, "max_records": config.max_records} - - # send configured payload to running chain server - logger.debug("Sending request to the RAG endpoint %s", url) - - - url_encoded_prompt = quote(query, safe="") - request = f"{url}?prompt={url_encoded_prompt}&max_records={config.max_records}" - - logger.info("Your URL is %s", request) - - # response = await client.get(url, params=payload) - response = await client.get(request) - - response.raise_for_status() - results = response.json() - - logger.info("The results are %s", results) - - if len(results["records"]) == 0: - return "" - - # parse docs from LangChain/LangGraph Document object to string - parsed_docs = [] - - # iterate over results and store parsed content - - num_records = results["num_records"] - records = results["records"] - for i in range(num_records): - link = records[i]["_links"]["self"]["href"] - content = records[i]["chunk"] - parsed_document = f' link={link}\n"{content}\n' - parsed_docs.append(parsed_document) - - # combine parsed documents into a single string - internal_search_docs = "\n\n---\n\n".join(parsed_docs) - return internal_search_docs - - yield FunctionInfo.from_fn( - _response_fn, - description=("This tool retrieves relevant documents for a given user query." 
- "This will return relevant documents from the selected collection.")) \ No newline at end of file diff --git a/examples/local_rag/src/local_rag/register.py b/examples/local_rag/src/local_rag/register.py deleted file mode 100644 index 565b9d42e..000000000 --- a/examples/local_rag/src/local_rag/register.py +++ /dev/null @@ -1,4 +0,0 @@ -# flake8: noqa - -# Import any tools which need to be automatically registered here -from local_rag import local_rag_function \ No newline at end of file diff --git a/examples/local_rag/src/local_rag/response.json b/examples/local_rag/src/local_rag/response.json deleted file mode 100644 index 3f5dc0239..000000000 --- a/examples/local_rag/src/local_rag/response.json +++ /dev/null @@ -1,55 +0,0 @@ -{ - "records": [ - { - "chunk": "were addressed promptly, often within a few hours of contacting support.\r\n**Support responsiveness:** 5/5 ⭐⭐⭐⭐⭐ – The team was always available and provided clear, step-by-step guidance.\r\n**Technical expertise:** 5/5 ⭐⭐⭐⭐⭐ – The support staff had in-depth knowledge of the product and could resolve issues efficiently.\r\n**Installation assistance:** 5/5 ⭐⭐⭐⭐⭐ – The initial setup was straightforward, but the support team was always available for additional help if needed.\r\nThere was one instance in early July 2024 where a minor configuration issue arose, but the support team resolved it within a day, minimizing any disruption to our operations. I would highly recommend the support and service team to anyone considering the DataMaster Pro_storage. Their level of dedication and professionalism has been a key factor in my overall satisfaction with the product.\r\nHow easy was it to use the product?\r\nThe DataMaster Pro_storage was remarkably easy to use right out of the box. The setup process was straightforward, and I was able to get it up and running within a few hours of unboxing.\r\n**Ease of installation:** 5/5 ⭐⭐⭐⭐⭐ – Took less than 2 hours with minimal configuration.\r\n**User interface clarity:** 4.5/5 ⭐⭐⭐⭐⭐ – Very user-friendly with clear instructions and helpful tooltips.\r\n**Learning curve:** Very low – I didn’t need any training or external resources to begin using it effectively.\r\nOne thing I particularly appreciated was the built-in help documentation and online support resources. These were extremely useful", - "_links": { - "self": { - "href": "/api/data-engine/workspaces/f44f314c-2c9b-4a83-9050-3fc72d896ddf/data-collections/580bd688-9425-4140-87fc-586b89eefc99/entities/17e8cb8e-dfba-4a1e-be85-7b260426e8d8" - } - }, - "index": 1, - "score": 0.45823032 - }, - { - "chunk": "**: Any inquiries or issues I had were resolved within minutes, often\nthrough live chat or phone support. The team was not only quick to respond but also provided clear\nand detailed solutions.\n**Training and Resources**: The company provided comprehensive documentation, video tutorials,\nand a dedicated support portal, which I found extremely helpful in maximizing the product's\npotential.\n**Follow-up and Maintenance**: I have not encountered any major issues with the **DataMaster\n\n\nPro_storage**, but when I did have a minor configuration query, the support team followed up to **Overall Satisfaction with Support**: Overall satisfaction with support: 5/5 ?????\nI have not experienced any significant delays or miscommunication with the support team, and their\nlevel of professionalism has been consistently high. 
I would not hesitate to recommend their service\nto others.\nHow does the product meet your expectations for usability and user interface?\nThe DataMaster Pro_storage has met and even exceeded my expectations in terms of usability and\nuser interface. The interface is intuitive and well-designed, making it easy to navigate even for\nsomeone who is not highly technical.\n**User Interface Design**: The interface is clean, with clearly labeled sections and a logical flow. I\nfound it easy to access the main features without any confusion.\n**Usability Features**: The product includes helpful tooltips, context-sensitive help, and a\nwell-organized dashboard that provides at-a-glance information on storage usage", - "_links": { - "self": { - "href": "/api/data-engine/workspaces/f44f314c-2c9b-4a83-9050-3fc72d896ddf/data-collections/580bd688-9425-4140-87fc-586b89eefc99/entities/c8b70ada-e564-4090-91a2-8ab87a32aca3" - } - }, - "index": 2, - "score": 0.47251424 - }, - { - "chunk": ", performance\nmetrics, and system health.\n**Customization Options**: I appreciate the ability to customize the interface according to my\npreferences, such as changing the layout or adjusting notification settings.\n**Overall Satisfaction with UI/UX**: Overall satisfaction with usability and user interface: 5/5 ?????\nI have been using the DataMaster Pro_storage since early 2024, and I have not encountered any\nsignificant usability issues. The product has been a pleasure to work with, and I believe it sets a high\nstandard for storage solutions in terms of user experience.\nAddress: Jalan Dr. Djunjunan No. 5, Palu, SU 68210\nAddress: Jalan Dr. Djunjunan No. 5, Palu, SU 68210\nDate: 19/07/2025 15:52:25\n\n\n", - "_links": { - "self": { - "href": "/api/data-engine/workspaces/f44f314c-2c9b-4a83-9050-3fc72d896ddf/data-collections/580bd688-9425-4140-87fc-586b89eefc99/entities/c8b70ada-e564-4090-91a2-8ab87a32aca3" - } - }, - "index": 3, - "score": 0.49985614 - }, - { - "chunk": "responsiveness has been\nextremely\nlow, scoring only 1.5/5 ??. It has taken over a week to receive a response on multiple\noccasions,\nwhich has significantly delayed resolution of critical issues.\n**Ease of installation:** The initial setup was straightforward and took less than 2\nhours, scoring 4/5\n????. However, this positive experience was overshadowed by the subsequent issues\nencountered\npost-deployment.\n**Overall satisfaction:** Overall satisfaction with the DataMaster X1_storage has been\nextremely\nlow, scoring 1.8/5 ??. The product has failed to deliver on its promises, especially in\n\n\nterms of\nperformance, reliability, and support.\n[Generated content for section: Customer Contact Info]\nDate: 20/07/2025 12:19:06\n\n\n", - "_links": { - "self": { - "href": "/api/data-engine/workspaces/f44f314c-2c9b-4a83-9050-3fc72d896ddf/data-collections/580bd688-9425-4140-87fc-586b89eefc99/entities/c043269b-c548-4f3e-a771-28b18fe5f810" - } - }, - "index": 4, - "score": 0.5283787 - }, - { - "chunk": "potential damage to the hardware.\r\n**Support During Implementation:** The support team was generally available during the implementation phase, but their response times were slow at times. This made it difficult to address certain issues promptly.\r\n**Overall Satisfaction with Deployment:** Overall satisfaction with the deployment and implementation: 3.8/5 ⭐⭐⭐⭐. 
While the system performed well in many aspects, there were areas where the process could have been more efficient and user-friendly.\r\nQuantitative feedback rating - on a scale of 1-5)\r\nThe DataMaster Pro_storage has met some of my expectations but has also shown several critical shortcomings that have impacted my overall satisfaction. While it performs well under moderate loads and has a user-friendly interface, the heat management issues and file corruption problems have been persistent and concerning. The system reached temperatures as high as 55°C during sustained use, which is outside the recommended operating range of 20°C to 40°C. This has raised concerns about long-term reliability and potential hardware damage. Additionally, occasional file corruption has occurred, requiring manual recovery efforts and increasing the risk of data loss. Despite these issues, the initial setup was relatively straightforward, and the support team is knowledgeable, though response times are often slow. Overall, I would rate the DataMaster Pro_storage a 3/5, as it has some redeeming qualities but also several critical issues that need to be addressed.\r\nQuantitative", - "_links": { - "self": { - "href": "/api/data-engine/workspaces/f44f314c-2c9b-4a83-9050-3fc72d896ddf/data-collections/580bd688-9425-4140-87fc-586b89eefc99/entities/2a641eaa-b395-4e05-b062-71c2a7307b19" - } - }, - "index": 5, - "score": 0.53004134 - } - ], - "num_records": 5 -} \ No newline at end of file From 965c1fa2669b02911555f18eb711d7f167446834 Mon Sep 17 00:00:00 2001 From: NarimaneH Date: Thu, 25 Sep 2025 00:56:14 +0000 Subject: [PATCH 5/6] modified READme and added dataset --- examples/RAG/library_rag/.env_library | 127 ++++++++ examples/RAG/library_rag/README.md | 419 +++++++++++++++++--------- 2 files changed, 404 insertions(+), 142 deletions(-) create mode 100644 examples/RAG/library_rag/.env_library diff --git a/examples/RAG/library_rag/.env_library b/examples/RAG/library_rag/.env_library new file mode 100644 index 000000000..48c27bff7 --- /dev/null +++ b/examples/RAG/library_rag/.env_library @@ -0,0 +1,127 @@ +export NVIDIA_API_KEY=${NGC_API_KEY} + +# Ingestor server specific configurations +# === Vector DB specific configurations === +export APP_VECTORSTORE_URL=http://localhost:19530 +export APP_VECTORSTORE_NAME=milvus +export APP_VECTORSTORE_INDEXTYPE=GPU_CAGRA +export APP_VECTORSTORE_SEARCHTYPE=dense +export APP_VECTORSTORE_CONSISTENCYLEVEL=Strong +export APP_VECTORSTORE_ENABLEGPUINDEX=True +export APP_VECTORSTORE_ENABLEGPUSEARCH=True +export COLLECTION_NAME=test_native + +# === MINIO specific configurations === +export MINIO_ENDPOINT=localhost:9010 +export MINIO_ACCESSKEY=minioadmin +export MINIO_SECRETKEY=minioadmin + +# === Embedding Model specific configurations === +export APP_EMBEDDINGS_SERVERURL=localhost:9080 +export APP_EMBEDDINGS_MODELNAME=nvidia/llama-3.2-nv-embedqa-1b-v2 +export APP_EMBEDDINGS_DIMENSIONS=2048 + +# === NV-Ingest Connection Configurations === +export APP_NVINGEST_MESSAGECLIENTHOSTNAME=localhost +export APP_NVINGEST_MESSAGECLIENTPORT=7670 + +# === NV-Ingest Extract Configurations === +export APP_NVINGEST_EXTRACTTEXT=True +export APP_NVINGEST_EXTRACTINFOGRAPHICS=False +export APP_NVINGEST_EXTRACTTABLES=True +export APP_NVINGEST_EXTRACTCHARTS=True +export APP_NVINGEST_EXTRACTIMAGES=False +export APP_NVINGEST_PDFEXTRACTMETHOD=None +export APP_NVINGEST_TEXTDEPTH=page + +# === NV-Ingest Splitting Configurations === +export APP_NVINGEST_CHUNKSIZE=512 +export APP_NVINGEST_CHUNKOVERLAP=150 +export 
APP_NVINGEST_ENABLEPDFSPLITTER=True + +# === NV-Ingest Caption Model configurations === +export APP_NVINGEST_CAPTIONMODELNAME=nvidia/llama-3.1-nemotron-nano-vl-8b-v1 +export APP_NVINGEST_CAPTIONENDPOINTURL=http://localhost:1977/v1/chat/completions + +# Choose whether to store the extracted content in the vector store for citation support +export ENABLE_CITATIONS=True + +# Log level for server +export LOGLEVEL=INFO + +# [Optional] Redis configuration for task status and result storage +export REDIS_HOST=localhost +export REDIS_PORT=6379 +export REDIS_DB=0 + +# Bulk upload to MinIO +export ENABLE_MINIO_BULK_UPLOAD=True + +# --- Additional variables from rag-server --- +export EXAMPLE_PATH=./nvidia_rag/rag_server + +# === Vector DB additional configs === +export APP_RETRIEVER_SCORETHRESHOLD=0.25 +export VECTOR_DB_TOPK=100 + +# === LLM Model specific configurations === +export APP_LLM_MODELNAME="nvidia/llama-3.3-nemotron-super-49b-v1" +export APP_LLM_SERVERURL=localhost:8999 + +# === Query Rewriter Model specific configurations === +export APP_QUERYREWRITER_MODELNAME="meta/llama-3.1-8b-instruct" +export APP_QUERYREWRITER_SERVERURL=localhost:8991 + +# === Reranking Model specific configurations === +export APP_RANKING_SERVERURL=localhost:1976 +export APP_RANKING_MODELNAME="nvidia/llama-3.2-nv-rerankqa-1b-v2" +export ENABLE_RERANKER=True + +# === VLM Model specific configurations === +export ENABLE_VLM_INFERENCE=False +export APP_VLM_SERVERURL=http://localhost:1977/v1 +export APP_VLM_MODELNAME="nvidia/llama-3.1-nemotron-nano-vl-8b-v1" + +# Number of document chunks to insert in LLM prompt (when reranker enabled) +export APP_RETRIEVER_TOPK=10 + +# === Conversation and Query Handling === +export ENABLE_MULTITURN=True +export ENABLE_QUERYREWRITER=False + +# === Guardrails === +export ENABLE_GUARDRAILS=False +export NEMO_GUARDRAILS_URL=localhost:7331 + +# === Conversation History === +export CONVERSATION_HISTORY=5 + +# === Tracing === +export APP_TRACING_ENABLED=False +export APP_TRACING_OTLPHTTPENDPOINT=http://localhost:4318/v1/traces +export APP_TRACING_OTLPGRPCENDPOINT=grpc://localhost:4317 + +# === Source Metadata and Filtering === +export ENABLE_SOURCE_METADATA=true +export FILTER_THINK_TOKENS=true +export ENABLE_NEMOTRON_THINKING=false + +# === Reflection (context relevance/groundedness checking) === +export ENABLE_REFLECTION=false +export MAX_REFLECTION_LOOP=3 +export CONTEXT_RELEVANCE_THRESHOLD=1 +export RESPONSE_GROUNDEDNESS_THRESHOLD=1 +export REFLECTION_LLM="mistralai/mixtral-8x22b-instruct-v0.1" +export REFLECTION_LLM_SERVERURL=localhost:8998 + +# === Document Summary Model specific configurations === +export SUMMARY_LLM="nvidia/llama-3.3-nemotron-super-49b-v1" +export SUMMARY_LLM_SERVERURL=localhost:8999 +export SUMMARY_LLM_MAX_CHUNK_LENGTH=50000 + +# === Temporary directory === +export TEMP_DIR=./tmp-data/ + +# === Prompt configuration === +# Change this to the absolute path of the prompt.yaml file you want to use +# export PROMPT_CONFIG_FILE=src/nvidia_rag/rag_server/prompt.yaml \ No newline at end of file diff --git a/examples/RAG/library_rag/README.md b/examples/RAG/library_rag/README.md index c961f9652..00d80e162 100644 --- a/examples/RAG/library_rag/README.md +++ b/examples/RAG/library_rag/README.md @@ -1,183 +1,329 @@ # NVIDIA RAG Python Package Usage Guide This guide demonstrates how to use a NAT agent with the NVIDIA RAG Python client as a tool. 
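+
+Under the hood, the `library_rag` tool POSTs the user query to the RAG server's `/v1/search`
+endpoint and formats the returned records for the agent. The sketch below shows an equivalent
+standalone call, assuming the rag-server from the deployment steps below is already running on
+`http://localhost:8081` and that a collection named `test_collection` has been ingested
+(adjust both to your deployment):
+
+```python
+import asyncio
+
+import aiohttp
+
+
+async def search(query: str) -> None:
+    payload = {
+        "query": query,
+        "reranker_top_k": 2,
+        "vdb_top_k": 10,
+        "collection_names": ["test_collection"],
+        "enable_query_rewriting": True,
+        "enable_reranker": True,
+    }
+    async with aiohttp.ClientSession() as session:
+        # POST the query to the RAG server's search endpoint
+        async with session.post("http://localhost:8081/v1/search", json=payload) as response:
+            response.raise_for_status()
+            results = await response.json()
+    # Print a short preview of each retrieved record
+    for record in results.get("results", []):
+        print(record["document_id"], record["content"][:80])
+
+
+asyncio.run(search("What is CUDA?"))
+```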
-## Table of Contents +# Get Started With NVIDIA RAG Blueprint -- [Installation](#installation) -- [Setup Dependencies](#setup-dependencies) -- [API Usage Examples](#api-usage-examples) -- [Collection Management](#collection-management) -- [Document Operations](#document-operations) -- [RAG Queries](#rag-queries) -- [Search Operations](#search-operations) -- [Advanced Features](#advanced-features) -- [Cleanup Operations](#cleanup-operations) +Use the following documentation to get started with the NVIDIA RAG Blueprint. -## Installation +- [Obtain an API Key](#obtain-an-api-key) +- [Interact using native python APIs](#interact-using-native-python-apis) +- [Deploy With Docker Compose](#deploy-with-docker-compose) +- [Deploy With Helm Chart](#deploy-with-helm-chart) +- [Data Ingestion](#data-ingestion) -> **Note**: Python version **3.12 or higher** is supported. + +## Obtain an API Key + +You need to generate an API key +to access NIM services, to access models hosted in the NVIDIA API Catalog, and to download models on-premises. +For more information, refer to [NGC API Keys](https://docs.nvidia.com/ngc/gpu-cloud/ngc-private-registry-user-guide/index.html#ngc-api-keys). + +To generate an API key, use the following procedure. + +1. Go to https://org.ngc.nvidia.com/setup/api-keys. +2. Click **+ Generate Personal Key**. +3. Enter a **Key Name**. +4. For **Services Included**, select **NGC Catalog** and **Public API Endpoints**. +5. Click **Generate Personal Key**. + +After you generate your key, export your key as an environment variable by using the following code. + +```bash +export NGC_API_KEY="" +``` + + + +## Deploy With Docker Compose + +Use these procedures to deploy with Docker Compose for a single node deployment. Alternatively, you can [Deploy With Helm Chart](#deploy-with-helm-chart) to deploy on a Kubernetes cluster. + +Developers need to deploy ingestion services and rag services using seperate dedicated docker compose files. +For both retrieval and ingestion services, by default all the models are deployed on-prem. Follow relevant section below as per your requirement and hardware availability. + +- Start the Microservices + - [Using on-prem models](#start-using-on-prem-models) + - [Using NVIDIA hosted models](#start-using-nvidia-hosted-models) ### Prerequisites -1. **Install Python >= 3.12 and development headers:** +1. Install Docker Engine. For more information, see [Ubuntu](https://docs.docker.com/engine/install/ubuntu/). + +2. Install Docker Compose. For more information, see [install the Compose plugin](https://docs.docker.com/compose/install/linux/). + + a. Ensure the Docker Compose plugin version is 2.29.1 or later. + + b. After you get the Docker Compose plugin installed, run `docker compose version` to confirm. + +3. To pull images required by the blueprint from NGC, you must first authenticate Docker with nvcr.io. Use the NGC API Key you created in [Obtain an API Key](#obtain-an-api-key). + ```bash - sudo add-apt-repository ppa:deadsnakes/ppa - sudo apt update - sudo apt install python3.12 - sudo apt-get install python3.12-dev + export NGC_API_KEY="nvapi-..." + echo "${NGC_API_KEY}" | docker login nvcr.io -u '$oauthtoken' --password-stdin ``` -2. **Install uv:** - Follow instructions from [https://docs.astral.sh/uv/getting-started/installation/](https://docs.astral.sh/uv/getting-started/installation/) +4. Some containers with are enabled with GPU acceleration, such as Milvus and NVIDIA NIMS deployed on-prem. 
To configure Docker for GPU-accelerated containers, [install](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html), the NVIDIA Container Toolkit + +5. Ensure you meet [the hardware requirements if you are deploying models on-prem](./support-matrix.md). + +6. Change directory to the library rag example: ```cd examples/RAG/library_rag``` + + +### Start using on-prem models + +Use the following procedure to start all containers needed for this blueprint. This launches the ingestion services followed by the rag services and all of its dependent NIMs on-prem. + +1. Fulfill the [prerequisites](#prerequisites). Ensure you meet [the hardware requirements](./support-matrix.md). + +2. Create a directory to cache the models and export the path to the cache as an environment variable. -3. **Create and activate virtual environment:** ```bash - # Create virtual environment - uv venv --python=python3.12 - - # Activate virtual environment - source .venv/bin/activate + mkdir -p ~/.cache/model-cache + export MODEL_DIRECTORY=~/.cache/model-cache ``` -### Installation +3. Export all the required environment variables to use on-prem models. Ensure the section `Endpoints for using cloud NIMs` is commented in this file. -```bash -uv pip install nvidia-rag[all] -``` + ```bash + source deploy/.env + ``` -### Verify Installation +4. Start all required NIMs. -Check that the package is installed in your virtual environment: + Before running the command please ensure the GPU allocation is done appropriately in the deploy/compose/.env. You might need to override them + for the hardware you are deploying this blueprint on. The default assumes you are deploying this on a 2XH100 environment. -```bash -uv pip show nvidia_rag | grep Location -``` + ```bash + USERID=$(id -u) docker compose -f deploy/nims.yaml up -d + ``` -The location should be inside your virtual environment at: `/rag/.venv/lib/python3.12/site-packages` + - Wait till the `nemoretriever-ranking-ms`, `nemoretriever-embedding-ms` and `nim-llm-ms` NIMs are in healthy state before proceeding further. + - The nemo LLM service may take upto 30 mins to start for the first time as the model is downloaded and cached. The models are downloaded and cached in the path specified by `MODEL_DIRECTORY`. Subsequent deployments will take 2-5 mins to startup based on the GPU profile. -## Setup Dependencies + - The default configuration allocates one GPU (GPU ID 1) to `nim-llm-ms` which defaults to minimum GPUs needed for H100 or B200 profile. If you are deploying the solution on A100, please allocate 2 available GPUs by exporting below env variable before launching: + ```bash + export LLM_MS_GPU_ID=1,2 + ``` -### Prerequisites + - To start just the NIMs specific to rag or ingestion add the `--profile rag` or `--profile ingest` flag to the command. -Fulfill the [prerequisites](../docs/quickstart.md#prerequisites) to setup Docker on your system. + - Ensure all the below are running before proceeding further -### 1. Configure API Key + ```bash + watch -n 2 'docker ps --format "table {{.Names}}\t{{.Status}}"' + ``` -First, obtain an NGC API key by following the steps [here](../docs/quickstart.md#obtain-an-api-key). 
+ ```output + NAMES STATUS -```python -import os -from getpass import getpass -from dotenv import load_dotenv - -# Set your NGC API key -if not os.environ.get("NGC_API_KEY", "").startswith("nvapi-"): - candidate_api_key = getpass("NVAPI Key (starts with nvapi-): ") - assert candidate_api_key.startswith("nvapi-"), f"{candidate_api_key[:5]}... is not a valid key" - os.environ["NGC_API_KEY"] = candidate_api_key -``` + nemoretriever-ranking-ms Up 14 minutes (healthy) + compose-page-elements-1 Up 14 minutes + compose-paddle-1 Up 14 minutes + compose-graphic-elements-1 Up 14 minutes + compose-table-structure-1 Up 14 minutes + nemoretriever-embedding-ms Up 14 minutes (healthy) + nim-llm-ms Up 14 minutes (healthy) + ``` -### 2. Docker Login -```bash -echo "${NGC_API_KEY}" | docker login nvcr.io -u '$oauthtoken' --password-stdin -``` +5. Start the vector db containers from the repo root. + ```bash + docker compose -f deploy/vectordb.yaml up -d + ``` -### 3. Load Default Configuration + [!TIP] + By default GPU accelerated Milvus DB is deployed. You can choose the GPU ID to be allocated using below env variable. + ```bash + VECTORSTORE_GPU_DEVICE_ID=0 + ``` -```python -load_dotenv(dotenv_path=".env_library", override=True) -``` + For B200 and A100 GPUs, use Milvus CPU indexing due to known retrieval accuracy issues with Milvus GPU indexing and search. Export following environment variables to disable Milvus GPU ingexing and search. + ```bash + export APP_VECTORSTORE_ENABLEGPUSEARCH=False + export APP_VECTORSTORE_ENABLEGPUINDEX=False + ``` -> **💡 Tip:** Override default configurations using `os.environ` in your code. Reimport the `nvidia_rag` package and restart the Nvidia Ingest runtime for changes to take effect. +6. Start the ingestion containers from the repo root. This pulls the prebuilt containers from NGC and deploys it on your system. -### 4. Setup Milvus Vector Database + ```bash + docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d + ``` -Configure GPU device (default uses GPU indexing): +7. Start the rag containers from the repo root. This pulls the prebuilt containers from NGC and deploys it on your system. -```python -os.environ["VECTORSTORE_GPU_DEVICE_ID"] = "0" -``` + ```bash + docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d + ``` -> **Note:** For CPU-only Milvus, follow instructions in [milvus-configuration.md](../docs/milvus-configuration.md). + You can check the status of the rag-server and its dependencies by issuing this curl command + ```bash + curl -X 'GET' 'http://workstation_ip:8081/v1/health?check_dependencies=true' -H 'accept: application/json' + ``` -Start Milvus: -```bash -docker compose -f ../deploy/compose/vectordb.yaml up -d -``` +8. Confirm all the below mentioned containers are running. -### 5. 
Setup NIMs (Neural Inference Microservices) + ```bash + docker ps --format "table {{.ID}}\t{{.Names}}\t{{.Status}}" + ``` -Choose either on-premises or cloud-hosted models: + *Example Output* + + ```output + NAMES STATUS + compose-nv-ingest-ms-runtime-1 Up 5 minutes (healthy) + ingestor-server Up 5 minutes + compose-redis-1 Up 5 minutes + rag-playground Up 9 minutes + rag-server Up 9 minutes + milvus-standalone Up 36 minutes + milvus-minio Up 35 minutes (healthy) + milvus-etcd Up 35 minutes (healthy) + nemoretriever-ranking-ms Up 38 minutes (healthy) + compose-page-elements-1 Up 38 minutes + compose-paddle-1 Up 38 minutes + compose-graphic-elements-1 Up 38 minutes + compose-table-structure-1 Up 38 minutes + nemoretriever-embedding-ms Up 38 minutes (healthy) + nim-llm-ms Up 38 minutes (healthy) + ``` -#### Option 1: On-Premises Models +9. Open a web browser and access `http://localhost:8090` to use the RAG Playground. You can use the upload tab to ingest files into the server or follow [the notebooks](../notebooks/) to understand the API usage. -Ensure you meet the [hardware requirements](../README.md#hardware-requirements). Default configuration requires 2xH100. +10. To stop all running services, after making some [customizations](#next-steps) + ```bash + docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down + docker compose -f deploy/compose/nims.yaml down + docker compose -f deploy/compose/docker-compose-rag-server.yaml down + docker compose -f deploy/compose/vectordb.yaml down + ``` -```bash -# Create model cache directory -mkdir -p ~/.cache/model-cache -``` +**📝 Notes:** -```python -# Configure model directory -os.environ["MODEL_DIRECTORY"] = os.path.expanduser("~/.cache/model-cache") - -# Configure GPU IDs for microservices -os.environ["EMBEDDING_MS_GPU_ID"] = "0" -os.environ["RANKING_MS_GPU_ID"] = "0" -os.environ["YOLOX_MS_GPU_ID"] = "0" -os.environ["YOLOX_GRAPHICS_MS_GPU_ID"] = "0" -os.environ["YOLOX_TABLE_MS_GPU_ID"] = "0" -os.environ["OCR_MS_GPU_ID"] = "0" -os.environ["LLM_MS_GPU_ID"] = "1" -``` +1. A single NVIDIA A100-80GB or H100-80GB, B200 GPU can be used to start non-LLM NIMs (nemoretriever-embedding-ms, nemoretriever-ranking-ms, and ingestion services like page-elements, paddle, graphic-elements, and table-structure) for ingestion and RAG workflows. You can control which GPU is used for each service by setting these environment variables in `deploy/compose/.env` file before launching: + ```bash + EMBEDDING_MS_GPU_ID=0 + RANKING_MS_GPU_ID=0 + YOLOX_MS_GPU_ID=0 + YOLOX_GRAPHICS_MS_GPU_ID=0 + YOLOX_TABLE_MS_GPU_ID=0 + PADDLE_MS_GPU_ID=0 + ``` -Deploy NIMs (may take time for model downloads): -```bash -USERID=$(id -u) docker compose -f ../deploy/compose/nims.yaml up -d -``` +2. If the NIMs are deployed in a different workstation or outside the nvidia-rag docker network on the same system, replace the host address of the below URLs with workstation IPs. 
-Monitor container status: -```bash -docker ps -``` + ```bash + APP_EMBEDDINGS_SERVERURL="workstation_ip:8000" + APP_LLM_SERVERURL="workstation_ip:8000" + APP_RANKING_SERVERURL="workstation_ip:8000" + PADDLE_GRPC_ENDPOINT="workstation_ip:8001" + YOLOX_GRPC_ENDPOINT="workstation_ip:8001" + YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT="workstation_ip:8001" + YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT="workstation_ip:8001" + ``` -Ensure all containers are running and healthy: -- nemoretriever-ranking-ms (healthy) -- compose-page-elements-1 -- compose-paddle-1 -- compose-graphic-elements-1 -- compose-table-structure-1 -- nemoretriever-embedding-ms (healthy) -- nim-llm-ms (healthy) +3. Due to react limitations, any changes made to below environment variables will require developers to rebuilt the rag containers. This will be fixed in a future release. + + ```output + # Model name for LLM + NEXT_PUBLIC_MODEL_NAME: ${APP_LLM_MODELNAME:-meta/llama-3.1-70b-instruct} + # Model name for embeddings + NEXT_PUBLIC_EMBEDDING_MODEL: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2} + # Model name for reranking + NEXT_PUBLIC_RERANKER_MODEL: ${APP_RANKING_MODELNAME:-nvidia/llama-3.2-nv-rerankqa-1b-v2} + # URL for rag server container + NEXT_PUBLIC_CHAT_BASE_URL: "http://rag-server:8081/v1" + # URL for ingestor container + NEXT_PUBLIC_VDB_BASE_URL: "http://ingestor-server:8082/v1" + ``` -#### Option 2: NVIDIA Cloud Models -```python -os.environ["APP_LLM_MODELNAME"] = "nvidia/llama-3_3-nemotron-super-49b-v1_5" -os.environ["APP_EMBEDDINGS_MODELNAME"] = "nvidia/llama-3.2-nv-embedqa-1b-v2" -os.environ["APP_RANKING_MODELNAME"] = "nvidia/llama-3.2-nv-rerankqa-1b-v2" -os.environ["APP_EMBEDDINGS_SERVERURL"] = "" -os.environ["APP_LLM_SERVERURL"] = "" -os.environ["APP_RANKING_SERVERURL"] = "https://ai.api.nvidia.com/v1/retrieval/nvidia/llama-3_2-nv-rerankqa-1b-v2/reranking/v1" -os.environ["EMBEDDING_NIM_ENDPOINT"] = "https://integrate.api.nvidia.com/v1" -os.environ["OCR_HTTP_ENDPOINT"] = "https://ai.api.nvidia.com/v1/cv/baidu/paddleocr" -os.environ["OCR_INFER_PROTOCOL"] = "http" -os.environ["YOLOX_HTTP_ENDPOINT"] = "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2" -os.environ["YOLOX_INFER_PROTOCOL"] = "http" -os.environ["YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT"] = "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1" -os.environ["YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL"] = "http" -os.environ["YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT"] = "https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1" -os.environ["YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL"] = "http" -``` +### Start using nvidia hosted models -### 6. Setup NVIDIA Ingest Runtime +1. Verify that you meet the [prerequisites](#prerequisites). + +2. Open `deploy/.env` and uncomment the section `Endpoints for using cloud NIMs`. + Then set the environment variables by executing below command. + ```bash + source deploy/.env + ``` -```bash -docker compose -f ../deploy/compose/docker-compose-ingestor-server.yaml up nv-ingest-ms-runtime redis -d -``` + + **📝 Note:** + When using NVIDIA hosted endpoints, you may encounter rate limiting with larger file ingestions (>10 files). + +3. Start the vector db containers from the repo root. + ```bash + docker compose -f deploy/vectordb.yaml up -d + ``` + [!NOTE] + If you don't have a GPU available, you can switch to CPU-only Milvus by following the instructions in [milvus-configuration.md](./milvus-configuration.md). 
+ + [!TIP] + For B200 and A100 GPUs, use Milvus CPU indexing due to known retrieval accuracy issues with Milvus GPU indexing and search. Export following environment variables to disable Milvus GPU ingexing and search. + ```bash + export APP_VECTORSTORE_ENABLEGPUSEARCH=False + export APP_VECTORSTORE_ENABLEGPUINDEX=False + ``` + +4. Start the ingestion containers from the repo root. This pulls the prebuilt containers from NGC and deploys it on your system. + + ```bash + docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d + ``` + + [!TIP] + You can add a `--build` argument in case you have made some code changes or have any requirement of re-building ingestion containers from source code: + + ```bash + docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d --build + ``` + +5. Start the rag containers from the repo root. This pulls the prebuilt containers from NGC and deploys it on your system. + + ```bash + docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d + ``` + + [!TIP] + You can add a `--build` argument in case you have made some code changes or have any requirement of re-building containers from source code: + + ```bash + docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d --build + ``` + + You can check the status of the rag-server and its dependencies by issuing this curl command + ```bash + curl -X 'GET' 'http://workstation_ip:8081/v1/health?check_dependencies=true' -H 'accept: application/json' + ``` + +6. Confirm all the below mentioned containers are running. + + ```bash + docker ps --format "table {{.ID}}\t{{.Names}}\t{{.Status}}" + ``` + + *Example Output* + + ```output + NAMES STATUS + compose-nv-ingest-ms-runtime-1 Up 5 minutes (healthy) + ingestor-server Up 5 minutes + compose-redis-1 Up 5 minutes + rag-playground Up 9 minutes + rag-server Up 9 minutes + milvus-standalone Up 36 minutes + milvus-minio Up 35 minutes (healthy) + milvus-etcd Up 35 minutes (healthy) + ``` + +7. Open a web browser and access `http://localhost:8090` to use the RAG Playground. You can use the upload tab to ingest files into the server or follow [the notebooks](../notebooks/) to understand the API usage. + +8. To stop all running services, after making some [customizations](#next-steps) + ```bash + docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down + docker compose -f deploy/compose/docker-compose-rag-server.yaml down + docker compose -f deploy/compose/vectordb.yaml down Open the RAG Playground at localhost:3080, create a new collection and save it. 
or you can use the API for that, see API Usage examples below: @@ -255,21 +401,10 @@ response = await ingestor.upload_documents( blocking=False, split_options={"chunk_size": 512, "chunk_overlap": 150}, filepaths=[ - "../data/multimodal/woods_frost.docx", - "../data/multimodal/multimodal_test.pdf", + "examples/RAG/libraary_rag/data/cuda.txt", ], generate_summary=False, - # Optional: Add custom metadata - # custom_metadata=[ - # { - # "filename": "multimodal_test.pdf", - # "metadata": {"meta_field_1": "multimodal document 1"} - # }, - # { - # "filename": "woods_frost.docx", - # "metadata": {"meta_field_1": "multimodal document 2"} - # } - # ] + ) print(response) ``` From 6f49effd7428a82ec536aeec010fb66c1b6453e4 Mon Sep 17 00:00:00 2001 From: NarimaneH Date: Fri, 26 Sep 2025 02:40:44 +0000 Subject: [PATCH 6/6] new library rag tool --- examples/RAG/library_rag/.env_library | 127 ----- examples/RAG/library_rag/README.md | 507 ------------------ examples/RAG/library_rag/configs/config.yml | 31 -- .../docker-compose-ingestor-server.yaml | 211 -------- .../deploy/docker-compose-rag-server.yaml | 186 ------- examples/RAG/library_rag/deploy/nims.yaml | 348 ------------ examples/RAG/library_rag/deploy/vectordb.yaml | 102 ---- examples/RAG/library_rag/pyproject.toml | 25 - .../src/library_rag/configs/config.yml | 31 -- .../src/library_rag/library_rag_function.py | 82 --- examples/RAG/library_rag/tests/__init__.py | 0 examples/RAG/library_rag/tests/conftest.py | 78 --- .../tests/test_configs/test_config.yml | 28 - .../RAG/library_rag/tests/test_integration.py | 112 ---- examples/rag_lib/README.md | 80 +++ .../library_rag => rag_lib}/data/cuda.txt | 0 .../pyproject.toml | 9 +- .../src/rag_lib}/__init__.py | 0 .../src/rag_lib}/configs/config.yml | 8 +- .../src/rag_lib/rag_lib_function.py} | 18 +- .../src/rag_lib}/register.py | 2 +- .../rag_library_mode/pyproject.toml | 28 - .../src/rag_library_mode/__init__.py | 0 .../docker-compose-ingestor-server.yaml | 211 -------- .../deploy/docker-compose-rag-server.yaml | 186 ------- .../src/rag_library_mode/deploy/vectordb.yaml | 102 ---- .../src/rag_library_mode/register.py | 4 - .../src/rag_library_mode/__init__.py | 0 .../src/rag_library_mode/configs/config.yml | 29 - .../docker-compose-ingestor-server.yaml | 211 -------- .../deploy/docker-compose-rag-server.yaml | 186 ------- .../src/rag_library_mode/deploy/vectordb.yaml | 102 ---- .../rag_library_mode_function.py | 78 --- .../src/rag_library_mode/register.py | 4 - 34 files changed, 93 insertions(+), 3033 deletions(-) delete mode 100644 examples/RAG/library_rag/.env_library delete mode 100644 examples/RAG/library_rag/README.md delete mode 100644 examples/RAG/library_rag/configs/config.yml delete mode 100644 examples/RAG/library_rag/deploy/docker-compose-ingestor-server.yaml delete mode 100644 examples/RAG/library_rag/deploy/docker-compose-rag-server.yaml delete mode 100644 examples/RAG/library_rag/deploy/nims.yaml delete mode 100644 examples/RAG/library_rag/deploy/vectordb.yaml delete mode 100644 examples/RAG/library_rag/pyproject.toml delete mode 100644 examples/RAG/library_rag/src/library_rag/configs/config.yml delete mode 100644 examples/RAG/library_rag/src/library_rag/library_rag_function.py delete mode 100644 examples/RAG/library_rag/tests/__init__.py delete mode 100644 examples/RAG/library_rag/tests/conftest.py delete mode 100644 examples/RAG/library_rag/tests/test_configs/test_config.yml delete mode 100644 examples/RAG/library_rag/tests/test_integration.py create mode 100644 examples/rag_lib/README.md 
rename examples/{RAG/library_rag => rag_lib}/data/cuda.txt (100%) rename examples/{rag_library_mode => rag_lib}/pyproject.toml (79%) rename examples/{RAG/library_rag/src/library_rag => rag_lib/src/rag_lib}/__init__.py (100%) rename examples/{rag_library_mode/rag_library_mode/src/rag_library_mode => rag_lib/src/rag_lib}/configs/config.yml (79%) rename examples/{rag_library_mode/rag_library_mode/src/rag_library_mode/rag_library_mode_function.py => rag_lib/src/rag_lib/rag_lib_function.py} (81%) rename examples/{RAG/library_rag/src/library_rag => rag_lib/src/rag_lib}/register.py (65%) delete mode 100644 examples/rag_library_mode/rag_library_mode/pyproject.toml delete mode 100644 examples/rag_library_mode/rag_library_mode/src/rag_library_mode/__init__.py delete mode 100644 examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/docker-compose-ingestor-server.yaml delete mode 100644 examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/docker-compose-rag-server.yaml delete mode 100644 examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/vectordb.yaml delete mode 100644 examples/rag_library_mode/rag_library_mode/src/rag_library_mode/register.py delete mode 100644 examples/rag_library_mode/src/rag_library_mode/__init__.py delete mode 100644 examples/rag_library_mode/src/rag_library_mode/configs/config.yml delete mode 100644 examples/rag_library_mode/src/rag_library_mode/deploy/docker-compose-ingestor-server.yaml delete mode 100644 examples/rag_library_mode/src/rag_library_mode/deploy/docker-compose-rag-server.yaml delete mode 100644 examples/rag_library_mode/src/rag_library_mode/deploy/vectordb.yaml delete mode 100644 examples/rag_library_mode/src/rag_library_mode/rag_library_mode_function.py delete mode 100644 examples/rag_library_mode/src/rag_library_mode/register.py diff --git a/examples/RAG/library_rag/.env_library b/examples/RAG/library_rag/.env_library deleted file mode 100644 index 48c27bff7..000000000 --- a/examples/RAG/library_rag/.env_library +++ /dev/null @@ -1,127 +0,0 @@ -export NVIDIA_API_KEY=${NGC_API_KEY} - -# Ingestor server specific configurations -# === Vector DB specific configurations === -export APP_VECTORSTORE_URL=http://localhost:19530 -export APP_VECTORSTORE_NAME=milvus -export APP_VECTORSTORE_INDEXTYPE=GPU_CAGRA -export APP_VECTORSTORE_SEARCHTYPE=dense -export APP_VECTORSTORE_CONSISTENCYLEVEL=Strong -export APP_VECTORSTORE_ENABLEGPUINDEX=True -export APP_VECTORSTORE_ENABLEGPUSEARCH=True -export COLLECTION_NAME=test_native - -# === MINIO specific configurations === -export MINIO_ENDPOINT=localhost:9010 -export MINIO_ACCESSKEY=minioadmin -export MINIO_SECRETKEY=minioadmin - -# === Embedding Model specific configurations === -export APP_EMBEDDINGS_SERVERURL=localhost:9080 -export APP_EMBEDDINGS_MODELNAME=nvidia/llama-3.2-nv-embedqa-1b-v2 -export APP_EMBEDDINGS_DIMENSIONS=2048 - -# === NV-Ingest Connection Configurations === -export APP_NVINGEST_MESSAGECLIENTHOSTNAME=localhost -export APP_NVINGEST_MESSAGECLIENTPORT=7670 - -# === NV-Ingest Extract Configurations === -export APP_NVINGEST_EXTRACTTEXT=True -export APP_NVINGEST_EXTRACTINFOGRAPHICS=False -export APP_NVINGEST_EXTRACTTABLES=True -export APP_NVINGEST_EXTRACTCHARTS=True -export APP_NVINGEST_EXTRACTIMAGES=False -export APP_NVINGEST_PDFEXTRACTMETHOD=None -export APP_NVINGEST_TEXTDEPTH=page - -# === NV-Ingest Splitting Configurations === -export APP_NVINGEST_CHUNKSIZE=512 -export APP_NVINGEST_CHUNKOVERLAP=150 -export APP_NVINGEST_ENABLEPDFSPLITTER=True - -# === NV-Ingest 
Caption Model configurations === -export APP_NVINGEST_CAPTIONMODELNAME=nvidia/llama-3.1-nemotron-nano-vl-8b-v1 -export APP_NVINGEST_CAPTIONENDPOINTURL=http://localhost:1977/v1/chat/completions - -# Choose whether to store the extracted content in the vector store for citation support -export ENABLE_CITATIONS=True - -# Log level for server -export LOGLEVEL=INFO - -# [Optional] Redis configuration for task status and result storage -export REDIS_HOST=localhost -export REDIS_PORT=6379 -export REDIS_DB=0 - -# Bulk upload to MinIO -export ENABLE_MINIO_BULK_UPLOAD=True - -# --- Additional variables from rag-server --- -export EXAMPLE_PATH=./nvidia_rag/rag_server - -# === Vector DB additional configs === -export APP_RETRIEVER_SCORETHRESHOLD=0.25 -export VECTOR_DB_TOPK=100 - -# === LLM Model specific configurations === -export APP_LLM_MODELNAME="nvidia/llama-3.3-nemotron-super-49b-v1" -export APP_LLM_SERVERURL=localhost:8999 - -# === Query Rewriter Model specific configurations === -export APP_QUERYREWRITER_MODELNAME="meta/llama-3.1-8b-instruct" -export APP_QUERYREWRITER_SERVERURL=localhost:8991 - -# === Reranking Model specific configurations === -export APP_RANKING_SERVERURL=localhost:1976 -export APP_RANKING_MODELNAME="nvidia/llama-3.2-nv-rerankqa-1b-v2" -export ENABLE_RERANKER=True - -# === VLM Model specific configurations === -export ENABLE_VLM_INFERENCE=False -export APP_VLM_SERVERURL=http://localhost:1977/v1 -export APP_VLM_MODELNAME="nvidia/llama-3.1-nemotron-nano-vl-8b-v1" - -# Number of document chunks to insert in LLM prompt (when reranker enabled) -export APP_RETRIEVER_TOPK=10 - -# === Conversation and Query Handling === -export ENABLE_MULTITURN=True -export ENABLE_QUERYREWRITER=False - -# === Guardrails === -export ENABLE_GUARDRAILS=False -export NEMO_GUARDRAILS_URL=localhost:7331 - -# === Conversation History === -export CONVERSATION_HISTORY=5 - -# === Tracing === -export APP_TRACING_ENABLED=False -export APP_TRACING_OTLPHTTPENDPOINT=http://localhost:4318/v1/traces -export APP_TRACING_OTLPGRPCENDPOINT=grpc://localhost:4317 - -# === Source Metadata and Filtering === -export ENABLE_SOURCE_METADATA=true -export FILTER_THINK_TOKENS=true -export ENABLE_NEMOTRON_THINKING=false - -# === Reflection (context relevance/groundedness checking) === -export ENABLE_REFLECTION=false -export MAX_REFLECTION_LOOP=3 -export CONTEXT_RELEVANCE_THRESHOLD=1 -export RESPONSE_GROUNDEDNESS_THRESHOLD=1 -export REFLECTION_LLM="mistralai/mixtral-8x22b-instruct-v0.1" -export REFLECTION_LLM_SERVERURL=localhost:8998 - -# === Document Summary Model specific configurations === -export SUMMARY_LLM="nvidia/llama-3.3-nemotron-super-49b-v1" -export SUMMARY_LLM_SERVERURL=localhost:8999 -export SUMMARY_LLM_MAX_CHUNK_LENGTH=50000 - -# === Temporary directory === -export TEMP_DIR=./tmp-data/ - -# === Prompt configuration === -# Change this to the absolute path of the prompt.yaml file you want to use -# export PROMPT_CONFIG_FILE=src/nvidia_rag/rag_server/prompt.yaml \ No newline at end of file diff --git a/examples/RAG/library_rag/README.md b/examples/RAG/library_rag/README.md deleted file mode 100644 index 00d80e162..000000000 --- a/examples/RAG/library_rag/README.md +++ /dev/null @@ -1,507 +0,0 @@ -# NVIDIA RAG Python Package Usage Guide - -This guide demonstrates how to use a NAT agent with the NVIDIA RAG Python client as a tool. -# Get Started With NVIDIA RAG Blueprint - -Use the following documentation to get started with the NVIDIA RAG Blueprint. 
- -- [Obtain an API Key](#obtain-an-api-key) -- [Interact using native python APIs](#interact-using-native-python-apis) -- [Deploy With Docker Compose](#deploy-with-docker-compose) -- [Deploy With Helm Chart](#deploy-with-helm-chart) -- [Data Ingestion](#data-ingestion) - - -## Obtain an API Key - -You need to generate an API key -to access NIM services, to access models hosted in the NVIDIA API Catalog, and to download models on-premises. -For more information, refer to [NGC API Keys](https://docs.nvidia.com/ngc/gpu-cloud/ngc-private-registry-user-guide/index.html#ngc-api-keys). - -To generate an API key, use the following procedure. - -1. Go to https://org.ngc.nvidia.com/setup/api-keys. -2. Click **+ Generate Personal Key**. -3. Enter a **Key Name**. -4. For **Services Included**, select **NGC Catalog** and **Public API Endpoints**. -5. Click **Generate Personal Key**. - -After you generate your key, export your key as an environment variable by using the following code. - -```bash -export NGC_API_KEY="" -``` - - - -## Deploy With Docker Compose - -Use these procedures to deploy with Docker Compose for a single node deployment. Alternatively, you can [Deploy With Helm Chart](#deploy-with-helm-chart) to deploy on a Kubernetes cluster. - -Developers need to deploy ingestion services and rag services using seperate dedicated docker compose files. -For both retrieval and ingestion services, by default all the models are deployed on-prem. Follow relevant section below as per your requirement and hardware availability. - -- Start the Microservices - - [Using on-prem models](#start-using-on-prem-models) - - [Using NVIDIA hosted models](#start-using-nvidia-hosted-models) - -### Prerequisites - -1. Install Docker Engine. For more information, see [Ubuntu](https://docs.docker.com/engine/install/ubuntu/). - -2. Install Docker Compose. For more information, see [install the Compose plugin](https://docs.docker.com/compose/install/linux/). - - a. Ensure the Docker Compose plugin version is 2.29.1 or later. - - b. After you get the Docker Compose plugin installed, run `docker compose version` to confirm. - -3. To pull images required by the blueprint from NGC, you must first authenticate Docker with nvcr.io. Use the NGC API Key you created in [Obtain an API Key](#obtain-an-api-key). - - ```bash - export NGC_API_KEY="nvapi-..." - echo "${NGC_API_KEY}" | docker login nvcr.io -u '$oauthtoken' --password-stdin - ``` - -4. Some containers with are enabled with GPU acceleration, such as Milvus and NVIDIA NIMS deployed on-prem. To configure Docker for GPU-accelerated containers, [install](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/latest/install-guide.html), the NVIDIA Container Toolkit - -5. Ensure you meet [the hardware requirements if you are deploying models on-prem](./support-matrix.md). - -6. Change directory to the library rag example: ```cd examples/RAG/library_rag``` - - -### Start using on-prem models - -Use the following procedure to start all containers needed for this blueprint. This launches the ingestion services followed by the rag services and all of its dependent NIMs on-prem. - -1. Fulfill the [prerequisites](#prerequisites). Ensure you meet [the hardware requirements](./support-matrix.md). - -2. Create a directory to cache the models and export the path to the cache as an environment variable. - - ```bash - mkdir -p ~/.cache/model-cache - export MODEL_DIRECTORY=~/.cache/model-cache - ``` - -3. Export all the required environment variables to use on-prem models. 
Ensure the section `Endpoints for using cloud NIMs` is commented in this file. - - ```bash - source deploy/.env - ``` - -4. Start all required NIMs. - - Before running the command please ensure the GPU allocation is done appropriately in the deploy/compose/.env. You might need to override them - for the hardware you are deploying this blueprint on. The default assumes you are deploying this on a 2XH100 environment. - - ```bash - USERID=$(id -u) docker compose -f deploy/nims.yaml up -d - ``` - - - Wait till the `nemoretriever-ranking-ms`, `nemoretriever-embedding-ms` and `nim-llm-ms` NIMs are in healthy state before proceeding further. - - - The nemo LLM service may take upto 30 mins to start for the first time as the model is downloaded and cached. The models are downloaded and cached in the path specified by `MODEL_DIRECTORY`. Subsequent deployments will take 2-5 mins to startup based on the GPU profile. - - - The default configuration allocates one GPU (GPU ID 1) to `nim-llm-ms` which defaults to minimum GPUs needed for H100 or B200 profile. If you are deploying the solution on A100, please allocate 2 available GPUs by exporting below env variable before launching: - ```bash - export LLM_MS_GPU_ID=1,2 - ``` - - - To start just the NIMs specific to rag or ingestion add the `--profile rag` or `--profile ingest` flag to the command. - - - Ensure all the below are running before proceeding further - - ```bash - watch -n 2 'docker ps --format "table {{.Names}}\t{{.Status}}"' - ``` - - ```output - NAMES STATUS - - nemoretriever-ranking-ms Up 14 minutes (healthy) - compose-page-elements-1 Up 14 minutes - compose-paddle-1 Up 14 minutes - compose-graphic-elements-1 Up 14 minutes - compose-table-structure-1 Up 14 minutes - nemoretriever-embedding-ms Up 14 minutes (healthy) - nim-llm-ms Up 14 minutes (healthy) - ``` - - -5. Start the vector db containers from the repo root. - ```bash - docker compose -f deploy/vectordb.yaml up -d - ``` - - [!TIP] - By default GPU accelerated Milvus DB is deployed. You can choose the GPU ID to be allocated using below env variable. - ```bash - VECTORSTORE_GPU_DEVICE_ID=0 - ``` - - For B200 and A100 GPUs, use Milvus CPU indexing due to known retrieval accuracy issues with Milvus GPU indexing and search. Export following environment variables to disable Milvus GPU ingexing and search. - ```bash - export APP_VECTORSTORE_ENABLEGPUSEARCH=False - export APP_VECTORSTORE_ENABLEGPUINDEX=False - ``` - -6. Start the ingestion containers from the repo root. This pulls the prebuilt containers from NGC and deploys it on your system. - - ```bash - docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d - ``` - -7. Start the rag containers from the repo root. This pulls the prebuilt containers from NGC and deploys it on your system. - - ```bash - docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d - ``` - - You can check the status of the rag-server and its dependencies by issuing this curl command - ```bash - curl -X 'GET' 'http://workstation_ip:8081/v1/health?check_dependencies=true' -H 'accept: application/json' - ``` - -8. Confirm all the below mentioned containers are running. 
- - ```bash - docker ps --format "table {{.ID}}\t{{.Names}}\t{{.Status}}" - ``` - - *Example Output* - - ```output - NAMES STATUS - compose-nv-ingest-ms-runtime-1 Up 5 minutes (healthy) - ingestor-server Up 5 minutes - compose-redis-1 Up 5 minutes - rag-playground Up 9 minutes - rag-server Up 9 minutes - milvus-standalone Up 36 minutes - milvus-minio Up 35 minutes (healthy) - milvus-etcd Up 35 minutes (healthy) - nemoretriever-ranking-ms Up 38 minutes (healthy) - compose-page-elements-1 Up 38 minutes - compose-paddle-1 Up 38 minutes - compose-graphic-elements-1 Up 38 minutes - compose-table-structure-1 Up 38 minutes - nemoretriever-embedding-ms Up 38 minutes (healthy) - nim-llm-ms Up 38 minutes (healthy) - ``` - -9. Open a web browser and access `http://localhost:8090` to use the RAG Playground. You can use the upload tab to ingest files into the server or follow [the notebooks](../notebooks/) to understand the API usage. - -10. To stop all running services, after making some [customizations](#next-steps) - ```bash - docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down - docker compose -f deploy/compose/nims.yaml down - docker compose -f deploy/compose/docker-compose-rag-server.yaml down - docker compose -f deploy/compose/vectordb.yaml down - ``` - -**📝 Notes:** - -1. A single NVIDIA A100-80GB or H100-80GB, B200 GPU can be used to start non-LLM NIMs (nemoretriever-embedding-ms, nemoretriever-ranking-ms, and ingestion services like page-elements, paddle, graphic-elements, and table-structure) for ingestion and RAG workflows. You can control which GPU is used for each service by setting these environment variables in `deploy/compose/.env` file before launching: - ```bash - EMBEDDING_MS_GPU_ID=0 - RANKING_MS_GPU_ID=0 - YOLOX_MS_GPU_ID=0 - YOLOX_GRAPHICS_MS_GPU_ID=0 - YOLOX_TABLE_MS_GPU_ID=0 - PADDLE_MS_GPU_ID=0 - ``` - -2. If the NIMs are deployed in a different workstation or outside the nvidia-rag docker network on the same system, replace the host address of the below URLs with workstation IPs. - - ```bash - APP_EMBEDDINGS_SERVERURL="workstation_ip:8000" - APP_LLM_SERVERURL="workstation_ip:8000" - APP_RANKING_SERVERURL="workstation_ip:8000" - PADDLE_GRPC_ENDPOINT="workstation_ip:8001" - YOLOX_GRPC_ENDPOINT="workstation_ip:8001" - YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT="workstation_ip:8001" - YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT="workstation_ip:8001" - ``` - -3. Due to react limitations, any changes made to below environment variables will require developers to rebuilt the rag containers. This will be fixed in a future release. - - ```output - # Model name for LLM - NEXT_PUBLIC_MODEL_NAME: ${APP_LLM_MODELNAME:-meta/llama-3.1-70b-instruct} - # Model name for embeddings - NEXT_PUBLIC_EMBEDDING_MODEL: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2} - # Model name for reranking - NEXT_PUBLIC_RERANKER_MODEL: ${APP_RANKING_MODELNAME:-nvidia/llama-3.2-nv-rerankqa-1b-v2} - # URL for rag server container - NEXT_PUBLIC_CHAT_BASE_URL: "http://rag-server:8081/v1" - # URL for ingestor container - NEXT_PUBLIC_VDB_BASE_URL: "http://ingestor-server:8082/v1" - ``` - - -### Start using nvidia hosted models - -1. Verify that you meet the [prerequisites](#prerequisites). - -2. Open `deploy/.env` and uncomment the section `Endpoints for using cloud NIMs`. - Then set the environment variables by executing below command. 
- ```bash - source deploy/.env - ``` - - - **📝 Note:** - When using NVIDIA hosted endpoints, you may encounter rate limiting with larger file ingestions (>10 files). - -3. Start the vector db containers from the repo root. - ```bash - docker compose -f deploy/vectordb.yaml up -d - ``` - [!NOTE] - If you don't have a GPU available, you can switch to CPU-only Milvus by following the instructions in [milvus-configuration.md](./milvus-configuration.md). - - [!TIP] - For B200 and A100 GPUs, use Milvus CPU indexing due to known retrieval accuracy issues with Milvus GPU indexing and search. Export following environment variables to disable Milvus GPU ingexing and search. - ```bash - export APP_VECTORSTORE_ENABLEGPUSEARCH=False - export APP_VECTORSTORE_ENABLEGPUINDEX=False - ``` - -4. Start the ingestion containers from the repo root. This pulls the prebuilt containers from NGC and deploys it on your system. - - ```bash - docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d - ``` - - [!TIP] - You can add a `--build` argument in case you have made some code changes or have any requirement of re-building ingestion containers from source code: - - ```bash - docker compose -f deploy/compose/docker-compose-ingestor-server.yaml up -d --build - ``` - -5. Start the rag containers from the repo root. This pulls the prebuilt containers from NGC and deploys it on your system. - - ```bash - docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d - ``` - - [!TIP] - You can add a `--build` argument in case you have made some code changes or have any requirement of re-building containers from source code: - - ```bash - docker compose -f deploy/compose/docker-compose-rag-server.yaml up -d --build - ``` - - You can check the status of the rag-server and its dependencies by issuing this curl command - ```bash - curl -X 'GET' 'http://workstation_ip:8081/v1/health?check_dependencies=true' -H 'accept: application/json' - ``` - -6. Confirm all the below mentioned containers are running. - - ```bash - docker ps --format "table {{.ID}}\t{{.Names}}\t{{.Status}}" - ``` - - *Example Output* - - ```output - NAMES STATUS - compose-nv-ingest-ms-runtime-1 Up 5 minutes (healthy) - ingestor-server Up 5 minutes - compose-redis-1 Up 5 minutes - rag-playground Up 9 minutes - rag-server Up 9 minutes - milvus-standalone Up 36 minutes - milvus-minio Up 35 minutes (healthy) - milvus-etcd Up 35 minutes (healthy) - ``` - -7. Open a web browser and access `http://localhost:8090` to use the RAG Playground. You can use the upload tab to ingest files into the server or follow [the notebooks](../notebooks/) to understand the API usage. - -8. To stop all running services, after making some [customizations](#next-steps) - ```bash - docker compose -f deploy/compose/docker-compose-ingestor-server.yaml down - docker compose -f deploy/compose/docker-compose-rag-server.yaml down - docker compose -f deploy/compose/vectordb.yaml down - -Open the RAG Playground at localhost:3080, create a new collection and save it. 
or you can use the API for that, see API Usage examples below: - -## API Usage Examples - -### Setup Logging - -```python -import logging - -LOGLEVEL = logging.WARNING # Set to INFO, DEBUG, WARNING or ERROR -logging.basicConfig(level=LOGLEVEL) - -for name in logging.root.manager.loggerDict: - if name == "nvidia_rag" or name.startswith("nvidia_rag."): - logging.getLogger(name).setLevel(LOGLEVEL) - if name == "nv_ingest_client" or name.startswith("nv_ingest_client."): - logging.getLogger(name).setLevel(LOGLEVEL) -``` - -### Import Packages - -```python -from nvidia_rag import NvidiaRAG, NvidiaRAGIngestor - -rag = NvidiaRAG() -ingestor = NvidiaRAGIngestor() -``` - -## Collection Management - -### Create a New Collection - -```python -response = ingestor.create_collection( - collection_name="test_library", - vdb_endpoint="http://localhost:19530", - # Optional: Create collection with metadata schema - # metadata_schema = [ - # { - # "name": "meta_field_1", - # "type": "string", - # "description": "Description field for the document" - # } - # ] -) -print(response) -``` - -### List All Collections - -```python -response = ingestor.get_collections(vdb_endpoint="http://localhost:19530") -print(response) -``` - -### Delete Collections - -```python -response = ingestor.delete_collections( - vdb_endpoint="http://localhost:19530", - collection_names=["test_library"] -) -print(response) -``` - -## Document Operations - -### Upload Documents - -```python -response = await ingestor.upload_documents( - collection_name="test_library", - vdb_endpoint="http://localhost:19530", - blocking=False, - split_options={"chunk_size": 512, "chunk_overlap": 150}, - filepaths=[ - "examples/RAG/libraary_rag/data/cuda.txt", - ], - generate_summary=False, - -) -print(response) -``` - -### Check Upload Status - -```python -response = await ingestor.status(task_id="YOUR_TASK_ID_HERE") -print(response) -``` - -### Update Documents - -```python -response = await ingestor.update_documents( - collection_name="test_library", - vdb_endpoint="http://localhost:19530", - blocking=False, - filepaths=["../data/multimodal/woods_frost.docx"], - generate_summary=False, -) -print(response) -``` - -### List Documents in Collection - -```python -response = ingestor.get_documents( - collection_name="test_library", - vdb_endpoint="http://localhost:19530", -) -print(response) -``` - -### Delete Documents - -```python -response = ingestor.delete_documents( - collection_name="test_library", - document_names=["../data/multimodal/multimodal_test.pdf"], - vdb_endpoint="http://localhost:19530", -) -print(response) -``` - - -#### Configure Your Agent - -Configure your Agent to use the Milvus collections for RAG. We have pre-configured a configuration file for you in `examples/RAG/simple_rag/configs/milvus_rag_config.yml`. You can modify this file to point to your Milvus instance and collections or add tools to your agent. The agent, by default, is a `tool_calling` agent that can be used to interact with the retriever component. The configuration file is shown below. 
You can also modify your agent to be another one of the NeMo Agent toolkit pre-built agent implementations such as the `react_agent` - - ```yaml - general: - use_uvloop: true - - -functions: - library_rag_tool: - _type: library_rag - base_url: "http://localhost:8081" - reranker_top_k: 2 - vdb_top_k: 10 - vdb_endpoint: "http://milvus:19530" - collection_names: ["cuda"] - enable_query_rewriting: True - enable_reranker: True - - #description: Retrieve documents given the input query - -llms: - nim_llm: - _type: nim - model_name: meta/llama-3.3-70b-instruct - temperature: 0 - max_tokens: 4096 - top_p: 1 - -workflow: - _type: tool_calling_agent - tool_names: - - library_rag_tool - llm_name: nim_llm - verbose: true - ``` - - If you have a different Milvus instance or collection names, you can modify the `vdb_url` and the `collection_names` in the config file to point to your instance and collections. - You can also modify the retrieval parameters like `vdb_top_k`, ... - You can also add additional functions as tools for your agent in the `functions` section. - -#### Run the Workflow - -```bash -nat run --config_file examples/RAG/library_rag/configs/config.yml --input "How do I install CUDA" -``` - -The expected workflow result of running the above command is: -```console -['To install CUDA, you typically need to: \n1. Verify you have a CUDA-capable GPU and a supported version of your operating system.\n2. Download the NVIDIA CUDA Toolkit from the official NVIDIA website.\n3. Choose an installation method, such as a local repository installation or a network repository installation, depending on your system.\n4. Follow the specific instructions for your operating system, which may include installing local repository packages, enabling network repositories, or running installer scripts.\n5. Reboot your system and perform post-installation actions, such as setting up your environment and verifying the installation by running sample projects. 
\n\nPlease refer to the official NVIDIA CUDA documentation for detailed instructions tailored to your specific operating system and distribution.'] - - - diff --git a/examples/RAG/library_rag/configs/config.yml b/examples/RAG/library_rag/configs/config.yml deleted file mode 100644 index d4bd4351b..000000000 --- a/examples/RAG/library_rag/configs/config.yml +++ /dev/null @@ -1,31 +0,0 @@ -general: - use_uvloop: true - - -functions: - library_rag_tool: - _type: library_rag - base_url: "http://localhost:8081" - reranker_top_k: 2 - vdb_top_k: 10 - vdb_endpoint: "http://milvus:19530" - collection_names: ["cuda"] - enable_query_rewriting: True - enable_reranker: True - - #description: Retrieve documents given the input query - -llms: - nim_llm: - _type: nim - model_name: meta/llama-3.3-70b-instruct - temperature: 0 - max_tokens: 4096 - top_p: 1 - -workflow: - _type: tool_calling_agent - tool_names: - - library_rag_tool - llm_name: nim_llm - verbose: true \ No newline at end of file diff --git a/examples/RAG/library_rag/deploy/docker-compose-ingestor-server.yaml b/examples/RAG/library_rag/deploy/docker-compose-ingestor-server.yaml deleted file mode 100644 index f13ab4a83..000000000 --- a/examples/RAG/library_rag/deploy/docker-compose-ingestor-server.yaml +++ /dev/null @@ -1,211 +0,0 @@ -services: - - # Main ingestor server which is responsible for ingestion - ingestor-server: - container_name: ingestor-server - image: nvcr.io/nvstaging/blueprint/ingestor-server:${TAG:-2.3.0.rc0} - build: - # Set context to repo's root directory - context: ../../ - dockerfile: ./src/nvidia_rag/ingestor_server/Dockerfile - # start the server on port 8082 with 4 workers for improved latency on concurrent requests. - command: --port 8082 --host 0.0.0.0 --workers 1 - - volumes: - # Mount the prompt.yaml file to the container, path should be absolute - - ${PROMPT_CONFIG_FILE}:${PROMPT_CONFIG_FILE} - - # Common customizations to the pipeline can be controlled using env variables - environment: - # Path to example directory relative to root - EXAMPLE_PATH: 'src/nvidia_rag/ingestor_server' - - # Absolute path to custom prompt.yaml file - PROMPT_CONFIG_FILE: ${PROMPT_CONFIG_FILE:-/prompt.yaml} - - ##===Vector DB specific configurations=== - # URL on which vectorstore is hosted - # For custom operators, point to your service (e.g., http://your-custom-vdb:1234) - APP_VECTORSTORE_URL: ${APP_VECTORSTORE_URL:-http://milvus:19530} - # Type of vectordb used to store embedding. Supported built-ins: "milvus", "elasticsearch". - # You can also provide your custom value (e.g., "your_custom_vdb") when you register it in `_get_vdb_op`. 
- APP_VECTORSTORE_NAME: ${APP_VECTORSTORE_NAME:-"milvus"} - # Type of vectordb search to be used - APP_VECTORSTORE_SEARCHTYPE: ${APP_VECTORSTORE_SEARCHTYPE:-"dense"} # Can be dense or hybrid - # Boolean to enable GPU index for milvus vectorstore specific to nvingest - APP_VECTORSTORE_ENABLEGPUINDEX: ${APP_VECTORSTORE_ENABLEGPUINDEX:-True} - # Boolean to control GPU search for milvus vectorstore specific to nvingest - APP_VECTORSTORE_ENABLEGPUSEARCH: ${APP_VECTORSTORE_ENABLEGPUSEARCH:-True} - # vectorstore collection name to store embeddings - COLLECTION_NAME: ${COLLECTION_NAME:-multimodal_data} - - ##===MINIO specific configurations=== - MINIO_ENDPOINT: "minio:9010" - MINIO_ACCESSKEY: "minioadmin" - MINIO_SECRETKEY: "minioadmin" - - NGC_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"} - NVIDIA_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"} - - ##===Embedding Model specific configurations=== - # url on which embedding model is hosted. If "", Nvidia hosted API is used - APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000"} - APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2} - # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1) - # APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-vlm-embedding-ms:8000"} - # APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1} - APP_EMBEDDINGS_DIMENSIONS: ${APP_EMBEDDINGS_DIMENSIONS:-2048} - - ##===NV-Ingest Connection Configurations======= - APP_NVINGEST_MESSAGECLIENTHOSTNAME: ${APP_NVINGEST_MESSAGECLIENTHOSTNAME:-"nv-ingest-ms-runtime"} - APP_NVINGEST_MESSAGECLIENTPORT: ${APP_NVINGEST_MESSAGECLIENTPORT:-7670} - - ##===NV-Ingest Extract Configurations========== - APP_NVINGEST_EXTRACTTEXT: ${APP_NVINGEST_EXTRACTTEXT:-True} - APP_NVINGEST_EXTRACTINFOGRAPHICS: ${APP_NVINGEST_EXTRACTINFOGRAPHICS:-False} - APP_NVINGEST_EXTRACTTABLES: ${APP_NVINGEST_EXTRACTTABLES:-True} - APP_NVINGEST_EXTRACTCHARTS: ${APP_NVINGEST_EXTRACTCHARTS:-True} - APP_NVINGEST_EXTRACTIMAGES: ${APP_NVINGEST_EXTRACTIMAGES:-False} - APP_NVINGEST_EXTRACTPAGEASIMAGE: ${APP_NVINGEST_EXTRACTPAGEASIMAGE:-False} - APP_NVINGEST_STRUCTURED_ELEMENTS_MODALITY: ${APP_NVINGEST_STRUCTURED_ELEMENTS_MODALITY:-""} # Select from "image", "text_image" - APP_NVINGEST_IMAGE_ELEMENTS_MODALITY: ${APP_NVINGEST_IMAGE_ELEMENTS_MODALITY:-""} # Select from "image" - APP_NVINGEST_PDFEXTRACTMETHOD: ${APP_NVINGEST_PDFEXTRACTMETHOD:-None} # Select from pdfium, nemoretriever_parse, None - # Extract text by "page" only recommended for documents with pages like .pdf, .docx, etc. 
- APP_NVINGEST_TEXTDEPTH: ${APP_NVINGEST_TEXTDEPTH:-page} # extract by "page" or "document" - - ##===NV-Ingest Splitting Configurations======== - APP_NVINGEST_CHUNKSIZE: ${APP_NVINGEST_CHUNKSIZE:-512} - APP_NVINGEST_CHUNKOVERLAP: ${APP_NVINGEST_CHUNKOVERLAP:-150} - APP_NVINGEST_ENABLEPDFSPLITTER: ${APP_NVINGEST_ENABLEPDFSPLITTER:-True} - APP_NVINGEST_SEGMENTAUDIO: ${APP_NVINGEST_SEGMENTAUDIO:-False} # Enable audio segmentation for NV Ingest - - ##===NV-Ingest Caption Model configurations==== - APP_NVINGEST_CAPTIONMODELNAME: ${APP_NVINGEST_CAPTIONMODELNAME:-"nvidia/llama-3.1-nemotron-nano-vl-8b-v1"} - # Incase of nvidia-hosted caption model, use the endpoint url as - https://integrate.api.nvidia.com/v1 - APP_NVINGEST_CAPTIONENDPOINTURL: ${APP_NVINGEST_CAPTIONENDPOINTURL:-"http://vlm-ms:8000/v1/chat/completions"} - - # Choose whether to store the extracted content in the vector store for citation support - ENABLE_CITATIONS: ${ENABLE_CITATIONS:-True} - - # Choose the summary model to use for document summary - SUMMARY_LLM: ${SUMMARY_LLM:-nvidia/llama-3_3-nemotron-super-49b-v1_5} - SUMMARY_LLM_SERVERURL: ${SUMMARY_LLM_SERVERURL-"nim-llm:8000"} - SUMMARY_LLM_MAX_CHUNK_LENGTH: ${SUMMARY_LLM_MAX_CHUNK_LENGTH:-50000} - SUMMARY_CHUNK_OVERLAP: ${SUMMARY_CHUNK_OVERLAP:-200} - # Log level for server, supported level NOTSET, DEBUG, INFO, WARN, ERROR, CRITICAL - LOGLEVEL: ${LOGLEVEL:-INFO} - - # [Optional] Redis configuration for task status and result storage - REDIS_HOST: ${REDIS_HOST:-redis} - REDIS_PORT: ${REDIS_PORT:-6379} - REDIS_DB: ${REDIS_DB:-0} - - # Bulk upload to MinIO - ENABLE_MINIO_BULK_UPLOAD: ${ENABLE_MINIO_BULK_UPLOAD:-True} - TEMP_DIR: ${TEMP_DIR:-/tmp-data} - - # NV-Ingest Batch Mode Configurations - NV_INGEST_FILES_PER_BATCH: ${NV_INGEST_FILES_PER_BATCH:-16} - NV_INGEST_CONCURRENT_BATCHES: ${NV_INGEST_CONCURRENT_BATCHES:-4} - - ports: - - "8082:8082" - expose: - - "8082" - shm_size: 5gb - - redis: - image: "redis/redis-stack:7.2.0-v18" - ports: - - "6379:6379" - - nv-ingest-ms-runtime: - image: nvcr.io/nvstaging/nim/nv-ingest:25.8.0-RC6 - cpuset: "0-15" - volumes: - - ${DATASET_ROOT:-./data}:/workspace/data - ports: - # HTTP API - - "7670:7670" - # Simple Broker - - "7671:7671" - cap_add: - - sys_nice - environment: - # Audio model not used in this RAG version - - AUDIO_GRPC_ENDPOINT=audio:50051 - - AUDIO_INFER_PROTOCOL=grpc - - CUDA_VISIBLE_DEVICES=0 - - MAX_INGEST_PROCESS_WORKERS=${MAX_INGEST_PROCESS_WORKERS:-16} - - EMBEDDING_NIM_MODEL_NAME=${EMBEDDING_NIM_MODEL_NAME:-${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2}} - # Incase of self-hosted embedding model, use the endpoint url as - https://integrate.api.nvidia.com/v1 - - EMBEDDING_NIM_ENDPOINT=${EMBEDDING_NIM_ENDPOINT:-${APP_EMBEDDINGS_SERVERURL-http://nemoretriever-embedding-ms:8000/v1}} - # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1) - # - EMBEDDING_NIM_MODEL_NAME=${EMBEDDING_NIM_MODEL_NAME:-${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1}} - # - EMBEDDING_NIM_ENDPOINT=${EMBEDDING_NIM_ENDPOINT:-${APP_EMBEDDINGS_SERVERURL-http://nemoretriever-vlm-embedding-ms:8000/v1}} - - INGEST_LOG_LEVEL=WARNING - - INGEST_RAY_LOG_LEVEL=PRODUCTION - - INGEST_EDGE_BUFFER_SIZE=64 - - INGEST_DYNAMIC_MEMORY_THRESHOLD=0.8 - - INGEST_DISABLE_DYNAMIC_SCALING=${INGEST_DISABLE_DYNAMIC_SCALING:-True} - - INSTALL_AUDIO_EXTRACTION_DEPS=true - # Message client for development - #- MESSAGE_CLIENT_HOST=0.0.0.0 - #- MESSAGE_CLIENT_PORT=7671 - #- MESSAGE_CLIENT_TYPE=simple # Configure the ingest service to 
use the simple broker - # Message client for production - - MESSAGE_CLIENT_HOST=redis - - MESSAGE_CLIENT_PORT=6379 - - MESSAGE_CLIENT_TYPE=redis - - MINIO_BUCKET=${MINIO_BUCKET:-nv-ingest} - - MRC_IGNORE_NUMA_CHECK=1 - - NEMORETRIEVER_PARSE_HTTP_ENDPOINT=${NEMORETRIEVER_PARSE_HTTP_ENDPOINT:-http://nemoretriever-parse:8000/v1/chat/completions} - - NEMORETRIEVER_PARSE_INFER_PROTOCOL=${NEMORETRIEVER_PARSE_INFER_PROTOCOL:-http} - - NEMORETRIEVER_PARSE_MODEL_NAME=${NEMORETRIEVER_PARSE_MODEL_NAME:-nvidia/nemoretriever-parse} - - NVIDIA_API_KEY=${NVIDIA_API_KEY:-nvidiaapikey} - - NGC_API_KEY=${NGC_API_KEY:-nvidiaapikey} - - NVIDIA_BUILD_API_KEY=${NGC_API_KEY:-nvidiaapikey} - - NV_INGEST_MAX_UTIL=${NV_INGEST_MAX_UTIL:-48} - - OTEL_EXPORTER_OTLP_ENDPOINT=otel-collector:4317 - # Self-hosted ocr endpoints. - - OCR_GRPC_ENDPOINT=${OCR_GRPC_ENDPOINT:-${PADDLE_GRPC_ENDPOINT:-paddle:8001}} - - OCR_HTTP_ENDPOINT=${OCR_HTTP_ENDPOINT:-${PADDLE_HTTP_ENDPOINT:-http://paddle:8000/v1/infer}} - - OCR_INFER_PROTOCOL=${OCR_INFER_PROTOCOL:-${PADDLE_INFER_PROTOCOL:-grpc}} - - OCR_MODEL_NAME=${OCR_MODEL_NAME:-paddle} - # build.nvidia.com hosted ocr endpoints. - #- OCR_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/baidu/paddleocr - #- OCR_INFER_PROTOCOL=http - - READY_CHECK_ALL_COMPONENTS=False - - REDIS_MORPHEUS_TASK_QUEUE=morpheus_task_queue - # Self-hosted redis endpoints. - # build.nvidia.com hosted yolox endpoints. - #- YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2 - #- YOLOX_INFER_PROTOCOL=http - - YOLOX_GRPC_ENDPOINT=${YOLOX_GRPC_ENDPOINT:-page-elements:8001} - - YOLOX_HTTP_ENDPOINT=${YOLOX_HTTP_ENDPOINT:-http://page-elements:8000/v1/infer} - - YOLOX_INFER_PROTOCOL=${YOLOX_INFER_PROTOCOL:-grpc} - # build.nvidia.com hosted yolox-graphics-elements endpoints. - #- YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1 - #- YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=http - - YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT=${YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT:-graphic-elements:8001} - - YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=${YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT:-http://graphic-elements:8000/v1/infer} - - YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=${YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL:-grpc} - # build.nvidia.com hosted yolox-table-elements endpoints. 
- #- YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1 - #- YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=http - - YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT=${YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT:-table-structure:8001} - - YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=${YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT:-http://table-structure:8000/v1/infer} - - YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=${YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL:-grpc} - # Incase of nvidia-hosted caption model, use the endpoint url as - https://integrate.api.nvidia.com/v1/chat/completions - - VLM_CAPTION_ENDPOINT=${VLM_CAPTION_ENDPOINT:-http://vlm-ms:8000/v1/chat/completions} - - VLM_CAPTION_MODEL_NAME=${VLM_CAPTION_MODEL_NAME:-nvidia/llama-3.1-nemotron-nano-vl-8b-v1} - - MODEL_PREDOWNLOAD_PATH=${MODEL_PREDOWNLOAD_PATH:-/workspace/models/} - healthcheck: - test: curl --fail http://nv-ingest-ms-runtime:7670/v1/health/ready || exit 1 - interval: 10s - timeout: 5s - retries: 20 - -networks: - default: - name: nvidia-rag diff --git a/examples/RAG/library_rag/deploy/docker-compose-rag-server.yaml b/examples/RAG/library_rag/deploy/docker-compose-rag-server.yaml deleted file mode 100644 index 334623c04..000000000 --- a/examples/RAG/library_rag/deploy/docker-compose-rag-server.yaml +++ /dev/null @@ -1,186 +0,0 @@ -services: - - # Main orchestrator server which stiches together all calls to different services to fulfill the user request - rag-server: - container_name: rag-server - image: nvcr.io/nvstaging/blueprint/rag-server:${TAG:-2.3.0.rc0} - build: - # Set context to repo's root directory - context: ../../ - dockerfile: src/nvidia_rag/rag_server/Dockerfile - # start the server on port 8081 with 8 workers for improved latency on concurrent requests. - command: --port 8081 --host 0.0.0.0 --workers 8 - volumes: - # Mount the prompt.yaml file to the container, path should be absolute - - ${PROMPT_CONFIG_FILE}:${PROMPT_CONFIG_FILE} - # Common customizations to the pipeline can be controlled using env variables - environment: - # Path to example directory relative to root - EXAMPLE_PATH: './nvidia_rag/rag_server' - - # Absolute path to custom prompt.yaml file - PROMPT_CONFIG_FILE: ${PROMPT_CONFIG_FILE:-/prompt.yaml} - - ##===MINIO specific configurations which is used to store the multimodal base64 content=== - MINIO_ENDPOINT: "minio:9010" - MINIO_ACCESSKEY: "minioadmin" - MINIO_SECRETKEY: "minioadmin" - - ##===Vector DB specific configurations=== - # URL on which vectorstore is hosted - # For custom operators, point to your service (e.g., http://your-custom-vdb:1234) - APP_VECTORSTORE_URL: ${APP_VECTORSTORE_URL:-http://milvus:19530} - # Type of vectordb used to store embedding. Supported built-ins: "milvus", "elasticsearch". - # You can also provide your custom value (e.g., "your_custom_vdb") when you register it in `_get_vdb_op`. 
- APP_VECTORSTORE_NAME: ${APP_VECTORSTORE_NAME:-"milvus"} - # Type of index to be used for vectorstore - APP_VECTORSTORE_INDEXTYPE: ${APP_VECTORSTORE_INDEXTYPE:-"GPU_CAGRA"} - # Type of vectordb search to be used - APP_VECTORSTORE_SEARCHTYPE: ${APP_VECTORSTORE_SEARCHTYPE:-"dense"} # Can be dense or hybrid - # vectorstore collection name to store embeddings - COLLECTION_NAME: ${COLLECTION_NAME:-multimodal_data} - APP_RETRIEVER_SCORETHRESHOLD: 0.25 - # Top K from vector DB, which goes as input to reranker model if enabled, else goes to LLM prompt - VECTOR_DB_TOPK: ${VECTOR_DB_TOPK:-100} - - ##===LLM Model specific configurations=== - APP_LLM_MODELNAME: ${APP_LLM_MODELNAME:-"nvidia/llama-3_3-nemotron-super-49b-v1_5"} - # url on which llm model is hosted. If "", Nvidia hosted API is used - APP_LLM_SERVERURL: ${APP_LLM_SERVERURL-"nim-llm:8000"} - - ##===Query Rewriter Model specific configurations=== - APP_QUERYREWRITER_MODELNAME: ${APP_QUERYREWRITER_MODELNAME:-"meta/llama-3.1-8b-instruct"} - # url on which query rewriter model is hosted. If "", Nvidia hosted API is used - APP_QUERYREWRITER_SERVERURL: ${APP_QUERYREWRITER_SERVERURL-"nim-llm-llama-8b:8000"} - - ##===Filter Expression Generator Model specific configurations=== - APP_FILTEREXPRESSIONGENERATOR_MODELNAME: ${APP_FILTEREXPRESSIONGENERATOR_MODELNAME:-"nvidia/llama-3_3-nemotron-super-49b-v1_5"} - # url on which filter expression generator model is hosted. If "", Nvidia hosted API is used - APP_FILTEREXPRESSIONGENERATOR_SERVERURL: ${APP_FILTEREXPRESSIONGENERATOR_SERVERURL-"nim-llm:8000"} - # enable filter expression generator for natural language to filter expression conversion - ENABLE_FILTER_GENERATOR: ${ENABLE_FILTER_GENERATOR:-False} - - ##===Embedding Model specific configurations=== - # url on which embedding model is hosted. If "", Nvidia hosted API is used - APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000"} - APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2} - # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1) - # APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-vlm-embedding-ms:8000"} - # APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1} - - ##===Reranking Model specific configurations=== - # url on which ranking model is hosted. 
If "", Nvidia hosted API is used - APP_RANKING_SERVERURL: ${APP_RANKING_SERVERURL-"nemoretriever-ranking-ms:8000"} - APP_RANKING_MODELNAME: ${APP_RANKING_MODELNAME:-"nvidia/llama-3.2-nv-rerankqa-1b-v2"} - ENABLE_RERANKER: ${ENABLE_RERANKER:-True} - # Default confidence threshold for filtering documents by reranker relevance scores (0.0 to 1.0) - RERANKER_CONFIDENCE_THRESHOLD: ${RERANKER_CONFIDENCE_THRESHOLD:-0.0} - - ##===VLM Model specific configurations=== - ENABLE_VLM_INFERENCE: ${ENABLE_VLM_INFERENCE:-false} - # Reasoning gate on VLM response: off by default; enable to mitigate incorrect VLM outputs - ENABLE_VLM_RESPONSE_REASONING: ${ENABLE_VLM_RESPONSE_REASONING:-false} - # Max images sent to VLM per request (query + context) - APP_VLM_MAX_TOTAL_IMAGES: ${APP_VLM_MAX_TOTAL_IMAGES:-4} - # Max number of query images to include in VLM input - APP_VLM_MAX_QUERY_IMAGES: ${APP_VLM_MAX_QUERY_IMAGES:-1} - # Max number of context images to include in VLM input - APP_VLM_MAX_CONTEXT_IMAGES: ${APP_VLM_MAX_CONTEXT_IMAGES:-1} - APP_VLM_SERVERURL: ${APP_VLM_SERVERURL-"http://vlm-ms:8000/v1"} - APP_VLM_MODELNAME: ${APP_VLM_MODELNAME:-"nvidia/llama-3.1-nemotron-nano-vl-8b-v1"} - - NVIDIA_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"} - - # Number of document chunks to insert in LLM prompt, used only when ENABLE_RERANKER is set to True - APP_RETRIEVER_TOPK: ${APP_RETRIEVER_TOPK:-10} - - # Log level for server, supported level NOTSET, DEBUG, INFO, WARN, ERROR, CRITICAL - LOGLEVEL: ${LOGLEVEL:-INFO} - - # enable multi-turn conversation in the rag chain - this controls conversation history usage - # while doing query rewriting and in LLM prompt - ENABLE_MULTITURN: ${ENABLE_MULTITURN:-True} - - # enable query rewriting for multiturn conversation in the rag chain. 
- # This will improve accuracy of the retrieiver pipeline but increase latency due to an additional LLM call - ENABLE_QUERYREWRITER: ${ENABLE_QUERYREWRITER:-False} - - # Choose whether to enable citations in the response - ENABLE_CITATIONS: ${ENABLE_CITATIONS:-True} - - # Choose whether to enable/disable guardrails - ENABLE_GUARDRAILS: ${ENABLE_GUARDRAILS:-False} - - # NeMo Guardrails URL when ENABLE_GUARDRAILS is true - NEMO_GUARDRAILS_URL: ${NEMO_GUARDRAILS_URL:-nemo-guardrails-microservice:7331} - - # number of last n chat messages to consider from the provided conversation history - CONVERSATION_HISTORY: 5 - - # Tracing - APP_TRACING_ENABLED: "False" - # HTTP endpoint - APP_TRACING_OTLPHTTPENDPOINT: http://otel-collector:4318/v1/traces - # GRPC endpoint - APP_TRACING_OTLPGRPCENDPOINT: grpc://otel-collector:4317 - - # Choose whether to enable source metadata in document content during generation - ENABLE_SOURCE_METADATA: ${ENABLE_SOURCE_METADATA:-true} - - # Whether to filter content within tags in model responses - FILTER_THINK_TOKENS: ${FILTER_THINK_TOKENS:-true} - - # Whether to enable thinking in the rag chain for llama-3.3-nemotron-super-49b model - ENABLE_NEMOTRON_THINKING: ${ENABLE_NEMOTRON_THINKING:-false} - - # enable reflection (context relevance and response groundedness checking) in the rag chain - ENABLE_REFLECTION: ${ENABLE_REFLECTION:-false} - # Maximum number of context relevance loop iterations - MAX_REFLECTION_LOOP: ${MAX_REFLECTION_LOOP:-3} - # Minimum relevance score threshold (0-2) - CONTEXT_RELEVANCE_THRESHOLD: ${CONTEXT_RELEVANCE_THRESHOLD:-1} - # Minimum groundedness score threshold (0-2) - RESPONSE_GROUNDEDNESS_THRESHOLD: ${RESPONSE_GROUNDEDNESS_THRESHOLD:-1} - # reflection llm - REFLECTION_LLM: ${REFLECTION_LLM:-"nvidia/llama-3_3-nemotron-super-49b-v1_5"} - # reflection llm server url. 
If "", Nvidia hosted API is used - REFLECTION_LLM_SERVERURL: ${REFLECTION_LLM_SERVERURL-"nim-llm:8000"} - # enable iterative query decomposition - ENABLE_QUERY_DECOMPOSITION: ${ENABLE_QUERY_DECOMPOSITION:-false} - # maximum recursion depth for iterative query decomposition - MAX_RECURSION_DEPTH: ${MAX_RECURSION_DEPTH:-3} - - ports: - - "8081:8081" - expose: - - "8081" - shm_size: 5gb - - # Sample UI container which interacts with APIs exposed by rag-server container - rag-playground: - container_name: rag-playground - image: nvcr.io/nvstaging/blueprint/rag-playground:${TAG:-2.3.0.rc0} - build: - # Set context to repo's root directory - context: ../../frontend - dockerfile: ./Dockerfile - args: - # Environment variables for Vite build - VITE_API_CHAT_URL: ${VITE_API_CHAT_URL:-http://rag-server:8081/v1} - VITE_API_VDB_URL: ${VITE_API_VDB_URL:-http://ingestor-server:8082/v1} - VITE_MILVUS_URL: http://milvus:19530 - ports: - - "8090:3000" - expose: - - "3000" - environment: - # Runtime environment variables for Vite - VITE_API_CHAT_URL: ${VITE_API_CHAT_URL:-http://rag-server:8081/v1} - VITE_API_VDB_URL: ${VITE_API_VDB_URL:-http://ingestor-server:8082/v1} - VITE_MILVUS_URL: http://milvus:19530 - depends_on: - - rag-server - -networks: - default: - name: nvidia-rag diff --git a/examples/RAG/library_rag/deploy/nims.yaml b/examples/RAG/library_rag/deploy/nims.yaml deleted file mode 100644 index 9fafe77a3..000000000 --- a/examples/RAG/library_rag/deploy/nims.yaml +++ /dev/null @@ -1,348 +0,0 @@ -services: - nim-llm: - container_name: nim-llm-ms - image: nvcr.io/nim/nvidia/llama-3_3-nemotron-super-49b-v1_5:1.12.0 - volumes: - - ${MODEL_DIRECTORY:-./}:/opt/nim/.cache - user: "${USERID}" - ports: - - "8999:8000" - expose: - - "8000" - environment: - NGC_API_KEY: ${NGC_API_KEY} - shm_size: 20gb - deploy: - resources: - reservations: - devices: - - driver: nvidia - #count: ${INFERENCE_GPU_COUNT:-all} - device_ids: ['${LLM_MS_GPU_ID:-1}'] - capabilities: [gpu] - healthcheck: - test: ["CMD", "python3", "-c", "import requests; requests.get('http://localhost:8000/v1/health/ready')"] - interval: 10s - timeout: 20s - retries: 100 - profiles: ["", "rag"] - - nemoretriever-embedding-ms: - container_name: nemoretriever-embedding-ms - image: nvcr.io/nim/nvidia/llama-3.2-nv-embedqa-1b-v2:1.9.0 - volumes: - - ${MODEL_DIRECTORY:-./}:/opt/nim/.cache - ports: - - "9080:8000" - expose: - - "8000" - environment: - NGC_API_KEY: ${NGC_API_KEY} - NIM_TRT_ENGINE_HOST_CODE_ALLOWED: 1 - user: "${USERID}" - shm_size: 16GB - deploy: - resources: - reservations: - devices: - - driver: nvidia - # count: ${INFERENCE_GPU_COUNT:-all} - device_ids: ['${EMBEDDING_MS_GPU_ID:-0}'] - capabilities: [gpu] - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8000/v1/health/ready"] - interval: 30s - timeout: 20s - retries: 3 - start_period: 10m - profiles: ["", "rag", "ingest", "text-embed"] - - nemoretriever-vlm-embedding-ms: - container_name: nemoretriever-vlm-embedding-ms - image: nvcr.io/nvidia/nemo-microservices/llama-3.2-nemoretriever-1b-vlm-embed-v1:1.7.0 - volumes: - - ${MODEL_DIRECTORY:-./}:/opt/nim/.cache - ports: - - "9081:8000" - expose: - - "8000" - environment: - NGC_API_KEY: ${NGC_API_KEY} - NIM_TRT_ENGINE_HOST_CODE_ALLOWED: 1 - user: "${USERID}" - shm_size: 16GB - deploy: - resources: - reservations: - devices: - - driver: nvidia - # count: ${INFERENCE_GPU_COUNT:-all} - device_ids: ['${VLM_EMBEDDING_MS_GPU_ID:-0}'] - capabilities: [gpu] - healthcheck: - test: ["CMD", "curl", "-f", 
"http://localhost:8000/v1/health/ready"] - interval: 30s - timeout: 20s - retries: 3 - start_period: 10m - profiles: ["vlm-embed"] - - nemoretriever-ranking-ms: - container_name: nemoretriever-ranking-ms - image: nvcr.io/nim/nvidia/llama-3.2-nv-rerankqa-1b-v2:1.7.0 - volumes: - - ${MODEL_DIRECTORY:-./}:/opt/nim/.cache - ports: - - "1976:8000" - expose: - - "8000" - environment: - NGC_API_KEY: ${NGC_API_KEY} - user: "${USERID}" - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:8000/v1/health/ready"] - interval: 10s - timeout: 20s - retries: 100 - deploy: - resources: - reservations: - devices: - - driver: nvidia - # count: ${INFERENCE_GPU_COUNT:-all} - device_ids: ['${RANKING_MS_GPU_ID:-0}'] - capabilities: [gpu] - profiles: ["", "rag"] - - page-elements: - image: ${YOLOX_IMAGE:-nvcr.io/nim/nvidia/nemoretriever-page-elements-v2}:${YOLOX_TAG:-1.4.0} - ports: - - "8000:8000" - - "8001:8001" - - "8002:8002" - user: root - environment: - - NIM_HTTP_API_PORT=8000 - - NIM_TRITON_LOG_VERBOSE=1 - - NVIDIA_API_KEY=${NGC_API_KEY:-nvidiaapikey} - - NGC_API_KEY=${NGC_API_KEY:-nvidiaapikey} - - CUDA_VISIBLE_DEVICES=0 - - NIM_TRITON_MODEL_BATCH_SIZE=${PAGE_ELEMENTS_BATCH_SIZE:-1} - # NIM OpenTelemetry Settings - - NIM_OTEL_SERVICE_NAME=page-elements - - NIM_OTEL_TRACES_EXPORTER=otlp - - NIM_OTEL_METRICS_EXPORTER=console - - NIM_OTEL_EXPORTER_OTLP_ENDPOINT=http://otel-collector:4318 - - NIM_ENABLE_OTEL=true - # Triton OpenTelemetry Settings - - TRITON_OTEL_URL=http://otel-collector:4318/v1/traces - - TRITON_OTEL_RATE=1 - deploy: - resources: - reservations: - devices: - - driver: nvidia - device_ids: ['${YOLOX_MS_GPU_ID:-0}'] - capabilities: [gpu] - runtime: nvidia - profiles: ["", "ingest"] - - graphic-elements: - image: ${YOLOX_GRAPHIC_ELEMENTS_IMAGE:-nvcr.io/nim/nvidia/nemoretriever-graphic-elements-v1}:${YOLOX_GRAPHIC_ELEMENTS_TAG:-1.4.0} - ports: - - "8003:8000" - - "8004:8001" - - "8005:8002" - user: root - environment: - - NIM_HTTP_API_PORT=8000 - - NIM_TRITON_LOG_VERBOSE=1 - - NVIDIA_API_KEY=${NGC_API_KEY:-nvidiaapikey} - - NGC_API_KEY=${NGC_API_KEY:-nvidiaapikey} - - CUDA_VISIBLE_DEVICES=0 - - NIM_TRITON_MODEL_BATCH_SIZE=${GRAPHIC_ELEMENTS_BATCH_SIZE:-1} - deploy: - resources: - reservations: - devices: - - driver: nvidia - device_ids: ['${YOLOX_GRAPHICS_MS_GPU_ID:-0}'] - capabilities: [gpu] - runtime: nvidia - profiles: ["", "ingest"] - - table-structure: - image: ${YOLOX_TABLE_STRUCTURE_IMAGE:-nvcr.io/nim/nvidia/nemoretriever-table-structure-v1}:${YOLOX_TABLE_STRUCTURE_TAG:-1.4.0} - ports: - - "8006:8000" - - "8007:8001" - - "8008:8002" - user: root - environment: - - NIM_HTTP_API_PORT=8000 - - NIM_TRITON_LOG_VERBOSE=1 - - NGC_API_KEY=${NGC_API_KEY:-nvidiaapikey} - - CUDA_VISIBLE_DEVICES=0 - - NIM_TRITON_MODEL_BATCH_SIZE=${TABLE_STRUCTURE_BATCH_SIZE:-1} - deploy: - resources: - reservations: - devices: - - driver: nvidia - device_ids: ['${YOLOX_TABLE_MS_GPU_ID:-0}'] - capabilities: [gpu] - runtime: nvidia - profiles: ["", "ingest"] - - paddle: - image: ${PADDLE_IMAGE:-nvcr.io/nim/baidu/paddleocr}:${PADDLE_TAG:-1.4.0} - shm_size: 2gb - ports: - - "8009:8000" - - "8010:8001" - - "8011:8002" - user: root - environment: - - NIM_HTTP_API_PORT=8000 - - NIM_TRITON_LOG_VERBOSE=1 - - NVIDIA_API_KEY=${NGC_API_KEY:-nvidiaapikey} - - NGC_API_KEY=${NGC_API_KEY:-nvidiaapikey} - - CUDA_VISIBLE_DEVICES=0 - deploy: - resources: - reservations: - devices: - - driver: nvidia - device_ids: ['${OCR_MS_GPU_ID:-${PADDLE_MS_GPU_ID:-0}}'] - capabilities: [gpu] - runtime: nvidia - profiles: ["", "ingest"] - 
- nemoretriever-ocr: - image: ${NEMORETRIEVER_OCR_IMAGE:-nvcr.io/nvidia/nemo-microservices/nemoretriever-ocr-v1}:${NEMORETRIEVER_OCR_TAG:-1.0.0} - shm_size: 2gb - ports: - - "8012:8000" - - "8013:8001" - - "8014:8002" - user: root - environment: - - NIM_HTTP_API_PORT=8000 - - NIM_TRITON_LOG_VERBOSE=1 - - NVIDIA_API_KEY=${NGC_API_KEY:-nvidiaapikey} - - NGC_API_KEY=${NGC_API_KEY:-nvidiaapikey} - - CUDA_VISIBLE_DEVICES=0 - deploy: - resources: - reservations: - devices: - - driver: nvidia - device_ids: ["${OCR_MS_GPU_ID:-${PADDLE_MS_GPU_ID:-0}}"] - capabilities: [gpu] - runtime: nvidia - profiles: ["nemoretriever-ocr"] - - # Optional NIM microservices - nemoretriever-parse: - image: ${NEMORETRIEVER_PARSE_IMAGE:-nvcr.io/nim/nvidia/nemoretriever-parse}:${NEMORETRIEVER_PARSE_TAG:-1.2} - ports: - - "8015:8000" - - "8016:8001" - - "8017:8002" - user: root - environment: - - NIM_HTTP_API_PORT=8000 - - NIM_TRITON_LOG_VERBOSE=1 - - NVIDIA_API_KEY=${NGC_API_KEY:-nvidiaapikey} - - NGC_API_KEY=${NGC_API_KEY:-nvidiaapikey} - - CUDA_VISIBLE_DEVICES=0 - deploy: - resources: - reservations: - devices: - - driver: nvidia - device_ids: ['${NEMORETRIEVER_PARSE_MS_GPU_ID:-1}'] - capabilities: [gpu] - runtime: nvidia - profiles: ["nemoretriever-parse"] - - audio: - image: ${AUDIO_IMAGE:-nvcr.io/nim/nvidia/riva-asr}:${AUDIO_TAG:-1.3.0} - shm_size: 2gb - ports: - - "8021:50051" # grpc - - "8022:9000" # http - user: root - environment: - - NIM_TAGS_SELECTOR=name=parakeet-1-1b-ctc-riva-en-us,mode=ofl - - NIM_TRITON_LOG_VERBOSE=1 - - NGC_API_KEY=${NIM_NGC_API_KEY:-${NGC_API_KEY:-ngcapikey}} - - CUDA_VISIBLE_DEVICES=0 - deploy: - resources: - reservations: - devices: - - driver: nvidia - device_ids: ["${AUDIO_MS_GPU_ID:-0}"] - capabilities: [gpu] - runtime: nvidia - profiles: ["audio"] - - vlm-ms: - container_name: nemo-vlm-microservice - image: nvcr.io/nim/nvidia/llama-3.1-nemotron-nano-vl-8b-v1:1.3.1 - volumes: - - ${MODEL_DIRECTORY:-./}:/opt/nim/.cache - ports: - - "1977:8000" - expose: - - "8000" - environment: - NGC_API_KEY: ${NGC_API_KEY} - user: "${USERID}" - healthcheck: - test: ["CMD", "python3", "-c", "import requests; requests.get('http://localhost:8000/v1/health/ready')"] - interval: 10s - timeout: 20s - retries: 100 - deploy: - resources: - reservations: - devices: - - driver: nvidia - # count: ${INFERENCE_GPU_COUNT:-all} - device_ids: ['${VLM_MS_GPU_ID:-5}'] - capabilities: [gpu] - profiles: ["vlm"] - - nim-llm-llama-8b: - container_name: nim-llm-llama-8b - image: nvcr.io/nim/meta/llama-3.1-8b-instruct:1.8.6 - volumes: - - ${MODEL_DIRECTORY:-./}:/opt/nim/.cache - user: "${USERID}" - ports: - - "8991:8000" - expose: - - "8000" - environment: - NGC_API_KEY: ${NGC_API_KEY} - shm_size: 20gb - deploy: - resources: - reservations: - devices: - - driver: nvidia - #count: ${INFERENCE_GPU_COUNT:-all} - device_ids: ['${LLM_8B_MS_GPU_ID:-6}'] - capabilities: [gpu] - healthcheck: - test: ["CMD", "python3", "-c", "import requests; requests.get('http://localhost:8000/v1/health/ready')"] - interval: 10s - timeout: 20s - retries: 100 - profiles: ["llama-8b"] - -networks: - default: - name: nvidia-rag diff --git a/examples/RAG/library_rag/deploy/vectordb.yaml b/examples/RAG/library_rag/deploy/vectordb.yaml deleted file mode 100644 index ed9bf8403..000000000 --- a/examples/RAG/library_rag/deploy/vectordb.yaml +++ /dev/null @@ -1,102 +0,0 @@ -services: - - # Milvus can be made GPU accelerated by uncommenting the lines as specified below - milvus: - container_name: milvus-standalone - image: 
milvusdb/milvus:${MILVUS_VERSION:-v2.6.0-gpu} # milvusdb/milvus:v2.6.0 for CPU - command: ["milvus", "run", "standalone"] - environment: - ETCD_ENDPOINTS: etcd:2379 - MINIO_ADDRESS: minio:9010 - KNOWHERE_GPU_MEM_POOL_SIZE: 2048;4096 - volumes: - - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus - # healthcheck: - # test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"] - # interval: 30s - # start_period: 90s - # timeout: 20s - # retries: 3 - ports: - - "19530:19530" - - "9091:9091" - depends_on: - - "etcd" - - "minio" - # Comment out this section if CPU based image is used and set below env variables to False - # export APP_VECTORSTORE_ENABLEGPUSEARCH=False - # export APP_VECTORSTORE_ENABLEGPUINDEX=False - deploy: - resources: - reservations: - devices: - - driver: nvidia - capabilities: ["gpu"] - # count: ${INFERENCE_GPU_COUNT:-all} - device_ids: ['${VECTORSTORE_GPU_DEVICE_ID:-0}'] - profiles: ["", "milvus"] - - etcd: - container_name: milvus-etcd - image: quay.io/coreos/etcd:v3.6.4 - environment: - - ETCD_AUTO_COMPACTION_MODE=revision - - ETCD_AUTO_COMPACTION_RETENTION=1000 - - ETCD_QUOTA_BACKEND_BYTES=4294967296 - - ETCD_SNAPSHOT_COUNT=50000 - volumes: - - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd - command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd - healthcheck: - test: ["CMD", "etcdctl", "endpoint", "health"] - interval: 30s - timeout: 20s - retries: 3 - profiles: ["", "milvus"] - - minio: - container_name: milvus-minio - image: minio/minio:RELEASE.2025-07-23T15-54-02Z - environment: - MINIO_ACCESS_KEY: minioadmin - MINIO_SECRET_KEY: minioadmin - ports: - - "9011:9011" - - "9010:9010" - volumes: - - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data - command: minio server /minio_data --console-address ":9011" --address ":9010" - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:9010/minio/health/live"] - interval: 30s - timeout: 20s - retries: 3 - profiles: ["", "milvus", "elasticsearch", "minio"] - - elasticsearch: - container_name: elasticsearch - image: "docker.elastic.co/elasticsearch/elasticsearch:9.0.3" - ports: - - 9200:9200 - volumes: - # Run "sudo chown -R 1000:1000 deploy/compose/volumes/elasticsearch/" to fix permissions - - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/elasticsearch:/usr/share/elasticsearch/data - restart: on-failure - environment: - - discovery.type=single-node - - "ES_JAVA_OPTS=-Xms1024m -Xmx1024m" - - xpack.security.enabled=false - - xpack.license.self_generated.type=basic - - network.host=0.0.0.0 - - cluster.routing.allocation.disk.threshold_enabled=false - hostname: elasticsearch - healthcheck: - test: ["CMD", "curl", "-s", "-f", "http://localhost:9200/_cat/health"] - interval: 10s - timeout: 1s - retries: 10 - profiles: ["elasticsearch"] - -networks: - default: - name: nvidia-rag \ No newline at end of file diff --git a/examples/RAG/library_rag/pyproject.toml b/examples/RAG/library_rag/pyproject.toml deleted file mode 100644 index 3ebf38a33..000000000 --- a/examples/RAG/library_rag/pyproject.toml +++ /dev/null @@ -1,25 +0,0 @@ -[build-system] -build-backend = "setuptools.build_meta" -requires = ["setuptools >= 64", "setuptools-scm>=8"] - -[tool.setuptools_scm] -# NAT uses the --first-parent flag to avoid tags from previous releases which have been merged into the develop branch -# from causing an unexpected version change. This can be safely removed if developing outside of the NAT repository. 
-git_describe_command = "git describe --long --first-parent" -root = "../../.." - -[project] -name = "library_rag" -dynamic = ["version"] -dependencies = [ - "nvidia-nat[langchain]~=1.3", -] -requires-python = ">=3.11,<3.14" -description = "Custom NeMo Agent Toolkit Workflow" -classifiers = ["Programming Language :: Python"] - -[tool.uv.sources] -nvidia-nat = { path = "../../..", editable = true } - -[project.entry-points.'nat.components'] -library_rag = "library_rag.register" \ No newline at end of file diff --git a/examples/RAG/library_rag/src/library_rag/configs/config.yml b/examples/RAG/library_rag/src/library_rag/configs/config.yml deleted file mode 100644 index d4bd4351b..000000000 --- a/examples/RAG/library_rag/src/library_rag/configs/config.yml +++ /dev/null @@ -1,31 +0,0 @@ -general: - use_uvloop: true - - -functions: - library_rag_tool: - _type: library_rag - base_url: "http://localhost:8081" - reranker_top_k: 2 - vdb_top_k: 10 - vdb_endpoint: "http://milvus:19530" - collection_names: ["cuda"] - enable_query_rewriting: True - enable_reranker: True - - #description: Retrieve documents given the input query - -llms: - nim_llm: - _type: nim - model_name: meta/llama-3.3-70b-instruct - temperature: 0 - max_tokens: 4096 - top_p: 1 - -workflow: - _type: tool_calling_agent - tool_names: - - library_rag_tool - llm_name: nim_llm - verbose: true \ No newline at end of file diff --git a/examples/RAG/library_rag/src/library_rag/library_rag_function.py b/examples/RAG/library_rag/src/library_rag/library_rag_function.py deleted file mode 100644 index b9c66576e..000000000 --- a/examples/RAG/library_rag/src/library_rag/library_rag_function.py +++ /dev/null @@ -1,82 +0,0 @@ -import logging - -from pydantic import Field - -from nat.builder.builder import Builder -from nat.builder.function_info import FunctionInfo -from nat.cli.register_workflow import register_function -from nat.data_models.function import FunctionBaseConfig - -logger = logging.getLogger(__name__) - - -class LibraryRagFunctionConfig(FunctionBaseConfig, name="library_rag"): - """ - NAT function template. Please update the description. 
- """ - base_url: str = Field(description="Local / Custom RAG URL") - #prompt: str = Field(default="Hello", description="The prompt") - reranker_top_k: int = Field(default=2, description="Maximum number of records to be retrieved") #TODO: Modify the descriptions - vdb_top_k: int = Field(default=10, description="Maximum number of records to be retrieved") - vdb_endpoint: str = Field(default="", description="Maximum number of records to be retrieved") - collection_names: list[str] = Field(default="1", description="Maximum number of records to be retrieved") - enable_query_rewriting: bool = Field(default=True, description="Maximum number of records to be retrieved") - enable_reranker: bool = Field(default=True, description="Maximum number of records to be retrieved") - -@register_function(config_type=LibraryRagFunctionConfig) -def library_rag_function( - config: LibraryRagFunctionConfig, builder: Builder -): - import aiohttp - # Implement your function logic here - async def _response_fn(query: str) -> str: - url = f"{config.base_url}/v1/search" - payload={ - "query": f"{query}", - "reranker_top_k": f"{config.reranker_top_k}", - "vdb_top_k": f"{config.vdb_top_k}", - "vdb_endpoint": f"{config.vdb_endpoint}", - "collection_names": f"{config.collection_names}", # Multiple collection retrieval can be used by passing multiple collection names - "enable_query_rewriting": f"{config.enable_query_rewriting}", - "enable_reranker": f"{config.enable_reranker}",} - - logger.info("Your query is %s", query) - - - async with aiohttp.ClientSession() as session: - try: - logger.debug("Sending request to the RAG endpoint %s", url) - - #async with session.post(url=url, json=payload) as response: - results = await session.post(url=url, json=payload).json() - - logger.info("The results are %s", results) - - if results["total_results"] == 0: - yield "" - - # parse docs from LangChain/LangGraph Document object to string - parsed_docs = [] - - # iterate over results and store parsed content - - num_records = results["total_results"] - records = results["results"] - for i in range(num_records): - document_id = records[i]["document_id"] - content = records[i]["content"] - parsed_document = f' document_id={document_id}\n"{content}\n' - parsed_docs.append(parsed_document) - - # combine parsed documents into a single string - internal_search_docs = "\n\n---\n\n".join(parsed_docs) - yield internal_search_docs - - - except aiohttp.ClientError as e: - print(f"Error: {e}") - - yield FunctionInfo.from_fn( - _response_fn, - description=("This tool retrieves relevant documents for a given user query." 
- "This will return relevant documents from the selected collection.")) diff --git a/examples/RAG/library_rag/tests/__init__.py b/examples/RAG/library_rag/tests/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/examples/RAG/library_rag/tests/conftest.py b/examples/RAG/library_rag/tests/conftest.py deleted file mode 100644 index 3dab6d46c..000000000 --- a/examples/RAG/library_rag/tests/conftest.py +++ /dev/null @@ -1,78 +0,0 @@ -import pytest -import asyncio -from unittest.mock import Mock, AsyncMock -from aiohttp import ClientSession -from aioresponses import aioresponses - -from library_rag.library_rag_function import LibraryRagFunctionConfig -from nat.builder.builder import Builder - - -@pytest.fixture -def event_loop(): - """Create an instance of the default event loop for the test session.""" - loop = asyncio.get_event_loop_policy().new_event_loop() - yield loop - loop.close() - - -@pytest.fixture -def mock_builder(): - """Mock Builder instance for testing.""" - return Mock(spec=Builder) - - -@pytest.fixture -def default_config(): - """Default configuration for testing.""" - return LibraryRagFunctionConfig( - base_url="http://localhost:8081", - reranker_top_k=2, - vdb_top_k=10, - vdb_endpoint="http://milvus:19530", - collection_names=["test_collection"], - enable_query_rewriting=True, - enable_reranker=True - ) - - -@pytest.fixture -def minimal_config(): - """Minimal configuration for testing.""" - return LibraryRagFunctionConfig( - base_url="http://localhost:8081" - ) - - -@pytest.fixture -def sample_rag_response(): - """Sample RAG API response.""" - return { - "total_results": 2, - "results": [ - { - "document_id": "doc_1", - "content": "This is the first document content about CUDA programming." - }, - { - "document_id": "doc_2", - "content": "This is the second document content about GPU acceleration." 
- } - ] - } - - -@pytest.fixture -def empty_rag_response(): - """Empty RAG API response.""" - return { - "total_results": 0, - "results": [] - } - - -@pytest.fixture -def mock_aiohttp_session(): - """Mock aiohttp session for testing.""" - with aioresponses() as m: - yield m diff --git a/examples/RAG/library_rag/tests/test_configs/test_config.yml b/examples/RAG/library_rag/tests/test_configs/test_config.yml deleted file mode 100644 index b76572d8c..000000000 --- a/examples/RAG/library_rag/tests/test_configs/test_config.yml +++ /dev/null @@ -1,28 +0,0 @@ -general: - use_uvloop: true - -functions: - library_rag_tool: - _type: library_rag - base_url: "http://localhost:8081" - reranker_top_k: 3 - vdb_top_k: 15 - vdb_endpoint: "http://test-milvus:19530" - collection_names: ["test_collection_1", "test_collection_2"] - enable_query_rewriting: False - enable_reranker: True - -llms: - test_llm: - _type: nim - model_name: meta/llama-3.3-70b-instruct - temperature: 0.1 - max_tokens: 2048 - top_p: 0.9 - -workflow: - _type: tool_calling_agent - tool_names: - - library_rag_tool - llm_name: test_llm - verbose: false diff --git a/examples/RAG/library_rag/tests/test_integration.py b/examples/RAG/library_rag/tests/test_integration.py deleted file mode 100644 index 99e49f8f1..000000000 --- a/examples/RAG/library_rag/tests/test_integration.py +++ /dev/null @@ -1,112 +0,0 @@ -import pytest -import asyncio -import aiohttp -import os -import json -from unittest.mock import Mock, patch -import yaml -from pathlib import Path - -from library_rag.library_rag_function import library_rag_function, LibraryRagFunctionConfig -from nat.builder.builder import Builder - - -class TestIntegration: - """Integration tests for the library RAG function.""" - - async def fetch_health_status(self, rag_endpoint, check_dependencies=True) -> bool: - url = f"{rag_endpoint}/v1/health" - params = {"check_dependencies": str(check_dependencies)} - - async with aiohttp.ClientSession() as session: - async with session.get(url, params=params) as response: - response.raise_for_status() - result = await response.json() - - if result.get("message") == "Service is up": - return True - else: - print("Basic health check failed") - return False - - - @pytest.fixture - def config_from_yaml(self): - """Load configuration from YAML file for integration testing.""" - # Load the actual config file - config_path = Path(__file__).parent.parent / "src" / "library_rag" / "configs" / "config.yml" - - with open(config_path, 'r') as f: - yaml_config = yaml.safe_load(f) - - # Extract the function config - function_config = yaml_config['functions']['library_rag_tool'] - - # Remove the _type field as it's not part of our config model - function_config.pop('_type', None) - - return LibraryRagFunctionConfig(**function_config) - - - @pytest.mark.asyncio - async def test_end_to_end_workflow_mock(self, config_from_yaml): - """Test end-to-end workflow with mocked HTTP responses.""" - mock_builder = Mock(spec=Builder) - - # Mock a realistic RAG response - mock_response = { - "total_results": 2, - "results": [ - { - "document_id": "cuda_guide_1", - "content": "CUDA (Compute Unified Device Architecture) is a parallel computing platform and API model created by NVIDIA." - }, - { - "document_id": "cuda_guide_2", - "content": "CUDA allows software developers to use a CUDA-enabled graphics processing unit for general purpose processing." 
- } - ] - } - - from aioresponses import aioresponses - - url = f"{config_from_yaml.base_url}/v1/search" - - with aioresponses() as mock_http: - mock_http.post(url, payload=mock_response) - - # Initialize the function - result_generator = library_rag_function(config_from_yaml, mock_builder) - - # Get the response function - response_function = None - async for item in result_generator: - if hasattr(item, 'fn'): - response_function = item.fn - break - - assert response_function is not None - - # Test a realistic query - query = "What is CUDA and how does it work?" - - results = [] - async for result in response_function(query): - results.append(result) - - # Verify the response structure - assert len(results) >= 1 - document_content = results[0] - - # Check that both documents are included - assert "cuda_guide_1" in document_content - assert "cuda_guide_2" in document_content - assert "CUDA (Compute Unified Device Architecture)" in document_content - assert "graphics processing unit" in document_content - - # Check proper formatting - assert document_content.count("") == 2 - assert document_content.count("") == 2 - assert "\n\n---\n\n" in document_content - - \ No newline at end of file diff --git a/examples/rag_lib/README.md b/examples/rag_lib/README.md new file mode 100644 index 000000000..c0db83700 --- /dev/null +++ b/examples/rag_lib/README.md @@ -0,0 +1,80 @@ +# NVIDIA RAG Python Package Usage Guide + +This guide demonstrates how to use a NAT agent with the NVIDIA RAG Python client as a tool. +# Get Started With NVIDIA RAG Blueprint + +Clone the RAG repo from here: https://github.com/NVIDIA-AI-Blueprints/rag + +Install the RAG Library using one of the following options: + +# (Option 1) Build the wheel from source and install the Nvidia RAG wheel +uv build +uv pip install dist/nvidia_rag-2.2.1-py3-none-any.whl[all] + +# (Option 2) Install the package in editable (development) mode from source +uv pip install -e .[all] + +# (Option 3) Install the prebuilt wheel file from pypi. This does not require you to clone the repo. +uv pip install nvidia-rag[all] + +Open the library usage guide in this notebook https://github.com/NVIDIA-AI-Blueprints/rag/blob/main/notebooks/rag_library_usage.ipynb and follow the steps to deploy your RAG server and ingest your documents (skip the installation steps as we have already installed the library) + +An example file that you can ingest is provided under `nemo-agent-toolkit/examples/rag_lib/data/cuda.txt` + +#### Configure Your Agent + +Configure your Agent to use the Milvus collections for RAG. We have pre-configured a configuration file for you in `examples/RAG/simple_rag/configs/milvus_rag_config.yml`. You can modify this file to point to your Milvus instance and collections or add tools to your agent. The agent, by default, is a `tool_calling` agent that can be used to interact with the retriever component. The configuration file is shown below. 
You can also modify your agent to be another one of the NeMo Agent toolkit pre-built agent implementations, such as the `react_agent`.
+
+ ```yaml
+ general:
+   use_uvloop: true
+
+ functions:
+   rag_tool:
+     _type: rag_lib
+     base_url: "http://localhost:19530"
+     vdb_top_k: 20
+     reranker_top_k: 10
+     collection_names: ["test_library"]
+     topic: Retrieve relevant documents from the database relevant to the query
+
+ llms:
+   nim_llm:
+     _type: nim
+     model_name: meta/llama-3.3-70b-instruct
+     temperature: 0
+     max_tokens: 4096
+     top_p: 1
+
+ workflow:
+   _type: tool_calling_agent
+   tool_names:
+     - rag_tool
+   verbose: true
+   llm_name: nim_llm
+ ```
+
+ If you have a different Milvus instance or collection names, modify `base_url` and `collection_names` in the config file to point to your instance and collections.
+ You can also adjust the retrieval parameters such as `vdb_top_k` and `reranker_top_k`, and add additional functions as tools for your agent in the `functions` section.
+
+#### Install the Workflow
+```bash
+uv pip install -e examples/rag_lib
+```
+
+#### Run the Workflow
+
+```bash
+nat run --config_file examples/rag_lib/src/rag_lib/configs/config.yml --input "How do I install CUDA"
+```
+
+The expected workflow result of running the above command is:
+```console
+['To install CUDA, you typically need to: \n1. Verify you have a CUDA-capable GPU and a supported version of your operating system.\n2. Download the NVIDIA CUDA Toolkit from the official NVIDIA website.\n3. Choose an installation method, such as a local repository installation or a network repository installation, depending on your system.\n4. Follow the specific instructions for your operating system, which may include installing local repository packages, enabling network repositories, or running installer scripts.\n5. Reboot your system and perform post-installation actions, such as setting up your environment and verifying the installation by running sample projects. \n\nPlease refer to the official NVIDIA CUDA documentation for detailed instructions tailored to your specific operating system and distribution.']
+```
+
diff --git a/examples/RAG/library_rag/data/cuda.txt b/examples/rag_lib/data/cuda.txt
similarity index 100%
rename from examples/RAG/library_rag/data/cuda.txt
rename to examples/rag_lib/data/cuda.txt
diff --git a/examples/rag_library_mode/pyproject.toml b/examples/rag_lib/pyproject.toml
similarity index 79%
rename from examples/rag_library_mode/pyproject.toml
rename to examples/rag_lib/pyproject.toml
index cab960dea..e64b296db 100644
--- a/examples/rag_library_mode/pyproject.toml
+++ b/examples/rag_lib/pyproject.toml
@@ -9,10 +9,10 @@ git_describe_command = "git describe --long --first-parent"
 root = "../.."
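Under the hood, the `rag_tool` shown in the configuration above is a thin wrapper around the NVIDIA RAG Python client rather than an HTTP call to a running RAG server. The sketch below is a minimal stand-alone illustration of that library-mode call; the `NvidiaRAG` import path and the exact keyword arguments (`collection_names`, `vdb_top_k`, `reranker_top_k`, `vdb_endpoint`) are assumptions based on `rag_lib_function.py` and the library usage notebook, so treat it as a starting point rather than a definitive API reference.

```python
# Minimal library-mode sketch (not the packaged NAT tool itself).
# Assumes Milvus is reachable at localhost:19530 and that the "test_library"
# collection was already ingested by following the RAG library usage notebook.
from nvidia_rag import NvidiaRAG  # assumed import path from the nvidia-rag package

rag = NvidiaRAG()

# Pull vdb_top_k candidate chunks from Milvus, then keep reranker_top_k of them,
# mirroring the settings in the config.yml example above.
response = rag.search(
    query="How do I install CUDA?",
    collection_names=["test_library"],
    vdb_top_k=20,
    reranker_top_k=10,
    vdb_endpoint="http://localhost:19530",
)

# The packaged rag_tool formats the returned citations into a single string for
# the agent; here the raw response is printed for inspection.
print(response)
```

The NAT workflow adds the agent loop and tool registration on top of this call; the configuration file controls the same parameters without touching the code.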
[project] -name = "rag_library_mode" +name = "rag_lib" dynamic = ["version"] dependencies = [ - "nvidia-nat[langchain]~=1.3", + "nvidia-nat[langchain]~=0.1", ] requires-python = ">=3.11,<3.13" description = "Custom NeMo Agent Toolkit Workflow" @@ -21,8 +21,5 @@ classifiers = ["Programming Language :: Python"] [tool.uv.sources] nvidia-nat = { path = "../..", editable = true } -[project.entry-points.'nat.plugins'] -rag_library_mode = "rag_library_mode.register" - [project.entry-points.'nat.components'] -rag_library_mode = "rag_library_mode.register" \ No newline at end of file +rag_lib = "rag_lib.register" \ No newline at end of file diff --git a/examples/RAG/library_rag/src/library_rag/__init__.py b/examples/rag_lib/src/rag_lib/__init__.py similarity index 100% rename from examples/RAG/library_rag/src/library_rag/__init__.py rename to examples/rag_lib/src/rag_lib/__init__.py diff --git a/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/configs/config.yml b/examples/rag_lib/src/rag_lib/configs/config.yml similarity index 79% rename from examples/rag_library_mode/rag_library_mode/src/rag_library_mode/configs/config.yml rename to examples/rag_lib/src/rag_lib/configs/config.yml index 768c99859..caa70cc89 100644 --- a/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/configs/config.yml +++ b/examples/rag_lib/src/rag_lib/configs/config.yml @@ -4,11 +4,11 @@ general: functions: rag_tool: - _type: rag_library_mode + _type: rag_lib base_url: "http://localhost:19530" - vdb_top_k: 10 - reranker_top_k: 100 - collection_names: ["cuda_docs"] + vdb_top_k: 20 + reranker_top_k: 10 + collection_names: ["test_library"] topic: Retrieve relevant documents from the database relevant to the query diff --git a/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/rag_library_mode_function.py b/examples/rag_lib/src/rag_lib/rag_lib_function.py similarity index 81% rename from examples/rag_library_mode/rag_library_mode/src/rag_library_mode/rag_library_mode_function.py rename to examples/rag_lib/src/rag_lib/rag_lib_function.py index 5f374f4ab..d6817184c 100644 --- a/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/rag_library_mode_function.py +++ b/examples/rag_lib/src/rag_lib/rag_lib_function.py @@ -17,7 +17,7 @@ logger = logging.getLogger(__name__) -class RagLibraryModeFunctionConfig(FunctionBaseConfig, name="rag_library_mode"): +class RagLibFunctionConfig(FunctionBaseConfig, name="rag_lib"): """ This tool retrieves relevant documents for a given user query. The input query is mapped to the most appropriate Milvus collection database. This will return relevant documents from the selected collection. 
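Before wiring the tool into an agent, it can help to confirm that the target collection is actually visible to the RAG library. A small sketch of such a check follows; it reuses the `NvidiaRAGIngestor.get_documents()` call that the earlier draft of this function issued for debugging (removed in the next hunk), while the import path and the single-collection argument are assumptions.

```python
# Hypothetical sanity check that the ingested collection can be read back
# before the agent starts issuing queries against it.
from nvidia_rag import NvidiaRAGIngestor  # assumed import path

ingestor = NvidiaRAGIngestor()

# The removed debug code passed the configured collection list; a single
# collection name matching the config.yml example is used here.
docs = ingestor.get_documents(
    collection_name="test_library",
    vdb_endpoint="http://localhost:19530",
)

print(docs)  # expect metadata for cuda.txt, or whatever was ingested
```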
@@ -30,9 +30,9 @@ class RagLibraryModeFunctionConfig(FunctionBaseConfig, name="rag_library_mode"): -@register_function(config_type=RagLibraryModeFunctionConfig) -async def rag_library_mode_function( - config: RagLibraryModeFunctionConfig, builder: Builder +@register_function(config_type=RagLibFunctionConfig) +async def rag_lib_function( + config: RagLibFunctionConfig, builder: Builder ): def parse_search_citations(citations): @@ -54,14 +54,6 @@ async def _response_fn(query: str) -> str: # Process the input_message and generate output rag = NvidiaRAG() - ingestor = NvidiaRAGIngestor() - - # Just to debug - response = ingestor.get_documents( - collection_name=config.collection_names, - vdb_endpoint=config.base_url, - ) - logger.info(f"***** {response}") return parse_search_citations(rag.search( query=f"{query}", @@ -75,4 +67,4 @@ async def _response_fn(query: str) -> str: except GeneratorExit: logger.warning("Function exited early!") finally: - logger.info("Cleaning up rag_library_mode workflow.") \ No newline at end of file + logger.info("Cleaning up rag_lib_mode workflow.") \ No newline at end of file diff --git a/examples/RAG/library_rag/src/library_rag/register.py b/examples/rag_lib/src/rag_lib/register.py similarity index 65% rename from examples/RAG/library_rag/src/library_rag/register.py rename to examples/rag_lib/src/rag_lib/register.py index cd7276447..0bff1ab8c 100644 --- a/examples/RAG/library_rag/src/library_rag/register.py +++ b/examples/rag_lib/src/rag_lib/register.py @@ -1,4 +1,4 @@ # flake8: noqa # Import any tools which need to be automatically registered here -from library_rag import library_rag_function \ No newline at end of file +from rag_lib import rag_lib_function \ No newline at end of file diff --git a/examples/rag_library_mode/rag_library_mode/pyproject.toml b/examples/rag_library_mode/rag_library_mode/pyproject.toml deleted file mode 100644 index cab960dea..000000000 --- a/examples/rag_library_mode/rag_library_mode/pyproject.toml +++ /dev/null @@ -1,28 +0,0 @@ -[build-system] -build-backend = "setuptools.build_meta" -requires = ["setuptools >= 64", "setuptools-scm>=8"] - -[tool.setuptools_scm] -# NAT uses the --first-parent flag to avoid tags from previous releases which have been merged into the develop branch -# from causing an unexpected version change. This can be safely removed if developing outside of the NAT repository. -git_describe_command = "git describe --long --first-parent" -root = "../.." 
- -[project] -name = "rag_library_mode" -dynamic = ["version"] -dependencies = [ - "nvidia-nat[langchain]~=1.3", -] -requires-python = ">=3.11,<3.13" -description = "Custom NeMo Agent Toolkit Workflow" -classifiers = ["Programming Language :: Python"] - -[tool.uv.sources] -nvidia-nat = { path = "../..", editable = true } - -[project.entry-points.'nat.plugins'] -rag_library_mode = "rag_library_mode.register" - -[project.entry-points.'nat.components'] -rag_library_mode = "rag_library_mode.register" \ No newline at end of file diff --git a/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/__init__.py b/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/docker-compose-ingestor-server.yaml b/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/docker-compose-ingestor-server.yaml deleted file mode 100644 index f13ab4a83..000000000 --- a/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/docker-compose-ingestor-server.yaml +++ /dev/null @@ -1,211 +0,0 @@ -services: - - # Main ingestor server which is responsible for ingestion - ingestor-server: - container_name: ingestor-server - image: nvcr.io/nvstaging/blueprint/ingestor-server:${TAG:-2.3.0.rc0} - build: - # Set context to repo's root directory - context: ../../ - dockerfile: ./src/nvidia_rag/ingestor_server/Dockerfile - # start the server on port 8082 with 4 workers for improved latency on concurrent requests. - command: --port 8082 --host 0.0.0.0 --workers 1 - - volumes: - # Mount the prompt.yaml file to the container, path should be absolute - - ${PROMPT_CONFIG_FILE}:${PROMPT_CONFIG_FILE} - - # Common customizations to the pipeline can be controlled using env variables - environment: - # Path to example directory relative to root - EXAMPLE_PATH: 'src/nvidia_rag/ingestor_server' - - # Absolute path to custom prompt.yaml file - PROMPT_CONFIG_FILE: ${PROMPT_CONFIG_FILE:-/prompt.yaml} - - ##===Vector DB specific configurations=== - # URL on which vectorstore is hosted - # For custom operators, point to your service (e.g., http://your-custom-vdb:1234) - APP_VECTORSTORE_URL: ${APP_VECTORSTORE_URL:-http://milvus:19530} - # Type of vectordb used to store embedding. Supported built-ins: "milvus", "elasticsearch". - # You can also provide your custom value (e.g., "your_custom_vdb") when you register it in `_get_vdb_op`. - APP_VECTORSTORE_NAME: ${APP_VECTORSTORE_NAME:-"milvus"} - # Type of vectordb search to be used - APP_VECTORSTORE_SEARCHTYPE: ${APP_VECTORSTORE_SEARCHTYPE:-"dense"} # Can be dense or hybrid - # Boolean to enable GPU index for milvus vectorstore specific to nvingest - APP_VECTORSTORE_ENABLEGPUINDEX: ${APP_VECTORSTORE_ENABLEGPUINDEX:-True} - # Boolean to control GPU search for milvus vectorstore specific to nvingest - APP_VECTORSTORE_ENABLEGPUSEARCH: ${APP_VECTORSTORE_ENABLEGPUSEARCH:-True} - # vectorstore collection name to store embeddings - COLLECTION_NAME: ${COLLECTION_NAME:-multimodal_data} - - ##===MINIO specific configurations=== - MINIO_ENDPOINT: "minio:9010" - MINIO_ACCESSKEY: "minioadmin" - MINIO_SECRETKEY: "minioadmin" - - NGC_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"} - NVIDIA_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"} - - ##===Embedding Model specific configurations=== - # url on which embedding model is hosted. 
If "", Nvidia hosted API is used - APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000"} - APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2} - # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1) - # APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-vlm-embedding-ms:8000"} - # APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1} - APP_EMBEDDINGS_DIMENSIONS: ${APP_EMBEDDINGS_DIMENSIONS:-2048} - - ##===NV-Ingest Connection Configurations======= - APP_NVINGEST_MESSAGECLIENTHOSTNAME: ${APP_NVINGEST_MESSAGECLIENTHOSTNAME:-"nv-ingest-ms-runtime"} - APP_NVINGEST_MESSAGECLIENTPORT: ${APP_NVINGEST_MESSAGECLIENTPORT:-7670} - - ##===NV-Ingest Extract Configurations========== - APP_NVINGEST_EXTRACTTEXT: ${APP_NVINGEST_EXTRACTTEXT:-True} - APP_NVINGEST_EXTRACTINFOGRAPHICS: ${APP_NVINGEST_EXTRACTINFOGRAPHICS:-False} - APP_NVINGEST_EXTRACTTABLES: ${APP_NVINGEST_EXTRACTTABLES:-True} - APP_NVINGEST_EXTRACTCHARTS: ${APP_NVINGEST_EXTRACTCHARTS:-True} - APP_NVINGEST_EXTRACTIMAGES: ${APP_NVINGEST_EXTRACTIMAGES:-False} - APP_NVINGEST_EXTRACTPAGEASIMAGE: ${APP_NVINGEST_EXTRACTPAGEASIMAGE:-False} - APP_NVINGEST_STRUCTURED_ELEMENTS_MODALITY: ${APP_NVINGEST_STRUCTURED_ELEMENTS_MODALITY:-""} # Select from "image", "text_image" - APP_NVINGEST_IMAGE_ELEMENTS_MODALITY: ${APP_NVINGEST_IMAGE_ELEMENTS_MODALITY:-""} # Select from "image" - APP_NVINGEST_PDFEXTRACTMETHOD: ${APP_NVINGEST_PDFEXTRACTMETHOD:-None} # Select from pdfium, nemoretriever_parse, None - # Extract text by "page" only recommended for documents with pages like .pdf, .docx, etc. - APP_NVINGEST_TEXTDEPTH: ${APP_NVINGEST_TEXTDEPTH:-page} # extract by "page" or "document" - - ##===NV-Ingest Splitting Configurations======== - APP_NVINGEST_CHUNKSIZE: ${APP_NVINGEST_CHUNKSIZE:-512} - APP_NVINGEST_CHUNKOVERLAP: ${APP_NVINGEST_CHUNKOVERLAP:-150} - APP_NVINGEST_ENABLEPDFSPLITTER: ${APP_NVINGEST_ENABLEPDFSPLITTER:-True} - APP_NVINGEST_SEGMENTAUDIO: ${APP_NVINGEST_SEGMENTAUDIO:-False} # Enable audio segmentation for NV Ingest - - ##===NV-Ingest Caption Model configurations==== - APP_NVINGEST_CAPTIONMODELNAME: ${APP_NVINGEST_CAPTIONMODELNAME:-"nvidia/llama-3.1-nemotron-nano-vl-8b-v1"} - # Incase of nvidia-hosted caption model, use the endpoint url as - https://integrate.api.nvidia.com/v1 - APP_NVINGEST_CAPTIONENDPOINTURL: ${APP_NVINGEST_CAPTIONENDPOINTURL:-"http://vlm-ms:8000/v1/chat/completions"} - - # Choose whether to store the extracted content in the vector store for citation support - ENABLE_CITATIONS: ${ENABLE_CITATIONS:-True} - - # Choose the summary model to use for document summary - SUMMARY_LLM: ${SUMMARY_LLM:-nvidia/llama-3_3-nemotron-super-49b-v1_5} - SUMMARY_LLM_SERVERURL: ${SUMMARY_LLM_SERVERURL-"nim-llm:8000"} - SUMMARY_LLM_MAX_CHUNK_LENGTH: ${SUMMARY_LLM_MAX_CHUNK_LENGTH:-50000} - SUMMARY_CHUNK_OVERLAP: ${SUMMARY_CHUNK_OVERLAP:-200} - # Log level for server, supported level NOTSET, DEBUG, INFO, WARN, ERROR, CRITICAL - LOGLEVEL: ${LOGLEVEL:-INFO} - - # [Optional] Redis configuration for task status and result storage - REDIS_HOST: ${REDIS_HOST:-redis} - REDIS_PORT: ${REDIS_PORT:-6379} - REDIS_DB: ${REDIS_DB:-0} - - # Bulk upload to MinIO - ENABLE_MINIO_BULK_UPLOAD: ${ENABLE_MINIO_BULK_UPLOAD:-True} - TEMP_DIR: ${TEMP_DIR:-/tmp-data} - - # NV-Ingest Batch Mode Configurations - NV_INGEST_FILES_PER_BATCH: ${NV_INGEST_FILES_PER_BATCH:-16} - NV_INGEST_CONCURRENT_BATCHES: ${NV_INGEST_CONCURRENT_BATCHES:-4} - - 
ports: - - "8082:8082" - expose: - - "8082" - shm_size: 5gb - - redis: - image: "redis/redis-stack:7.2.0-v18" - ports: - - "6379:6379" - - nv-ingest-ms-runtime: - image: nvcr.io/nvstaging/nim/nv-ingest:25.8.0-RC6 - cpuset: "0-15" - volumes: - - ${DATASET_ROOT:-./data}:/workspace/data - ports: - # HTTP API - - "7670:7670" - # Simple Broker - - "7671:7671" - cap_add: - - sys_nice - environment: - # Audio model not used in this RAG version - - AUDIO_GRPC_ENDPOINT=audio:50051 - - AUDIO_INFER_PROTOCOL=grpc - - CUDA_VISIBLE_DEVICES=0 - - MAX_INGEST_PROCESS_WORKERS=${MAX_INGEST_PROCESS_WORKERS:-16} - - EMBEDDING_NIM_MODEL_NAME=${EMBEDDING_NIM_MODEL_NAME:-${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2}} - # Incase of self-hosted embedding model, use the endpoint url as - https://integrate.api.nvidia.com/v1 - - EMBEDDING_NIM_ENDPOINT=${EMBEDDING_NIM_ENDPOINT:-${APP_EMBEDDINGS_SERVERURL-http://nemoretriever-embedding-ms:8000/v1}} - # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1) - # - EMBEDDING_NIM_MODEL_NAME=${EMBEDDING_NIM_MODEL_NAME:-${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1}} - # - EMBEDDING_NIM_ENDPOINT=${EMBEDDING_NIM_ENDPOINT:-${APP_EMBEDDINGS_SERVERURL-http://nemoretriever-vlm-embedding-ms:8000/v1}} - - INGEST_LOG_LEVEL=WARNING - - INGEST_RAY_LOG_LEVEL=PRODUCTION - - INGEST_EDGE_BUFFER_SIZE=64 - - INGEST_DYNAMIC_MEMORY_THRESHOLD=0.8 - - INGEST_DISABLE_DYNAMIC_SCALING=${INGEST_DISABLE_DYNAMIC_SCALING:-True} - - INSTALL_AUDIO_EXTRACTION_DEPS=true - # Message client for development - #- MESSAGE_CLIENT_HOST=0.0.0.0 - #- MESSAGE_CLIENT_PORT=7671 - #- MESSAGE_CLIENT_TYPE=simple # Configure the ingest service to use the simple broker - # Message client for production - - MESSAGE_CLIENT_HOST=redis - - MESSAGE_CLIENT_PORT=6379 - - MESSAGE_CLIENT_TYPE=redis - - MINIO_BUCKET=${MINIO_BUCKET:-nv-ingest} - - MRC_IGNORE_NUMA_CHECK=1 - - NEMORETRIEVER_PARSE_HTTP_ENDPOINT=${NEMORETRIEVER_PARSE_HTTP_ENDPOINT:-http://nemoretriever-parse:8000/v1/chat/completions} - - NEMORETRIEVER_PARSE_INFER_PROTOCOL=${NEMORETRIEVER_PARSE_INFER_PROTOCOL:-http} - - NEMORETRIEVER_PARSE_MODEL_NAME=${NEMORETRIEVER_PARSE_MODEL_NAME:-nvidia/nemoretriever-parse} - - NVIDIA_API_KEY=${NVIDIA_API_KEY:-nvidiaapikey} - - NGC_API_KEY=${NGC_API_KEY:-nvidiaapikey} - - NVIDIA_BUILD_API_KEY=${NGC_API_KEY:-nvidiaapikey} - - NV_INGEST_MAX_UTIL=${NV_INGEST_MAX_UTIL:-48} - - OTEL_EXPORTER_OTLP_ENDPOINT=otel-collector:4317 - # Self-hosted ocr endpoints. - - OCR_GRPC_ENDPOINT=${OCR_GRPC_ENDPOINT:-${PADDLE_GRPC_ENDPOINT:-paddle:8001}} - - OCR_HTTP_ENDPOINT=${OCR_HTTP_ENDPOINT:-${PADDLE_HTTP_ENDPOINT:-http://paddle:8000/v1/infer}} - - OCR_INFER_PROTOCOL=${OCR_INFER_PROTOCOL:-${PADDLE_INFER_PROTOCOL:-grpc}} - - OCR_MODEL_NAME=${OCR_MODEL_NAME:-paddle} - # build.nvidia.com hosted ocr endpoints. - #- OCR_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/baidu/paddleocr - #- OCR_INFER_PROTOCOL=http - - READY_CHECK_ALL_COMPONENTS=False - - REDIS_MORPHEUS_TASK_QUEUE=morpheus_task_queue - # Self-hosted redis endpoints. - # build.nvidia.com hosted yolox endpoints. - #- YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2 - #- YOLOX_INFER_PROTOCOL=http - - YOLOX_GRPC_ENDPOINT=${YOLOX_GRPC_ENDPOINT:-page-elements:8001} - - YOLOX_HTTP_ENDPOINT=${YOLOX_HTTP_ENDPOINT:-http://page-elements:8000/v1/infer} - - YOLOX_INFER_PROTOCOL=${YOLOX_INFER_PROTOCOL:-grpc} - # build.nvidia.com hosted yolox-graphics-elements endpoints. 
- #- YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1 - #- YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=http - - YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT=${YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT:-graphic-elements:8001} - - YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=${YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT:-http://graphic-elements:8000/v1/infer} - - YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=${YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL:-grpc} - # build.nvidia.com hosted yolox-table-elements endpoints. - #- YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1 - #- YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=http - - YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT=${YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT:-table-structure:8001} - - YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=${YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT:-http://table-structure:8000/v1/infer} - - YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=${YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL:-grpc} - # Incase of nvidia-hosted caption model, use the endpoint url as - https://integrate.api.nvidia.com/v1/chat/completions - - VLM_CAPTION_ENDPOINT=${VLM_CAPTION_ENDPOINT:-http://vlm-ms:8000/v1/chat/completions} - - VLM_CAPTION_MODEL_NAME=${VLM_CAPTION_MODEL_NAME:-nvidia/llama-3.1-nemotron-nano-vl-8b-v1} - - MODEL_PREDOWNLOAD_PATH=${MODEL_PREDOWNLOAD_PATH:-/workspace/models/} - healthcheck: - test: curl --fail http://nv-ingest-ms-runtime:7670/v1/health/ready || exit 1 - interval: 10s - timeout: 5s - retries: 20 - -networks: - default: - name: nvidia-rag diff --git a/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/docker-compose-rag-server.yaml b/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/docker-compose-rag-server.yaml deleted file mode 100644 index 334623c04..000000000 --- a/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/docker-compose-rag-server.yaml +++ /dev/null @@ -1,186 +0,0 @@ -services: - - # Main orchestrator server which stiches together all calls to different services to fulfill the user request - rag-server: - container_name: rag-server - image: nvcr.io/nvstaging/blueprint/rag-server:${TAG:-2.3.0.rc0} - build: - # Set context to repo's root directory - context: ../../ - dockerfile: src/nvidia_rag/rag_server/Dockerfile - # start the server on port 8081 with 8 workers for improved latency on concurrent requests. - command: --port 8081 --host 0.0.0.0 --workers 8 - volumes: - # Mount the prompt.yaml file to the container, path should be absolute - - ${PROMPT_CONFIG_FILE}:${PROMPT_CONFIG_FILE} - # Common customizations to the pipeline can be controlled using env variables - environment: - # Path to example directory relative to root - EXAMPLE_PATH: './nvidia_rag/rag_server' - - # Absolute path to custom prompt.yaml file - PROMPT_CONFIG_FILE: ${PROMPT_CONFIG_FILE:-/prompt.yaml} - - ##===MINIO specific configurations which is used to store the multimodal base64 content=== - MINIO_ENDPOINT: "minio:9010" - MINIO_ACCESSKEY: "minioadmin" - MINIO_SECRETKEY: "minioadmin" - - ##===Vector DB specific configurations=== - # URL on which vectorstore is hosted - # For custom operators, point to your service (e.g., http://your-custom-vdb:1234) - APP_VECTORSTORE_URL: ${APP_VECTORSTORE_URL:-http://milvus:19530} - # Type of vectordb used to store embedding. Supported built-ins: "milvus", "elasticsearch". - # You can also provide your custom value (e.g., "your_custom_vdb") when you register it in `_get_vdb_op`. 
- APP_VECTORSTORE_NAME: ${APP_VECTORSTORE_NAME:-"milvus"} - # Type of index to be used for vectorstore - APP_VECTORSTORE_INDEXTYPE: ${APP_VECTORSTORE_INDEXTYPE:-"GPU_CAGRA"} - # Type of vectordb search to be used - APP_VECTORSTORE_SEARCHTYPE: ${APP_VECTORSTORE_SEARCHTYPE:-"dense"} # Can be dense or hybrid - # vectorstore collection name to store embeddings - COLLECTION_NAME: ${COLLECTION_NAME:-multimodal_data} - APP_RETRIEVER_SCORETHRESHOLD: 0.25 - # Top K from vector DB, which goes as input to reranker model if enabled, else goes to LLM prompt - VECTOR_DB_TOPK: ${VECTOR_DB_TOPK:-100} - - ##===LLM Model specific configurations=== - APP_LLM_MODELNAME: ${APP_LLM_MODELNAME:-"nvidia/llama-3_3-nemotron-super-49b-v1_5"} - # url on which llm model is hosted. If "", Nvidia hosted API is used - APP_LLM_SERVERURL: ${APP_LLM_SERVERURL-"nim-llm:8000"} - - ##===Query Rewriter Model specific configurations=== - APP_QUERYREWRITER_MODELNAME: ${APP_QUERYREWRITER_MODELNAME:-"meta/llama-3.1-8b-instruct"} - # url on which query rewriter model is hosted. If "", Nvidia hosted API is used - APP_QUERYREWRITER_SERVERURL: ${APP_QUERYREWRITER_SERVERURL-"nim-llm-llama-8b:8000"} - - ##===Filter Expression Generator Model specific configurations=== - APP_FILTEREXPRESSIONGENERATOR_MODELNAME: ${APP_FILTEREXPRESSIONGENERATOR_MODELNAME:-"nvidia/llama-3_3-nemotron-super-49b-v1_5"} - # url on which filter expression generator model is hosted. If "", Nvidia hosted API is used - APP_FILTEREXPRESSIONGENERATOR_SERVERURL: ${APP_FILTEREXPRESSIONGENERATOR_SERVERURL-"nim-llm:8000"} - # enable filter expression generator for natural language to filter expression conversion - ENABLE_FILTER_GENERATOR: ${ENABLE_FILTER_GENERATOR:-False} - - ##===Embedding Model specific configurations=== - # url on which embedding model is hosted. If "", Nvidia hosted API is used - APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000"} - APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2} - # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1) - # APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-vlm-embedding-ms:8000"} - # APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1} - - ##===Reranking Model specific configurations=== - # url on which ranking model is hosted. 
If "", Nvidia hosted API is used - APP_RANKING_SERVERURL: ${APP_RANKING_SERVERURL-"nemoretriever-ranking-ms:8000"} - APP_RANKING_MODELNAME: ${APP_RANKING_MODELNAME:-"nvidia/llama-3.2-nv-rerankqa-1b-v2"} - ENABLE_RERANKER: ${ENABLE_RERANKER:-True} - # Default confidence threshold for filtering documents by reranker relevance scores (0.0 to 1.0) - RERANKER_CONFIDENCE_THRESHOLD: ${RERANKER_CONFIDENCE_THRESHOLD:-0.0} - - ##===VLM Model specific configurations=== - ENABLE_VLM_INFERENCE: ${ENABLE_VLM_INFERENCE:-false} - # Reasoning gate on VLM response: off by default; enable to mitigate incorrect VLM outputs - ENABLE_VLM_RESPONSE_REASONING: ${ENABLE_VLM_RESPONSE_REASONING:-false} - # Max images sent to VLM per request (query + context) - APP_VLM_MAX_TOTAL_IMAGES: ${APP_VLM_MAX_TOTAL_IMAGES:-4} - # Max number of query images to include in VLM input - APP_VLM_MAX_QUERY_IMAGES: ${APP_VLM_MAX_QUERY_IMAGES:-1} - # Max number of context images to include in VLM input - APP_VLM_MAX_CONTEXT_IMAGES: ${APP_VLM_MAX_CONTEXT_IMAGES:-1} - APP_VLM_SERVERURL: ${APP_VLM_SERVERURL-"http://vlm-ms:8000/v1"} - APP_VLM_MODELNAME: ${APP_VLM_MODELNAME:-"nvidia/llama-3.1-nemotron-nano-vl-8b-v1"} - - NVIDIA_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"} - - # Number of document chunks to insert in LLM prompt, used only when ENABLE_RERANKER is set to True - APP_RETRIEVER_TOPK: ${APP_RETRIEVER_TOPK:-10} - - # Log level for server, supported level NOTSET, DEBUG, INFO, WARN, ERROR, CRITICAL - LOGLEVEL: ${LOGLEVEL:-INFO} - - # enable multi-turn conversation in the rag chain - this controls conversation history usage - # while doing query rewriting and in LLM prompt - ENABLE_MULTITURN: ${ENABLE_MULTITURN:-True} - - # enable query rewriting for multiturn conversation in the rag chain. 
- # This will improve accuracy of the retrieiver pipeline but increase latency due to an additional LLM call - ENABLE_QUERYREWRITER: ${ENABLE_QUERYREWRITER:-False} - - # Choose whether to enable citations in the response - ENABLE_CITATIONS: ${ENABLE_CITATIONS:-True} - - # Choose whether to enable/disable guardrails - ENABLE_GUARDRAILS: ${ENABLE_GUARDRAILS:-False} - - # NeMo Guardrails URL when ENABLE_GUARDRAILS is true - NEMO_GUARDRAILS_URL: ${NEMO_GUARDRAILS_URL:-nemo-guardrails-microservice:7331} - - # number of last n chat messages to consider from the provided conversation history - CONVERSATION_HISTORY: 5 - - # Tracing - APP_TRACING_ENABLED: "False" - # HTTP endpoint - APP_TRACING_OTLPHTTPENDPOINT: http://otel-collector:4318/v1/traces - # GRPC endpoint - APP_TRACING_OTLPGRPCENDPOINT: grpc://otel-collector:4317 - - # Choose whether to enable source metadata in document content during generation - ENABLE_SOURCE_METADATA: ${ENABLE_SOURCE_METADATA:-true} - - # Whether to filter content within tags in model responses - FILTER_THINK_TOKENS: ${FILTER_THINK_TOKENS:-true} - - # Whether to enable thinking in the rag chain for llama-3.3-nemotron-super-49b model - ENABLE_NEMOTRON_THINKING: ${ENABLE_NEMOTRON_THINKING:-false} - - # enable reflection (context relevance and response groundedness checking) in the rag chain - ENABLE_REFLECTION: ${ENABLE_REFLECTION:-false} - # Maximum number of context relevance loop iterations - MAX_REFLECTION_LOOP: ${MAX_REFLECTION_LOOP:-3} - # Minimum relevance score threshold (0-2) - CONTEXT_RELEVANCE_THRESHOLD: ${CONTEXT_RELEVANCE_THRESHOLD:-1} - # Minimum groundedness score threshold (0-2) - RESPONSE_GROUNDEDNESS_THRESHOLD: ${RESPONSE_GROUNDEDNESS_THRESHOLD:-1} - # reflection llm - REFLECTION_LLM: ${REFLECTION_LLM:-"nvidia/llama-3_3-nemotron-super-49b-v1_5"} - # reflection llm server url. 
If "", Nvidia hosted API is used - REFLECTION_LLM_SERVERURL: ${REFLECTION_LLM_SERVERURL-"nim-llm:8000"} - # enable iterative query decomposition - ENABLE_QUERY_DECOMPOSITION: ${ENABLE_QUERY_DECOMPOSITION:-false} - # maximum recursion depth for iterative query decomposition - MAX_RECURSION_DEPTH: ${MAX_RECURSION_DEPTH:-3} - - ports: - - "8081:8081" - expose: - - "8081" - shm_size: 5gb - - # Sample UI container which interacts with APIs exposed by rag-server container - rag-playground: - container_name: rag-playground - image: nvcr.io/nvstaging/blueprint/rag-playground:${TAG:-2.3.0.rc0} - build: - # Set context to repo's root directory - context: ../../frontend - dockerfile: ./Dockerfile - args: - # Environment variables for Vite build - VITE_API_CHAT_URL: ${VITE_API_CHAT_URL:-http://rag-server:8081/v1} - VITE_API_VDB_URL: ${VITE_API_VDB_URL:-http://ingestor-server:8082/v1} - VITE_MILVUS_URL: http://milvus:19530 - ports: - - "8090:3000" - expose: - - "3000" - environment: - # Runtime environment variables for Vite - VITE_API_CHAT_URL: ${VITE_API_CHAT_URL:-http://rag-server:8081/v1} - VITE_API_VDB_URL: ${VITE_API_VDB_URL:-http://ingestor-server:8082/v1} - VITE_MILVUS_URL: http://milvus:19530 - depends_on: - - rag-server - -networks: - default: - name: nvidia-rag diff --git a/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/vectordb.yaml b/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/vectordb.yaml deleted file mode 100644 index ed9bf8403..000000000 --- a/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/deploy/vectordb.yaml +++ /dev/null @@ -1,102 +0,0 @@ -services: - - # Milvus can be made GPU accelerated by uncommenting the lines as specified below - milvus: - container_name: milvus-standalone - image: milvusdb/milvus:${MILVUS_VERSION:-v2.6.0-gpu} # milvusdb/milvus:v2.6.0 for CPU - command: ["milvus", "run", "standalone"] - environment: - ETCD_ENDPOINTS: etcd:2379 - MINIO_ADDRESS: minio:9010 - KNOWHERE_GPU_MEM_POOL_SIZE: 2048;4096 - volumes: - - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus - # healthcheck: - # test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"] - # interval: 30s - # start_period: 90s - # timeout: 20s - # retries: 3 - ports: - - "19530:19530" - - "9091:9091" - depends_on: - - "etcd" - - "minio" - # Comment out this section if CPU based image is used and set below env variables to False - # export APP_VECTORSTORE_ENABLEGPUSEARCH=False - # export APP_VECTORSTORE_ENABLEGPUINDEX=False - deploy: - resources: - reservations: - devices: - - driver: nvidia - capabilities: ["gpu"] - # count: ${INFERENCE_GPU_COUNT:-all} - device_ids: ['${VECTORSTORE_GPU_DEVICE_ID:-0}'] - profiles: ["", "milvus"] - - etcd: - container_name: milvus-etcd - image: quay.io/coreos/etcd:v3.6.4 - environment: - - ETCD_AUTO_COMPACTION_MODE=revision - - ETCD_AUTO_COMPACTION_RETENTION=1000 - - ETCD_QUOTA_BACKEND_BYTES=4294967296 - - ETCD_SNAPSHOT_COUNT=50000 - volumes: - - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd - command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd - healthcheck: - test: ["CMD", "etcdctl", "endpoint", "health"] - interval: 30s - timeout: 20s - retries: 3 - profiles: ["", "milvus"] - - minio: - container_name: milvus-minio - image: minio/minio:RELEASE.2025-07-23T15-54-02Z - environment: - MINIO_ACCESS_KEY: minioadmin - MINIO_SECRET_KEY: minioadmin - ports: - - "9011:9011" - - "9010:9010" - volumes: - - 
${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data - command: minio server /minio_data --console-address ":9011" --address ":9010" - healthcheck: - test: ["CMD", "curl", "-f", "http://localhost:9010/minio/health/live"] - interval: 30s - timeout: 20s - retries: 3 - profiles: ["", "milvus", "elasticsearch", "minio"] - - elasticsearch: - container_name: elasticsearch - image: "docker.elastic.co/elasticsearch/elasticsearch:9.0.3" - ports: - - 9200:9200 - volumes: - # Run "sudo chown -R 1000:1000 deploy/compose/volumes/elasticsearch/" to fix permissions - - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/elasticsearch:/usr/share/elasticsearch/data - restart: on-failure - environment: - - discovery.type=single-node - - "ES_JAVA_OPTS=-Xms1024m -Xmx1024m" - - xpack.security.enabled=false - - xpack.license.self_generated.type=basic - - network.host=0.0.0.0 - - cluster.routing.allocation.disk.threshold_enabled=false - hostname: elasticsearch - healthcheck: - test: ["CMD", "curl", "-s", "-f", "http://localhost:9200/_cat/health"] - interval: 10s - timeout: 1s - retries: 10 - profiles: ["elasticsearch"] - -networks: - default: - name: nvidia-rag \ No newline at end of file diff --git a/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/register.py b/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/register.py deleted file mode 100644 index cbdb8f3aa..000000000 --- a/examples/rag_library_mode/rag_library_mode/src/rag_library_mode/register.py +++ /dev/null @@ -1,4 +0,0 @@ -# flake8: noqa - -# Import any tools which need to be automatically registered here -from rag_library_mode import rag_library_mode_function \ No newline at end of file diff --git a/examples/rag_library_mode/src/rag_library_mode/__init__.py b/examples/rag_library_mode/src/rag_library_mode/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/examples/rag_library_mode/src/rag_library_mode/configs/config.yml b/examples/rag_library_mode/src/rag_library_mode/configs/config.yml deleted file mode 100644 index 768c99859..000000000 --- a/examples/rag_library_mode/src/rag_library_mode/configs/config.yml +++ /dev/null @@ -1,29 +0,0 @@ -general: - use_uvloop: true - - -functions: - rag_tool: - _type: rag_library_mode - base_url: "http://localhost:19530" - vdb_top_k: 10 - reranker_top_k: 100 - collection_names: ["cuda_docs"] - topic: Retrieve relevant documents from the database relevant to the query - - -llms: - nim_llm: - _type: nim - model_name: meta/llama-3.3-70b-instruct - temperature: 0 - max_tokens: 4096 - top_p: 1 - - -workflow: - _type: tool_calling_agent - tool_names: - - rag_tool - verbose: true - llm_name: nim_llm diff --git a/examples/rag_library_mode/src/rag_library_mode/deploy/docker-compose-ingestor-server.yaml b/examples/rag_library_mode/src/rag_library_mode/deploy/docker-compose-ingestor-server.yaml deleted file mode 100644 index f13ab4a83..000000000 --- a/examples/rag_library_mode/src/rag_library_mode/deploy/docker-compose-ingestor-server.yaml +++ /dev/null @@ -1,211 +0,0 @@ -services: - - # Main ingestor server which is responsible for ingestion - ingestor-server: - container_name: ingestor-server - image: nvcr.io/nvstaging/blueprint/ingestor-server:${TAG:-2.3.0.rc0} - build: - # Set context to repo's root directory - context: ../../ - dockerfile: ./src/nvidia_rag/ingestor_server/Dockerfile - # start the server on port 8082 with 4 workers for improved latency on concurrent requests. 
- command: --port 8082 --host 0.0.0.0 --workers 1 - - volumes: - # Mount the prompt.yaml file to the container, path should be absolute - - ${PROMPT_CONFIG_FILE}:${PROMPT_CONFIG_FILE} - - # Common customizations to the pipeline can be controlled using env variables - environment: - # Path to example directory relative to root - EXAMPLE_PATH: 'src/nvidia_rag/ingestor_server' - - # Absolute path to custom prompt.yaml file - PROMPT_CONFIG_FILE: ${PROMPT_CONFIG_FILE:-/prompt.yaml} - - ##===Vector DB specific configurations=== - # URL on which vectorstore is hosted - # For custom operators, point to your service (e.g., http://your-custom-vdb:1234) - APP_VECTORSTORE_URL: ${APP_VECTORSTORE_URL:-http://milvus:19530} - # Type of vectordb used to store embedding. Supported built-ins: "milvus", "elasticsearch". - # You can also provide your custom value (e.g., "your_custom_vdb") when you register it in `_get_vdb_op`. - APP_VECTORSTORE_NAME: ${APP_VECTORSTORE_NAME:-"milvus"} - # Type of vectordb search to be used - APP_VECTORSTORE_SEARCHTYPE: ${APP_VECTORSTORE_SEARCHTYPE:-"dense"} # Can be dense or hybrid - # Boolean to enable GPU index for milvus vectorstore specific to nvingest - APP_VECTORSTORE_ENABLEGPUINDEX: ${APP_VECTORSTORE_ENABLEGPUINDEX:-True} - # Boolean to control GPU search for milvus vectorstore specific to nvingest - APP_VECTORSTORE_ENABLEGPUSEARCH: ${APP_VECTORSTORE_ENABLEGPUSEARCH:-True} - # vectorstore collection name to store embeddings - COLLECTION_NAME: ${COLLECTION_NAME:-multimodal_data} - - ##===MINIO specific configurations=== - MINIO_ENDPOINT: "minio:9010" - MINIO_ACCESSKEY: "minioadmin" - MINIO_SECRETKEY: "minioadmin" - - NGC_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"} - NVIDIA_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"} - - ##===Embedding Model specific configurations=== - # url on which embedding model is hosted. 
If "", Nvidia hosted API is used - APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000"} - APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2} - # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1) - # APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-vlm-embedding-ms:8000"} - # APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1} - APP_EMBEDDINGS_DIMENSIONS: ${APP_EMBEDDINGS_DIMENSIONS:-2048} - - ##===NV-Ingest Connection Configurations======= - APP_NVINGEST_MESSAGECLIENTHOSTNAME: ${APP_NVINGEST_MESSAGECLIENTHOSTNAME:-"nv-ingest-ms-runtime"} - APP_NVINGEST_MESSAGECLIENTPORT: ${APP_NVINGEST_MESSAGECLIENTPORT:-7670} - - ##===NV-Ingest Extract Configurations========== - APP_NVINGEST_EXTRACTTEXT: ${APP_NVINGEST_EXTRACTTEXT:-True} - APP_NVINGEST_EXTRACTINFOGRAPHICS: ${APP_NVINGEST_EXTRACTINFOGRAPHICS:-False} - APP_NVINGEST_EXTRACTTABLES: ${APP_NVINGEST_EXTRACTTABLES:-True} - APP_NVINGEST_EXTRACTCHARTS: ${APP_NVINGEST_EXTRACTCHARTS:-True} - APP_NVINGEST_EXTRACTIMAGES: ${APP_NVINGEST_EXTRACTIMAGES:-False} - APP_NVINGEST_EXTRACTPAGEASIMAGE: ${APP_NVINGEST_EXTRACTPAGEASIMAGE:-False} - APP_NVINGEST_STRUCTURED_ELEMENTS_MODALITY: ${APP_NVINGEST_STRUCTURED_ELEMENTS_MODALITY:-""} # Select from "image", "text_image" - APP_NVINGEST_IMAGE_ELEMENTS_MODALITY: ${APP_NVINGEST_IMAGE_ELEMENTS_MODALITY:-""} # Select from "image" - APP_NVINGEST_PDFEXTRACTMETHOD: ${APP_NVINGEST_PDFEXTRACTMETHOD:-None} # Select from pdfium, nemoretriever_parse, None - # Extract text by "page" only recommended for documents with pages like .pdf, .docx, etc. - APP_NVINGEST_TEXTDEPTH: ${APP_NVINGEST_TEXTDEPTH:-page} # extract by "page" or "document" - - ##===NV-Ingest Splitting Configurations======== - APP_NVINGEST_CHUNKSIZE: ${APP_NVINGEST_CHUNKSIZE:-512} - APP_NVINGEST_CHUNKOVERLAP: ${APP_NVINGEST_CHUNKOVERLAP:-150} - APP_NVINGEST_ENABLEPDFSPLITTER: ${APP_NVINGEST_ENABLEPDFSPLITTER:-True} - APP_NVINGEST_SEGMENTAUDIO: ${APP_NVINGEST_SEGMENTAUDIO:-False} # Enable audio segmentation for NV Ingest - - ##===NV-Ingest Caption Model configurations==== - APP_NVINGEST_CAPTIONMODELNAME: ${APP_NVINGEST_CAPTIONMODELNAME:-"nvidia/llama-3.1-nemotron-nano-vl-8b-v1"} - # Incase of nvidia-hosted caption model, use the endpoint url as - https://integrate.api.nvidia.com/v1 - APP_NVINGEST_CAPTIONENDPOINTURL: ${APP_NVINGEST_CAPTIONENDPOINTURL:-"http://vlm-ms:8000/v1/chat/completions"} - - # Choose whether to store the extracted content in the vector store for citation support - ENABLE_CITATIONS: ${ENABLE_CITATIONS:-True} - - # Choose the summary model to use for document summary - SUMMARY_LLM: ${SUMMARY_LLM:-nvidia/llama-3_3-nemotron-super-49b-v1_5} - SUMMARY_LLM_SERVERURL: ${SUMMARY_LLM_SERVERURL-"nim-llm:8000"} - SUMMARY_LLM_MAX_CHUNK_LENGTH: ${SUMMARY_LLM_MAX_CHUNK_LENGTH:-50000} - SUMMARY_CHUNK_OVERLAP: ${SUMMARY_CHUNK_OVERLAP:-200} - # Log level for server, supported level NOTSET, DEBUG, INFO, WARN, ERROR, CRITICAL - LOGLEVEL: ${LOGLEVEL:-INFO} - - # [Optional] Redis configuration for task status and result storage - REDIS_HOST: ${REDIS_HOST:-redis} - REDIS_PORT: ${REDIS_PORT:-6379} - REDIS_DB: ${REDIS_DB:-0} - - # Bulk upload to MinIO - ENABLE_MINIO_BULK_UPLOAD: ${ENABLE_MINIO_BULK_UPLOAD:-True} - TEMP_DIR: ${TEMP_DIR:-/tmp-data} - - # NV-Ingest Batch Mode Configurations - NV_INGEST_FILES_PER_BATCH: ${NV_INGEST_FILES_PER_BATCH:-16} - NV_INGEST_CONCURRENT_BATCHES: ${NV_INGEST_CONCURRENT_BATCHES:-4} - - 
ports: - - "8082:8082" - expose: - - "8082" - shm_size: 5gb - - redis: - image: "redis/redis-stack:7.2.0-v18" - ports: - - "6379:6379" - - nv-ingest-ms-runtime: - image: nvcr.io/nvstaging/nim/nv-ingest:25.8.0-RC6 - cpuset: "0-15" - volumes: - - ${DATASET_ROOT:-./data}:/workspace/data - ports: - # HTTP API - - "7670:7670" - # Simple Broker - - "7671:7671" - cap_add: - - sys_nice - environment: - # Audio model not used in this RAG version - - AUDIO_GRPC_ENDPOINT=audio:50051 - - AUDIO_INFER_PROTOCOL=grpc - - CUDA_VISIBLE_DEVICES=0 - - MAX_INGEST_PROCESS_WORKERS=${MAX_INGEST_PROCESS_WORKERS:-16} - - EMBEDDING_NIM_MODEL_NAME=${EMBEDDING_NIM_MODEL_NAME:-${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2}} - # Incase of self-hosted embedding model, use the endpoint url as - https://integrate.api.nvidia.com/v1 - - EMBEDDING_NIM_ENDPOINT=${EMBEDDING_NIM_ENDPOINT:-${APP_EMBEDDINGS_SERVERURL-http://nemoretriever-embedding-ms:8000/v1}} - # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1) - # - EMBEDDING_NIM_MODEL_NAME=${EMBEDDING_NIM_MODEL_NAME:-${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1}} - # - EMBEDDING_NIM_ENDPOINT=${EMBEDDING_NIM_ENDPOINT:-${APP_EMBEDDINGS_SERVERURL-http://nemoretriever-vlm-embedding-ms:8000/v1}} - - INGEST_LOG_LEVEL=WARNING - - INGEST_RAY_LOG_LEVEL=PRODUCTION - - INGEST_EDGE_BUFFER_SIZE=64 - - INGEST_DYNAMIC_MEMORY_THRESHOLD=0.8 - - INGEST_DISABLE_DYNAMIC_SCALING=${INGEST_DISABLE_DYNAMIC_SCALING:-True} - - INSTALL_AUDIO_EXTRACTION_DEPS=true - # Message client for development - #- MESSAGE_CLIENT_HOST=0.0.0.0 - #- MESSAGE_CLIENT_PORT=7671 - #- MESSAGE_CLIENT_TYPE=simple # Configure the ingest service to use the simple broker - # Message client for production - - MESSAGE_CLIENT_HOST=redis - - MESSAGE_CLIENT_PORT=6379 - - MESSAGE_CLIENT_TYPE=redis - - MINIO_BUCKET=${MINIO_BUCKET:-nv-ingest} - - MRC_IGNORE_NUMA_CHECK=1 - - NEMORETRIEVER_PARSE_HTTP_ENDPOINT=${NEMORETRIEVER_PARSE_HTTP_ENDPOINT:-http://nemoretriever-parse:8000/v1/chat/completions} - - NEMORETRIEVER_PARSE_INFER_PROTOCOL=${NEMORETRIEVER_PARSE_INFER_PROTOCOL:-http} - - NEMORETRIEVER_PARSE_MODEL_NAME=${NEMORETRIEVER_PARSE_MODEL_NAME:-nvidia/nemoretriever-parse} - - NVIDIA_API_KEY=${NVIDIA_API_KEY:-nvidiaapikey} - - NGC_API_KEY=${NGC_API_KEY:-nvidiaapikey} - - NVIDIA_BUILD_API_KEY=${NGC_API_KEY:-nvidiaapikey} - - NV_INGEST_MAX_UTIL=${NV_INGEST_MAX_UTIL:-48} - - OTEL_EXPORTER_OTLP_ENDPOINT=otel-collector:4317 - # Self-hosted ocr endpoints. - - OCR_GRPC_ENDPOINT=${OCR_GRPC_ENDPOINT:-${PADDLE_GRPC_ENDPOINT:-paddle:8001}} - - OCR_HTTP_ENDPOINT=${OCR_HTTP_ENDPOINT:-${PADDLE_HTTP_ENDPOINT:-http://paddle:8000/v1/infer}} - - OCR_INFER_PROTOCOL=${OCR_INFER_PROTOCOL:-${PADDLE_INFER_PROTOCOL:-grpc}} - - OCR_MODEL_NAME=${OCR_MODEL_NAME:-paddle} - # build.nvidia.com hosted ocr endpoints. - #- OCR_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/baidu/paddleocr - #- OCR_INFER_PROTOCOL=http - - READY_CHECK_ALL_COMPONENTS=False - - REDIS_MORPHEUS_TASK_QUEUE=morpheus_task_queue - # Self-hosted redis endpoints. - # build.nvidia.com hosted yolox endpoints. - #- YOLOX_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-page-elements-v2 - #- YOLOX_INFER_PROTOCOL=http - - YOLOX_GRPC_ENDPOINT=${YOLOX_GRPC_ENDPOINT:-page-elements:8001} - - YOLOX_HTTP_ENDPOINT=${YOLOX_HTTP_ENDPOINT:-http://page-elements:8000/v1/infer} - - YOLOX_INFER_PROTOCOL=${YOLOX_INFER_PROTOCOL:-grpc} - # build.nvidia.com hosted yolox-graphics-elements endpoints. 
-      #- YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-graphic-elements-v1
-      #- YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=http
-      - YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT=${YOLOX_GRAPHIC_ELEMENTS_GRPC_ENDPOINT:-graphic-elements:8001}
-      - YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT=${YOLOX_GRAPHIC_ELEMENTS_HTTP_ENDPOINT:-http://graphic-elements:8000/v1/infer}
-      - YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL=${YOLOX_GRAPHIC_ELEMENTS_INFER_PROTOCOL:-grpc}
-      # build.nvidia.com hosted yolox-table-elements endpoints.
-      #- YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=https://ai.api.nvidia.com/v1/cv/nvidia/nemoretriever-table-structure-v1
-      #- YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=http
-      - YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT=${YOLOX_TABLE_STRUCTURE_GRPC_ENDPOINT:-table-structure:8001}
-      - YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT=${YOLOX_TABLE_STRUCTURE_HTTP_ENDPOINT:-http://table-structure:8000/v1/infer}
-      - YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL=${YOLOX_TABLE_STRUCTURE_INFER_PROTOCOL:-grpc}
-      # Incase of nvidia-hosted caption model, use the endpoint url as - https://integrate.api.nvidia.com/v1/chat/completions
-      - VLM_CAPTION_ENDPOINT=${VLM_CAPTION_ENDPOINT:-http://vlm-ms:8000/v1/chat/completions}
-      - VLM_CAPTION_MODEL_NAME=${VLM_CAPTION_MODEL_NAME:-nvidia/llama-3.1-nemotron-nano-vl-8b-v1}
-      - MODEL_PREDOWNLOAD_PATH=${MODEL_PREDOWNLOAD_PATH:-/workspace/models/}
-    healthcheck:
-      test: curl --fail http://nv-ingest-ms-runtime:7670/v1/health/ready || exit 1
-      interval: 10s
-      timeout: 5s
-      retries: 20
-
-networks:
-  default:
-    name: nvidia-rag
diff --git a/examples/rag_library_mode/src/rag_library_mode/deploy/docker-compose-rag-server.yaml b/examples/rag_library_mode/src/rag_library_mode/deploy/docker-compose-rag-server.yaml
deleted file mode 100644
index 334623c04..000000000
--- a/examples/rag_library_mode/src/rag_library_mode/deploy/docker-compose-rag-server.yaml
+++ /dev/null
@@ -1,186 +0,0 @@
-services:
-
-  # Main orchestrator server which stiches together all calls to different services to fulfill the user request
-  rag-server:
-    container_name: rag-server
-    image: nvcr.io/nvstaging/blueprint/rag-server:${TAG:-2.3.0.rc0}
-    build:
-      # Set context to repo's root directory
-      context: ../../
-      dockerfile: src/nvidia_rag/rag_server/Dockerfile
-    # start the server on port 8081 with 8 workers for improved latency on concurrent requests.
-    command: --port 8081 --host 0.0.0.0 --workers 8
-    volumes:
-      # Mount the prompt.yaml file to the container, path should be absolute
-      - ${PROMPT_CONFIG_FILE}:${PROMPT_CONFIG_FILE}
-    # Common customizations to the pipeline can be controlled using env variables
-    environment:
-      # Path to example directory relative to root
-      EXAMPLE_PATH: './nvidia_rag/rag_server'
-
-      # Absolute path to custom prompt.yaml file
-      PROMPT_CONFIG_FILE: ${PROMPT_CONFIG_FILE:-/prompt.yaml}
-
-      ##===MINIO specific configurations which is used to store the multimodal base64 content===
-      MINIO_ENDPOINT: "minio:9010"
-      MINIO_ACCESSKEY: "minioadmin"
-      MINIO_SECRETKEY: "minioadmin"
-
-      ##===Vector DB specific configurations===
-      # URL on which vectorstore is hosted
-      # For custom operators, point to your service (e.g., http://your-custom-vdb:1234)
-      APP_VECTORSTORE_URL: ${APP_VECTORSTORE_URL:-http://milvus:19530}
-      # Type of vectordb used to store embedding. Supported built-ins: "milvus", "elasticsearch".
-      # You can also provide your custom value (e.g., "your_custom_vdb") when you register it in `_get_vdb_op`.
-      APP_VECTORSTORE_NAME: ${APP_VECTORSTORE_NAME:-"milvus"}
-      # Type of index to be used for vectorstore
-      APP_VECTORSTORE_INDEXTYPE: ${APP_VECTORSTORE_INDEXTYPE:-"GPU_CAGRA"}
-      # Type of vectordb search to be used
-      APP_VECTORSTORE_SEARCHTYPE: ${APP_VECTORSTORE_SEARCHTYPE:-"dense"} # Can be dense or hybrid
-      # vectorstore collection name to store embeddings
-      COLLECTION_NAME: ${COLLECTION_NAME:-multimodal_data}
-      APP_RETRIEVER_SCORETHRESHOLD: 0.25
-      # Top K from vector DB, which goes as input to reranker model if enabled, else goes to LLM prompt
-      VECTOR_DB_TOPK: ${VECTOR_DB_TOPK:-100}
-
-      ##===LLM Model specific configurations===
-      APP_LLM_MODELNAME: ${APP_LLM_MODELNAME:-"nvidia/llama-3_3-nemotron-super-49b-v1_5"}
-      # url on which llm model is hosted. If "", Nvidia hosted API is used
-      APP_LLM_SERVERURL: ${APP_LLM_SERVERURL-"nim-llm:8000"}
-
-      ##===Query Rewriter Model specific configurations===
-      APP_QUERYREWRITER_MODELNAME: ${APP_QUERYREWRITER_MODELNAME:-"meta/llama-3.1-8b-instruct"}
-      # url on which query rewriter model is hosted. If "", Nvidia hosted API is used
-      APP_QUERYREWRITER_SERVERURL: ${APP_QUERYREWRITER_SERVERURL-"nim-llm-llama-8b:8000"}
-
-      ##===Filter Expression Generator Model specific configurations===
-      APP_FILTEREXPRESSIONGENERATOR_MODELNAME: ${APP_FILTEREXPRESSIONGENERATOR_MODELNAME:-"nvidia/llama-3_3-nemotron-super-49b-v1_5"}
-      # url on which filter expression generator model is hosted. If "", Nvidia hosted API is used
-      APP_FILTEREXPRESSIONGENERATOR_SERVERURL: ${APP_FILTEREXPRESSIONGENERATOR_SERVERURL-"nim-llm:8000"}
-      # enable filter expression generator for natural language to filter expression conversion
-      ENABLE_FILTER_GENERATOR: ${ENABLE_FILTER_GENERATOR:-False}
-
-      ##===Embedding Model specific configurations===
-      # url on which embedding model is hosted. If "", Nvidia hosted API is used
-      APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-embedding-ms:8000"}
-      APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nv-embedqa-1b-v2}
-      # For VLM Embedding Model (Nemoretriever-1b-vlm-embed-v1)
-      # APP_EMBEDDINGS_SERVERURL: ${APP_EMBEDDINGS_SERVERURL-"nemoretriever-vlm-embedding-ms:8000"}
-      # APP_EMBEDDINGS_MODELNAME: ${APP_EMBEDDINGS_MODELNAME:-nvidia/llama-3.2-nemoretriever-1b-vlm-embed-v1}
-
-      ##===Reranking Model specific configurations===
-      # url on which ranking model is hosted. If "", Nvidia hosted API is used
If "", Nvidia hosted API is used - APP_RANKING_SERVERURL: ${APP_RANKING_SERVERURL-"nemoretriever-ranking-ms:8000"} - APP_RANKING_MODELNAME: ${APP_RANKING_MODELNAME:-"nvidia/llama-3.2-nv-rerankqa-1b-v2"} - ENABLE_RERANKER: ${ENABLE_RERANKER:-True} - # Default confidence threshold for filtering documents by reranker relevance scores (0.0 to 1.0) - RERANKER_CONFIDENCE_THRESHOLD: ${RERANKER_CONFIDENCE_THRESHOLD:-0.0} - - ##===VLM Model specific configurations=== - ENABLE_VLM_INFERENCE: ${ENABLE_VLM_INFERENCE:-false} - # Reasoning gate on VLM response: off by default; enable to mitigate incorrect VLM outputs - ENABLE_VLM_RESPONSE_REASONING: ${ENABLE_VLM_RESPONSE_REASONING:-false} - # Max images sent to VLM per request (query + context) - APP_VLM_MAX_TOTAL_IMAGES: ${APP_VLM_MAX_TOTAL_IMAGES:-4} - # Max number of query images to include in VLM input - APP_VLM_MAX_QUERY_IMAGES: ${APP_VLM_MAX_QUERY_IMAGES:-1} - # Max number of context images to include in VLM input - APP_VLM_MAX_CONTEXT_IMAGES: ${APP_VLM_MAX_CONTEXT_IMAGES:-1} - APP_VLM_SERVERURL: ${APP_VLM_SERVERURL-"http://vlm-ms:8000/v1"} - APP_VLM_MODELNAME: ${APP_VLM_MODELNAME:-"nvidia/llama-3.1-nemotron-nano-vl-8b-v1"} - - NVIDIA_API_KEY: ${NGC_API_KEY:?"NGC_API_KEY is required"} - - # Number of document chunks to insert in LLM prompt, used only when ENABLE_RERANKER is set to True - APP_RETRIEVER_TOPK: ${APP_RETRIEVER_TOPK:-10} - - # Log level for server, supported level NOTSET, DEBUG, INFO, WARN, ERROR, CRITICAL - LOGLEVEL: ${LOGLEVEL:-INFO} - - # enable multi-turn conversation in the rag chain - this controls conversation history usage - # while doing query rewriting and in LLM prompt - ENABLE_MULTITURN: ${ENABLE_MULTITURN:-True} - - # enable query rewriting for multiturn conversation in the rag chain. 
-      # This will improve accuracy of the retrieiver pipeline but increase latency due to an additional LLM call
-      ENABLE_QUERYREWRITER: ${ENABLE_QUERYREWRITER:-False}
-
-      # Choose whether to enable citations in the response
-      ENABLE_CITATIONS: ${ENABLE_CITATIONS:-True}
-
-      # Choose whether to enable/disable guardrails
-      ENABLE_GUARDRAILS: ${ENABLE_GUARDRAILS:-False}
-
-      # NeMo Guardrails URL when ENABLE_GUARDRAILS is true
-      NEMO_GUARDRAILS_URL: ${NEMO_GUARDRAILS_URL:-nemo-guardrails-microservice:7331}
-
-      # number of last n chat messages to consider from the provided conversation history
-      CONVERSATION_HISTORY: 5
-
-      # Tracing
-      APP_TRACING_ENABLED: "False"
-      # HTTP endpoint
-      APP_TRACING_OTLPHTTPENDPOINT: http://otel-collector:4318/v1/traces
-      # GRPC endpoint
-      APP_TRACING_OTLPGRPCENDPOINT: grpc://otel-collector:4317
-
-      # Choose whether to enable source metadata in document content during generation
-      ENABLE_SOURCE_METADATA: ${ENABLE_SOURCE_METADATA:-true}
-
-      # Whether to filter content within tags in model responses
-      FILTER_THINK_TOKENS: ${FILTER_THINK_TOKENS:-true}
-
-      # Whether to enable thinking in the rag chain for llama-3.3-nemotron-super-49b model
-      ENABLE_NEMOTRON_THINKING: ${ENABLE_NEMOTRON_THINKING:-false}
-
-      # enable reflection (context relevance and response groundedness checking) in the rag chain
-      ENABLE_REFLECTION: ${ENABLE_REFLECTION:-false}
-      # Maximum number of context relevance loop iterations
-      MAX_REFLECTION_LOOP: ${MAX_REFLECTION_LOOP:-3}
-      # Minimum relevance score threshold (0-2)
-      CONTEXT_RELEVANCE_THRESHOLD: ${CONTEXT_RELEVANCE_THRESHOLD:-1}
-      # Minimum groundedness score threshold (0-2)
-      RESPONSE_GROUNDEDNESS_THRESHOLD: ${RESPONSE_GROUNDEDNESS_THRESHOLD:-1}
-      # reflection llm
-      REFLECTION_LLM: ${REFLECTION_LLM:-"nvidia/llama-3_3-nemotron-super-49b-v1_5"}
-      # reflection llm server url. If "", Nvidia hosted API is used
If "", Nvidia hosted API is used - REFLECTION_LLM_SERVERURL: ${REFLECTION_LLM_SERVERURL-"nim-llm:8000"} - # enable iterative query decomposition - ENABLE_QUERY_DECOMPOSITION: ${ENABLE_QUERY_DECOMPOSITION:-false} - # maximum recursion depth for iterative query decomposition - MAX_RECURSION_DEPTH: ${MAX_RECURSION_DEPTH:-3} - - ports: - - "8081:8081" - expose: - - "8081" - shm_size: 5gb - - # Sample UI container which interacts with APIs exposed by rag-server container - rag-playground: - container_name: rag-playground - image: nvcr.io/nvstaging/blueprint/rag-playground:${TAG:-2.3.0.rc0} - build: - # Set context to repo's root directory - context: ../../frontend - dockerfile: ./Dockerfile - args: - # Environment variables for Vite build - VITE_API_CHAT_URL: ${VITE_API_CHAT_URL:-http://rag-server:8081/v1} - VITE_API_VDB_URL: ${VITE_API_VDB_URL:-http://ingestor-server:8082/v1} - VITE_MILVUS_URL: http://milvus:19530 - ports: - - "8090:3000" - expose: - - "3000" - environment: - # Runtime environment variables for Vite - VITE_API_CHAT_URL: ${VITE_API_CHAT_URL:-http://rag-server:8081/v1} - VITE_API_VDB_URL: ${VITE_API_VDB_URL:-http://ingestor-server:8082/v1} - VITE_MILVUS_URL: http://milvus:19530 - depends_on: - - rag-server - -networks: - default: - name: nvidia-rag diff --git a/examples/rag_library_mode/src/rag_library_mode/deploy/vectordb.yaml b/examples/rag_library_mode/src/rag_library_mode/deploy/vectordb.yaml deleted file mode 100644 index ed9bf8403..000000000 --- a/examples/rag_library_mode/src/rag_library_mode/deploy/vectordb.yaml +++ /dev/null @@ -1,102 +0,0 @@ -services: - - # Milvus can be made GPU accelerated by uncommenting the lines as specified below - milvus: - container_name: milvus-standalone - image: milvusdb/milvus:${MILVUS_VERSION:-v2.6.0-gpu} # milvusdb/milvus:v2.6.0 for CPU - command: ["milvus", "run", "standalone"] - environment: - ETCD_ENDPOINTS: etcd:2379 - MINIO_ADDRESS: minio:9010 - KNOWHERE_GPU_MEM_POOL_SIZE: 2048;4096 - volumes: - - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/milvus:/var/lib/milvus - # healthcheck: - # test: ["CMD", "curl", "-f", "http://localhost:9091/healthz"] - # interval: 30s - # start_period: 90s - # timeout: 20s - # retries: 3 - ports: - - "19530:19530" - - "9091:9091" - depends_on: - - "etcd" - - "minio" - # Comment out this section if CPU based image is used and set below env variables to False - # export APP_VECTORSTORE_ENABLEGPUSEARCH=False - # export APP_VECTORSTORE_ENABLEGPUINDEX=False - deploy: - resources: - reservations: - devices: - - driver: nvidia - capabilities: ["gpu"] - # count: ${INFERENCE_GPU_COUNT:-all} - device_ids: ['${VECTORSTORE_GPU_DEVICE_ID:-0}'] - profiles: ["", "milvus"] - - etcd: - container_name: milvus-etcd - image: quay.io/coreos/etcd:v3.6.4 - environment: - - ETCD_AUTO_COMPACTION_MODE=revision - - ETCD_AUTO_COMPACTION_RETENTION=1000 - - ETCD_QUOTA_BACKEND_BYTES=4294967296 - - ETCD_SNAPSHOT_COUNT=50000 - volumes: - - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/etcd:/etcd - command: etcd -advertise-client-urls=http://127.0.0.1:2379 -listen-client-urls http://0.0.0.0:2379 --data-dir /etcd - healthcheck: - test: ["CMD", "etcdctl", "endpoint", "health"] - interval: 30s - timeout: 20s - retries: 3 - profiles: ["", "milvus"] - - minio: - container_name: milvus-minio - image: minio/minio:RELEASE.2025-07-23T15-54-02Z - environment: - MINIO_ACCESS_KEY: minioadmin - MINIO_SECRET_KEY: minioadmin - ports: - - "9011:9011" - - "9010:9010" - volumes: - - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/minio:/minio_data - command: minio server 
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:9010/minio/health/live"]
-      interval: 30s
-      timeout: 20s
-      retries: 3
-    profiles: ["", "milvus", "elasticsearch", "minio"]
-
-  elasticsearch:
-    container_name: elasticsearch
-    image: "docker.elastic.co/elasticsearch/elasticsearch:9.0.3"
-    ports:
-      - 9200:9200
-    volumes:
-      # Run "sudo chown -R 1000:1000 deploy/compose/volumes/elasticsearch/" to fix permissions
-      - ${DOCKER_VOLUME_DIRECTORY:-.}/volumes/elasticsearch:/usr/share/elasticsearch/data
-    restart: on-failure
-    environment:
-      - discovery.type=single-node
-      - "ES_JAVA_OPTS=-Xms1024m -Xmx1024m"
-      - xpack.security.enabled=false
-      - xpack.license.self_generated.type=basic
-      - network.host=0.0.0.0
-      - cluster.routing.allocation.disk.threshold_enabled=false
-    hostname: elasticsearch
-    healthcheck:
-      test: ["CMD", "curl", "-s", "-f", "http://localhost:9200/_cat/health"]
-      interval: 10s
-      timeout: 1s
-      retries: 10
-    profiles: ["elasticsearch"]
-
-networks:
-  default:
-    name: nvidia-rag
\ No newline at end of file
diff --git a/examples/rag_library_mode/src/rag_library_mode/rag_library_mode_function.py b/examples/rag_library_mode/src/rag_library_mode/rag_library_mode_function.py
deleted file mode 100644
index 5f374f4ab..000000000
--- a/examples/rag_library_mode/src/rag_library_mode/rag_library_mode_function.py
+++ /dev/null
@@ -1,78 +0,0 @@
-import logging
-
-from pydantic import Field
-
-from nat.builder.builder import Builder
-from nat.builder.function_info import FunctionInfo
-from nat.cli.register_workflow import register_function
-from nat.data_models.function import FunctionBaseConfig
-
-from nvidia_rag import NvidiaRAG, NvidiaRAGIngestor
-
-import json
-import base64
-from IPython.display import display, Image, Markdown
-
-
-logger = logging.getLogger(__name__)
-
-
-class RagLibraryModeFunctionConfig(FunctionBaseConfig, name="rag_library_mode"):
-    """
-    This tool retrieves relevant documents for a given user query. The input query is mapped to the most appropriate
-    Milvus collection database. This will return relevant documents from the selected collection.
- """ - base_url: str = Field(description="The base url used to connect to the milvus database.") - reranker_top_k: int = Field(default=100, description="The number of results to return from the milvus database.") - vdb_top_k: int = Field(default=10, description="The number of results to return from the milvus database.") - collection_names: list = Field(default=["cuda_docs"], - description="The list of available collection names.") - - - -@register_function(config_type=RagLibraryModeFunctionConfig) -async def rag_library_mode_function( - config: RagLibraryModeFunctionConfig, builder: Builder -): - - def parse_search_citations(citations): - - parsed_docs = [] - - for idx, citation in enumerate(citations.results): - # If using pydantic models, citation fields may be attributes, not dict keys - content = getattr(citation, 'content', '') - doc_name = getattr(citation, 'document_name', f'Citation {idx+1}') - parsed_document = f'\n{content}\n' - parsed_docs.append(parsed_document) - - # combine parsed documents into a single string - internal_search_docs = "\n\n---\n\n".join(parsed_docs) - return internal_search_docs - - async def _response_fn(query: str) -> str: - # Process the input_message and generate output - - rag = NvidiaRAG() - ingestor = NvidiaRAGIngestor() - - # Just to debug - response = ingestor.get_documents( - collection_name=config.collection_names, - vdb_endpoint=config.base_url, - ) - logger.info(f"***** {response}") - - return parse_search_citations(rag.search( - query=f"{query}", - collection_names=config.collection_names, - reranker_top_k=config.reranker_top_k, - vdb_top_k=config.vdb_top_k, - )) - - try: - yield FunctionInfo.create(single_fn=_response_fn) - except GeneratorExit: - logger.warning("Function exited early!") - finally: - logger.info("Cleaning up rag_library_mode workflow.") \ No newline at end of file diff --git a/examples/rag_library_mode/src/rag_library_mode/register.py b/examples/rag_library_mode/src/rag_library_mode/register.py deleted file mode 100644 index cbdb8f3aa..000000000 --- a/examples/rag_library_mode/src/rag_library_mode/register.py +++ /dev/null @@ -1,4 +0,0 @@ -# flake8: noqa - -# Import any tools which need to be automatically registered here -from rag_library_mode import rag_library_mode_function \ No newline at end of file