Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use Mimir extract as the document source #62

Draft
wants to merge 15 commits into
base: main
Choose a base branch
from
Draft
38 changes: 37 additions & 1 deletion .tekton/aap-rag-content-pull-request.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@ spec:
value: Containerfile-aap
- name: build-args
value: [FLAVOR=gpu]
- name: mimir-enc-secret
value: dummy
- name: target-dirs
value: "red_hat_content/documentation/ansible_on_clouds/2.x
red_hat_content/documentation/red_hat_ansible_automation_platform/2.5
red_hat_content/documentation/red_hat_ansible_lightspeed_with_ibm_watsonx_code_assistant/2.x_latest"

taskRunSpecs:
- pipelineTaskName: ecosystem-cert-preflight-checks
stepSpecs:
Expand Down Expand Up @@ -185,14 +192,41 @@ spec:
workspace: workspace
- name: basic-auth
workspace: git-auth
- name: clone-repository-2
params:
- name: url
value: https://gitlab.cee.redhat.com/jsprague/mimir-extracted-content-for-ai.git
- name: revision
value: latest
runAfter:
- clone-repository
taskRef:
params:
- name: name
value: git-clone
- name: bundle
value: quay.io/konflux-ci/tekton-catalog/task-git-clone:0.1@sha256:d091a9e19567a4cbdc5acd57903c71ba71dc51d749a4ba7477e689608851e981
- name: kind
value: task
resolver: bundles
when:
- input: $(tasks.init.results.build)
operator: in
values:
- "true"
workspaces:
- name: output
workspace: workspace
- name: basic-auth
workspace: git-auth
- name: prefetch-dependencies
params:
- name: input
value: $(params.prefetch-input)
- name: dev-package-managers
value: "true"
runAfter:
- clone-repository
- clone-repository-2
taskRef:
params:
- name: name
Expand Down Expand Up @@ -237,6 +271,8 @@ spec:
value: $(tasks.clone-repository.results.commit)
- name: BUILD_ARGS
value:
- TARGET_DIRS=$(params.target-dirs)
- MIMIR_ENC_SECRET=$(params.mimir-enc-secret)
- $(params.build-args[*])
- name: BUILD_ARGS_FILE
value: $(params.build-args-file)
Expand Down
22 changes: 20 additions & 2 deletions Containerfile-aap
Original file line number Diff line number Diff line change
@@ -1,28 +1,43 @@
ARG EMBEDDING_MODEL=sentence-transformers/all-mpnet-base-v2
ARG FLAVOR=cpu
ARG HERMETIC=false
ARG MIMIR_ENC_SECRET
ARG TARGET_DIRS

FROM registry.access.redhat.com/ubi9/python-311 as cpu-base
ARG EMBEDDING_MODEL
ARG FLAVOR
ARG MIMIR_ENC_SECRET
ARG TARGET_DIRS

FROM nvcr.io/nvidia/cuda:12.6.3-devel-ubi9 as gpu-base
ARG EMBEDDING_MODEL
ARG FLAVOR
ARG MIMIR_ENC_SECRET
ARG TARGET_DIRS
RUN dnf install -y python3.11 python3.11-pip libcudnn9 libnccl

FROM ${FLAVOR}-base as aap-rag-builder
ARG EMBEDDING_MODEL
ARG FLAVOR
ARG AAP_VERSION=2.5
ARG MIMIR_ENC_SECRET
ARG TARGET_DIRS

ENV MIMIR_ENC_SECRET=$MIMIR_ENC_SECRET

RUN echo 1. TARGET_DIRS=$TARGET_DIRS
RUN echo 2. TARGET_DIRS=${TARGET_DIRS}
ENV TARGET_DIRS=$TARGET_DIRS
RUN echo 3. TARGET_DIRS=$TARGET_DIRS
RUN echo 4. TARGET_DIRS=${TARGET_DIRS}

USER 0
WORKDIR /workdir

COPY pyproject.toml pdm.lock.* Makefile .
RUN make install-tools && pdm config python.use_venv false && make pdm-lock-check install-deps

COPY aap-product-docs-plaintext ./aap-product-docs-plaintext
COPY additional_docs ./additional_docs

COPY scripts/download_embeddings_model.py .
Expand All @@ -34,10 +49,13 @@ RUN if [ "$FLAVOR" == "gpu" ]; then \
"import torch; print(torch.version.cuda); print(torch.cuda.is_available());"; \
fi

COPY scripts/download-mimir.sh .
RUN ./download-mimir.sh

COPY scripts/generate_embeddings-aap.py .
RUN export LD_LIBRARY_PATH=/usr/local/cuda-12.6/compat:$LD_LIBRARY_PATH; \
set -e && pdm run python generate_embeddings-aap.py \
-f aap-product-docs-plaintext \
-f red_hat_content \
-mn ${EMBEDDING_MODEL} \
-o vector_db/aap_product_docs/${AAP_VERSION} \
-i aap-product-docs-$(echo $AAP_VERSION | sed 's/\./_/g') \
Expand Down
22 changes: 22 additions & 0 deletions scripts/download-mimir.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
# Download the encrypted Mimir extract, decrypt it with the shared secret and
# unpack the requested directories into the current working directory.
#
# Required environment variables:
#   MIMIR_ENC_SECRET - passphrase used to decrypt the extract archive
#   TARGET_DIRS      - space-separated list of archive members to unpack

# Fail fast on errors, undefined variables and pipeline failures.
set -euo pipefail

if [[ -z "${MIMIR_ENC_SECRET:-}" ]]; then
    echo Environment variable MIMIR_ENC_SECRET is not defined.
    exit 1
fi

if [[ -z "${TARGET_DIRS:-}" ]]; then
    echo Environment variable TARGET_DIRS is not defined.
    exit 1
fi

OUT_DIR=/tmp/mimir_work
# Remove the scratch directory on any exit (success or failure).
# BUGFIX: the original referenced the undefined variable OUTDIR here and in
# the mkdir below, so nothing was created or cleaned up as intended.
trap 'rm -rf "${OUT_DIR}"' EXIT

mkdir -pv "${OUT_DIR}"

git clone [email protected]:jsprague/mimir-extracted-content-for-ai.git "${OUT_DIR}/mimir-extracted-content-for-ai"
openssl enc -aes-256-cbc -d -pbkdf2 -pass "pass:${MIMIR_ENC_SECRET}" \
    -in "${OUT_DIR}/mimir-extracted-content-for-ai/mimir-extract-latest.tgz.enc" \
    -out "${OUT_DIR}/mimir-extract-latest.tgz"
# TARGET_DIRS is intentionally left unquoted: it is a space-separated list of
# member paths, and tar must receive each one as a separate argument.
tar xvzf "${OUT_DIR}/mimir-extract-latest.tgz" ${TARGET_DIRS}
3 changes: 3 additions & 0 deletions scripts/generate_embeddings-aap.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,9 @@ def got_whitespace(text: str) -> bool:
faiss_index = faiss.index_cpu_to_gpu(gpu_resource, 0, faiss_index)
except AssertionError:
gpu_resource = None
except AttributeError:
gpu_resource = None

vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

Expand Down
147 changes: 147 additions & 0 deletions scripts/mimir-parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import argparse
import configparser
import json
import os
import re
import time

class DocFinder:
    """Collects the per-document base directories found under each target
    directory.

    After ``run()``, ``base_dirs`` holds one path per immediate subdirectory
    of every entry in ``target_dirs``.
    """

    def __init__(self, target_dirs):
        # Directories to scan for extracted documents.
        self.target_dirs = target_dirs
        # Populated by run(): one entry per document subdirectory found.
        self.base_dirs = []

    def run(self):
        """Scan every target directory and record each subdirectory found."""
        self.base_dirs.extend(
            os.path.join(parent, entry)
            for parent in self.target_dirs
            for entry in os.listdir(parent)
            if os.path.isdir(os.path.join(parent, entry))
        )


class MimirParser:
    """Splits one Mimir-extracted document into per-section plain-text files.

    A document directory is expected to contain a ``toc/`` subdirectory with a
    JSON table of contents and a ``single-page/`` subdirectory with one
    Markdown file.  Each TOC section becomes a separate ``.txt`` file in
    ``out_dir``, with a matching metadata JSON in ``out_dir/.metadata``.
    """

    def __init__(self, base_dir, out_dir):
        # base_dir: document directory holding "toc/" and "single-page/".
        # out_dir: destination for the generated .txt files and metadata.
        self.base_dir = base_dir
        self.out_dir = out_dir
        # Per-section metadata JSON files live in a hidden subdirectory.
        self.metadata_dir = os.path.join(self.out_dir, ".metadata")

        if not os.path.isdir(self.out_dir):
            os.makedirs(self.out_dir)
        if not os.path.isdir(self.metadata_dir):
            os.makedirs(self.metadata_dir)

        # Flat, depth-first list of TOC entries; filled in by process_toc().
        self.sections = []
        # NOTE(review): assumes exactly one file under toc/ and exactly one
        # .md file under single-page/ — confirm the extract guarantees this.
        self.toc = self.base_dir + "/toc/" + os.listdir(self.base_dir + "/toc")[0]
        self.md = self.base_dir + "/single-page/" + \
            list(filter(lambda x: x.endswith(".md"),
                        os.listdir(self.base_dir + "/single-page")))[0]

    def process_section(self, section, level):
        """Depth-first walk of one TOC node, appending titled nodes to
        ``self.sections``.  ``level`` tracks the nesting depth (root is -1)."""
        if "title" in section:
            self.sections.append(section)

        if "sections" in section:
            for s in section["sections"]:
                self.process_section(s, level + 1)

    def process_toc(self):
        """Load the TOC JSON file and flatten it into ``self.sections``."""
        with open(self.toc, encoding="utf-8") as f:
            toc = json.loads(f.read())
            self.process_section(toc, -1)


    def process_md(self):
        """Stream the single-page Markdown file, writing one .txt file (plus a
        metadata JSON) per TOC section.

        The file is expected to begin with a "+++"-delimited front-matter
        header whose ``key = "value"`` lines are rewritten to INI syntax and
        parsed with configparser into ``self.doc_metadata``.
        """
        in_md_header = False
        # Seed an INI section header so top-level "key = value" pairs parse.
        config = "[__default__]\n"
        # Save Markdown files with .txt extension so that we can reuse the existing script
        out_file = os.path.join(self.out_dir, "__index__.txt")
        f = open(out_file, "w")
        # Index into self.sections of the next title to look for; presumably
        # entry 0 is the document-level title, so matching starts at 1 —
        # TODO confirm against a real extract.
        section_index = 1

        for line in open(self.md, encoding="utf-8"):
            line = line.strip()

            if in_md_header:
                if line == "+++":
                    # End of front matter: parse the accumulated INI text and
                    # persist document-level metadata for the index file.
                    in_md_header = False
                    self.doc_metadata = configparser.ConfigParser()
                    self.doc_metadata.read_string(config)
                    metadata_file = os.path.join(self.metadata_dir, "__index__.json")
                    with open(metadata_file, "w") as meta:
                        metadata = {s:dict(self.doc_metadata.items(s)) for s in self.doc_metadata.sections()}
                        metadata["url"] = self.doc_metadata["extra"]["reference_url"]
                        metadata["path"] = self.doc_metadata["__default__"]["path"]
                        json.dump(metadata, meta, indent=2)
                else:
                    # Strip the TOML-style quotes so configparser accepts it.
                    line = re.sub(r"^(.+)\s*=\s*\"(.+)\"", r"\1 = \2", line)
                    config += line + "\n"
                continue
            elif line == "+++":
                in_md_header = True
                continue

            if line.startswith("#"):
                # NOTE(review): raises IndexError if the Markdown contains
                # more headings than the TOC has sections — confirm intended.
                title_to_match = self.sections[section_index]["title"]
                # Normalize non-breaking spaces and strip the "#"/"Chapter"
                # prefixes before comparing against the TOC title.
                title = line.replace("\xa0", " ")
                title = re.sub(r"^#+\s*", "", title)
                title = re.sub(r"^Chapter\s*", "", title)
                if title == title_to_match:
                    base_url = self.doc_metadata["extra"]["reference_url"]
                    single_page_anchor = self.sections[section_index]['singlePageAnchor']
                    # print(f"[[ {base_url}#{single_page_anchor} ]]")
                    section_index += 1

                    # Close the previous section's output before starting a
                    # new one.
                    if f:
                        f.flush()
                        f.close()
                        f = None

                    metadata_file = os.path.join(self.metadata_dir, f"{single_page_anchor}.json")
                    with open(metadata_file, "w") as meta:
                        metadata = dict()
                        metadata["url"] = f"{base_url}#{single_page_anchor}"
                        metadata["path"] = self.doc_metadata["__default__"]["path"]
                        json.dump(metadata, meta, indent=2)

                    # Save Markdown files with .txt extension so that we can reuse the existing script
                    out_file = os.path.join(self.out_dir, single_page_anchor + ".txt")
                    f = open(out_file, "w")

            # print(line)
            # Every non-front-matter line goes to the current section file.
            if f:
                print(line, file=f)

        if f:
            f.flush()
            f.close()
    def run(self):
        """Parse the TOC, then split the Markdown into section files."""
        self.process_toc()
        self.process_md()


def main():
    """Locate extracted Mimir documents and split each into section files.

    Reads the space-separated TARGET_DIRS environment variable (falling back
    to the default AAP documentation directories), finds every document
    directory beneath them, and runs MimirParser on each.  Always prints the
    total execution time, even when parsing fails.
    """
    start = time.time()
    try:
        arg_parser = argparse.ArgumentParser()
        arg_parser.add_argument("-o", "--out-dir", default="aap-product-docs-markdown")
        args = arg_parser.parse_args()

        # Default to the directories shipped in the standard Mimir extract
        # when the caller has not narrowed the scope via TARGET_DIRS.
        default_target_dirs = (
            "red_hat_content/documentation/ansible_on_clouds/2.x "
            "red_hat_content/documentation/red_hat_ansible_automation_platform/2.5 "
            "red_hat_content/documentation/red_hat_ansible_lightspeed_with_ibm_watsonx_code_assistant/2.x_latest"
        )
        target_dirs = os.environ.get("TARGET_DIRS", default_target_dirs).split()

        doc_finder = DocFinder(target_dirs)
        doc_finder.run()

        for base_dir in doc_finder.base_dirs:
            MimirParser(base_dir, os.path.join(args.out_dir, base_dir)).run()
    finally:
        print(f"Execution time: {(time.time() - start):.3f} secs")


# Script entry point: run the parser only when executed directly.
if __name__ == "__main__":
    main()