Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use Mimir extract as the document source #62

Draft
wants to merge 15 commits into
base: main
Choose a base branch
from
Draft
38 changes: 37 additions & 1 deletion .tekton/aap-rag-content-pull-request.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,13 @@ spec:
value: Containerfile-aap
- name: build-args
value: [FLAVOR=gpu]
- name: mimir-enc-secret
value: dummy
- name: target-dirs
value: "red_hat_content/documentation/ansible_on_clouds/2.x
red_hat_content/documentation/red_hat_ansible_automation_platform/2.5
red_hat_content/documentation/red_hat_ansible_lightspeed_with_ibm_watsonx_code_assistant/2.x_latest"

taskRunSpecs:
- pipelineTaskName: ecosystem-cert-preflight-checks
stepSpecs:
Expand Down Expand Up @@ -185,14 +192,41 @@ spec:
workspace: workspace
- name: basic-auth
workspace: git-auth
- name: clone-repository-2
params:
- name: url
value: https://gitlab.cee.redhat.com/jsprague/mimir-extracted-content-for-ai.git
- name: revision
value: latest
runAfter:
- clone-repository
taskRef:
params:
- name: name
value: git-clone
- name: bundle
value: quay.io/konflux-ci/tekton-catalog/task-git-clone:0.1@sha256:d091a9e19567a4cbdc5acd57903c71ba71dc51d749a4ba7477e689608851e981
- name: kind
value: task
resolver: bundles
when:
- input: $(tasks.init.results.build)
operator: in
values:
- "true"
workspaces:
- name: output
workspace: workspace
- name: basic-auth
workspace: git-auth
- name: prefetch-dependencies
params:
- name: input
value: $(params.prefetch-input)
- name: dev-package-managers
value: "true"
runAfter:
- clone-repository
- clone-repository-2
taskRef:
params:
- name: name
Expand Down Expand Up @@ -237,6 +271,8 @@ spec:
value: $(tasks.clone-repository.results.commit)
- name: BUILD_ARGS
value:
- TARGET_DIRS=$(params.target-dirs)
- MIMIR_ENC_SECRET=$(params.mimir-enc-secret)
- $(params.build-args[*])
- name: BUILD_ARGS_FILE
value: $(params.build-args-file)
Expand Down
22 changes: 20 additions & 2 deletions Containerfile-aap
Original file line number Diff line number Diff line change
@@ -1,28 +1,43 @@
ARG EMBEDDING_MODEL=sentence-transformers/all-mpnet-base-v2
ARG FLAVOR=cpu
ARG HERMETIC=false
ARG MIMIR_ENC_SECRET
ARG TARGET_DIRS

FROM registry.access.redhat.com/ubi9/python-311 as cpu-base
ARG EMBEDDING_MODEL
ARG FLAVOR
ARG MIMIR_ENC_SECRET
ARG TARGET_DIRS

FROM nvcr.io/nvidia/cuda:12.6.3-devel-ubi9 as gpu-base
ARG EMBEDDING_MODEL
ARG FLAVOR
ARG MIMIR_ENC_SECRET
ARG TARGET_DIRS
RUN dnf install -y python3.11 python3.11-pip libcudnn9 libnccl

FROM ${FLAVOR}-base as aap-rag-builder
ARG EMBEDDING_MODEL
ARG FLAVOR
ARG AAP_VERSION=2.5
ARG MIMIR_ENC_SECRET
ARG TARGET_DIRS

ENV MIMIR_ENC_SECRET=$MIMIR_ENC_SECRET

RUN echo 1. TARGET_DIRS=$TARGET_DIRS
RUN echo 2. TARGET_DIRS=${TARGET_DIRS}
ENV TARGET_DIRS=$TARGET_DIRS
RUN echo 3. TARGET_DIRS=$TARGET_DIRS
RUN echo 4. TARGET_DIRS=${TARGET_DIRS}

USER 0
WORKDIR /workdir

COPY pyproject.toml pdm.lock.* Makefile .
RUN make install-tools && pdm config python.use_venv false && make pdm-lock-check install-deps

COPY aap-product-docs-plaintext ./aap-product-docs-plaintext
COPY additional_docs ./additional_docs

COPY scripts/download_embeddings_model.py .
Expand All @@ -34,10 +49,13 @@ RUN if [ "$FLAVOR" == "gpu" ]; then \
"import torch; print(torch.version.cuda); print(torch.cuda.is_available());"; \
fi

COPY scripts/download-mimir.sh .
RUN ./download-mimir.sh

COPY scripts/generate_embeddings-aap.py .
RUN export LD_LIBRARY_PATH=/usr/local/cuda-12.6/compat:$LD_LIBRARY_PATH; \
set -e && pdm run python generate_embeddings-aap.py \
-f aap-product-docs-plaintext \
-f red_hat_content \
-mn ${EMBEDDING_MODEL} \
-o vector_db/aap_product_docs/${AAP_VERSION} \
-i aap-product-docs-$(echo $AAP_VERSION | sed 's/\./_/g') \
Expand Down
22 changes: 22 additions & 0 deletions scripts/download-mimir.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/bin/bash
# Download the encrypted Mimir extract, decrypt it with the shared secret and
# unpack the requested directories into the current working directory.
#
# Required environment variables:
#   MIMIR_ENC_SECRET - passphrase used to decrypt the extract archive
#   TARGET_DIRS      - space-separated list of archive members to unpack

# Fail fast on errors, undefined variables and pipeline failures.
set -euo pipefail

if [[ -z "${MIMIR_ENC_SECRET:-}" ]]; then
    echo Environment variable MIMIR_ENC_SECRET is not defined.
    exit 1
fi

if [[ -z "${TARGET_DIRS:-}" ]]; then
    echo Environment variable TARGET_DIRS is not defined.
    exit 1
fi

OUT_DIR=/tmp/mimir_work
# Remove the scratch directory on any exit (success or failure).
# BUGFIX: the original referenced the undefined variable OUTDIR here and in
# the mkdir below, so nothing was created or cleaned up as intended.
trap 'rm -rf "${OUT_DIR}"' EXIT

mkdir -pv "${OUT_DIR}"

git clone [email protected]:jsprague/mimir-extracted-content-for-ai.git "${OUT_DIR}/mimir-extracted-content-for-ai"
openssl enc -aes-256-cbc -d -pbkdf2 -pass "pass:${MIMIR_ENC_SECRET}" \
    -in "${OUT_DIR}/mimir-extracted-content-for-ai/mimir-extract-latest.tgz.enc" \
    -out "${OUT_DIR}/mimir-extract-latest.tgz"
# TARGET_DIRS is intentionally left unquoted: it is a space-separated list of
# member paths, and tar must receive each one as a separate argument.
tar xvzf "${OUT_DIR}/mimir-extract-latest.tgz" ${TARGET_DIRS}
3 changes: 3 additions & 0 deletions scripts/generate_embeddings-aap.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,9 @@ def got_whitespace(text: str) -> bool:
faiss_index = faiss.index_cpu_to_gpu(gpu_resource, 0, faiss_index)
except AssertionError:
gpu_resource = None
except AttributeError:
gpu_resource = None

vector_store = FaissVectorStore(faiss_index=faiss_index)
storage_context = StorageContext.from_defaults(vector_store=vector_store)

Expand Down
147 changes: 147 additions & 0 deletions scripts/mimir-parser.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
import argparse
import configparser
import json
import os
import re
import time

class DocFinder:
    """Collects the per-document base directories found under each target
    directory.

    After ``run()``, ``base_dirs`` holds one path per immediate subdirectory
    of every entry in ``target_dirs``.
    """

    def __init__(self, target_dirs):
        # Directories to scan for extracted documents.
        self.target_dirs = target_dirs
        # Populated by run(): one entry per document subdirectory found.
        self.base_dirs = []

    def run(self):
        """Scan every target directory and record each subdirectory found."""
        self.base_dirs.extend(
            os.path.join(parent, entry)
            for parent in self.target_dirs
            for entry in os.listdir(parent)
            if os.path.isdir(os.path.join(parent, entry))
        )


class MimirParser:
    """Splits one Mimir-extracted document into per-section plain-text files.

    A document directory is expected to contain a ``toc/`` subdirectory with a
    JSON table of contents and a ``single-page/`` subdirectory with one
    Markdown file.  Each TOC section becomes a separate ``.txt`` file in
    ``out_dir``, with a matching metadata JSON in ``out_dir/.metadata``.
    """

    def __init__(self, base_dir, out_dir):
        # base_dir: document directory holding "toc/" and "single-page/".
        # out_dir: destination for the generated .txt files and metadata.
        self.base_dir = base_dir
        self.out_dir = out_dir
        # Per-section metadata JSON files live in a hidden subdirectory.
        self.metadata_dir = os.path.join(self.out_dir, ".metadata")

        if not os.path.isdir(self.out_dir):
            os.makedirs(self.out_dir)
        if not os.path.isdir(self.metadata_dir):
            os.makedirs(self.metadata_dir)

        # Flat, depth-first list of TOC entries; filled in by process_toc().
        self.sections = []
        # NOTE(review): assumes exactly one file under toc/ and exactly one
        # .md file under single-page/ — confirm the extract guarantees this.
        self.toc = self.base_dir + "/toc/" + os.listdir(self.base_dir + "/toc")[0]
        self.md = self.base_dir + "/single-page/" + \
            list(filter(lambda x: x.endswith(".md"),
                        os.listdir(self.base_dir + "/single-page")))[0]

    def process_section(self, section, level):
        """Depth-first walk of one TOC node, appending titled nodes to
        ``self.sections``.  ``level`` tracks the nesting depth (root is -1)."""
        if "title" in section:
            self.sections.append(section)

        if "sections" in section:
            for s in section["sections"]:
                self.process_section(s, level + 1)

    def process_toc(self):
        """Load the TOC JSON file and flatten it into ``self.sections``."""
        with open(self.toc, encoding="utf-8") as f:
            toc = json.loads(f.read())
            self.process_section(toc, -1)


    def process_md(self):
        """Stream the single-page Markdown file, writing one .txt file (plus a
        metadata JSON) per TOC section.

        The file is expected to begin with a "+++"-delimited front-matter
        header whose ``key = "value"`` lines are rewritten to INI syntax and
        parsed with configparser into ``self.doc_metadata``.
        """
        in_md_header = False
        # Seed an INI section header so top-level "key = value" pairs parse.
        config = "[__default__]\n"
        # Save Markdown files with .txt extension so that we can reuse the existing script
        out_file = os.path.join(self.out_dir, "__index__.txt")
        f = open(out_file, "w")
        # Index into self.sections of the next title to look for; presumably
        # entry 0 is the document-level title, so matching starts at 1 —
        # TODO confirm against a real extract.
        section_index = 1

        for line in open(self.md, encoding="utf-8"):
            line = line.strip()

            if in_md_header:
                if line == "+++":
                    # End of front matter: parse the accumulated INI text and
                    # persist document-level metadata for the index file.
                    in_md_header = False
                    self.doc_metadata = configparser.ConfigParser()
                    self.doc_metadata.read_string(config)
                    metadata_file = os.path.join(self.metadata_dir, "__index__.json")
                    with open(metadata_file, "w") as meta:
                        metadata = {s:dict(self.doc_metadata.items(s)) for s in self.doc_metadata.sections()}
                        metadata["url"] = self.doc_metadata["extra"]["reference_url"]
                        metadata["path"] = self.doc_metadata["__default__"]["path"]
                        json.dump(metadata, meta, indent=2)
                else:
                    # Strip the TOML-style quotes so configparser accepts it.
                    line = re.sub(r"^(.+)\s*=\s*\"(.+)\"", r"\1 = \2", line)
                    config += line + "\n"
                continue
            elif line == "+++":
                in_md_header = True
                continue

            if line.startswith("#"):
                # NOTE(review): raises IndexError if the Markdown contains
                # more headings than the TOC has sections — confirm intended.
                title_to_match = self.sections[section_index]["title"]
                # Normalize non-breaking spaces and strip the "#"/"Chapter"
                # prefixes before comparing against the TOC title.
                title = line.replace("\xa0", " ")
                title = re.sub(r"^#+\s*", "", title)
                title = re.sub(r"^Chapter\s*", "", title)
                if title == title_to_match:
                    base_url = self.doc_metadata["extra"]["reference_url"]
                    single_page_anchor = self.sections[section_index]['singlePageAnchor']
                    # print(f"[[ {base_url}#{single_page_anchor} ]]")
                    section_index += 1

                    # Close the previous section's output before starting a
                    # new one.
                    if f:
                        f.flush()
                        f.close()
                        f = None

                    metadata_file = os.path.join(self.metadata_dir, f"{single_page_anchor}.json")
                    with open(metadata_file, "w") as meta:
                        metadata = dict()
                        metadata["url"] = f"{base_url}#{single_page_anchor}"
                        metadata["path"] = self.doc_metadata["__default__"]["path"]
                        json.dump(metadata, meta, indent=2)

                    # Save Markdown files with .txt extension so that we can reuse the existing script
                    out_file = os.path.join(self.out_dir, single_page_anchor + ".txt")
                    f = open(out_file, "w")

            # print(line)
            # Every non-front-matter line goes to the current section file.
            if f:
                print(line, file=f)

        if f:
            f.flush()
            f.close()
    def run(self):
        """Parse the TOC, then split the Markdown into section files."""
        self.process_toc()
        self.process_md()


def main():
    """Locate extracted Mimir documents and split each into section files.

    Reads the space-separated TARGET_DIRS environment variable (falling back
    to the default AAP documentation directories), finds every document
    directory beneath them, and runs MimirParser on each.  Always prints the
    total execution time, even when parsing fails.
    """
    start = time.time()
    try:
        arg_parser = argparse.ArgumentParser()
        arg_parser.add_argument("-o", "--out-dir", default="aap-product-docs-markdown")
        args = arg_parser.parse_args()

        # Default to the directories shipped in the standard Mimir extract
        # when the caller has not narrowed the scope via TARGET_DIRS.
        default_target_dirs = (
            "red_hat_content/documentation/ansible_on_clouds/2.x "
            "red_hat_content/documentation/red_hat_ansible_automation_platform/2.5 "
            "red_hat_content/documentation/red_hat_ansible_lightspeed_with_ibm_watsonx_code_assistant/2.x_latest"
        )
        target_dirs = os.environ.get("TARGET_DIRS", default_target_dirs).split()

        doc_finder = DocFinder(target_dirs)
        doc_finder.run()

        for base_dir in doc_finder.base_dirs:
            MimirParser(base_dir, os.path.join(args.out_dir, base_dir)).run()
    finally:
        print(f"Execution time: {(time.time() - start):.3f} secs")


# Script entry point: run the parser only when executed directly.
if __name__ == "__main__":
    main()