Multi-Domain-Support-Triage/code/answer_synthesis.py at main · NITISH-R-G/Multi-Domain-Support-Triage · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""Offline answer shaping: convert retrieved markdown-ish text into short actionable guidance."""
from __future__ import annotations

import hashlib
import re

from retrieve import Retrieved


# Matches a single markdown list item: optional leading whitespace, a bullet
# marker ("-", "*" or "•") or an "N." number, at least one whitespace character,
# and then the item text, which is captured as group 1.
_BULLET_LINE = re.compile(r"(?m)^\s*(?:[-*•]|\d+\.)\s+(.*)$")

# Lines that usually aren't helpful as user-facing steps.
# Entries are lower-case because extract_steps compares them against the
# lower-cased candidate text with a prefix (startswith) check.
_NOISE_PREFIXES = (
    "last updated:",
    "_last updated",
    "note:",
    "important:",
    "warning:",
)


def _clean_line(line: str) -> str:
    s = line.strip()
    s = re.sub(r"\s+", " ", s)
    return s


def _strip_heading_noise(text: str) -> str:
    # Remove markdown headings but keep their content lines handled separately.
    text = re.sub(r"(?m)^#+\s+.*$", "", text)
    return text


def extract_steps(text: str, *, max_steps: int = 8, max_chars_per_step: int = 260) -> list[str]:
    """Extract short, readable steps from a support article body.

    Headings are stripped first. Bullet or numbered list items are preferred;
    only when none survive filtering is the text split into sentences, of
    which those with at least 40 characters are used.

    Args:
        text: Article body (markdown-ish).
        max_steps: Upper bound on the number of steps returned.
        max_chars_per_step: Steps longer than this are cut and end with "…".

    Returns:
        The collected steps, in document order.
    """

    def _shorten(candidate: str) -> str:
        # Leave room for the ellipsis so the result fits the limit.
        if len(candidate) <= max_chars_per_step:
            return candidate
        return candidate[: max_chars_per_step - 1] + "…"

    body = _strip_heading_noise(text)
    collected: list[str] = []

    # First pass: explicit bullets / numbered items.
    for raw in body.splitlines():
        found = _BULLET_LINE.match(raw.strip())
        if found is None:
            continue
        item = _clean_line(found.group(1))
        if item and not item.lower().startswith(_NOISE_PREFIXES):
            collected.append(_shorten(item))
            if len(collected) >= max_steps:
                break

    if collected:
        return collected[:max_steps]

    # Second pass: no usable list items, so fall back to a naive sentence
    # split of the flattened text.
    flat = _clean_line(re.sub(r"\s+", " ", body))
    for sentence in re.split(r"(?<=[.!?])\s+", flat):
        sentence = _clean_line(sentence)
        if len(sentence) < 40 or sentence.lower().startswith(_NOISE_PREFIXES):
            continue
        collected.append(_shorten(sentence))
        if len(collected) >= max_steps:
            break

    return collected[:max_steps]


def synthesize_reply_from_hits(hits: list[Retrieved], *, max_sources: int = 2) -> tuple[str, list[str]]:
    """Build a user-facing reply from the top retrieval hits.

    For each of the first ``max_sources`` hits, the chunk text is turned into
    a numbered list of steps under a "From <title>:" line; if no steps can be
    extracted, a whitespace-flattened excerpt of up to 700 characters is used
    instead. One of three closing sentences is appended, selected by a
    SHA-256 hash of the reply body so the same body always gets the same one.

    Returns:
        ``(user_response, source_paths_used)``; ``("", [])`` when *hits* is
        empty.
    """
    if not hits:
        return "", []

    closings = (
        "If this doesn’t match what you see, contact official support for your product.",
        "If you still need help, use your product’s official support channel.",
        "For anything still unclear, reach out through the official support path for your product.",
    )

    used_paths: list[str] = []
    sections: list[str] = []

    for hit in hits[:max_sources]:
        chunk = hit.chunk
        used_paths.append(chunk.path)
        heading = chunk.title.strip()
        steps = extract_steps(chunk.text)
        if steps:
            content = "\n".join(
                f"{number}. {step}" for number, step in enumerate(steps, start=1)
            )
        else:
            # Last resort: a small flattened excerpt of the raw text.
            flat = re.sub(r"\s+", " ", chunk.text).strip()
            content = (flat[:700] + "…") if len(flat) > 700 else flat
        sections.append(f"From {heading}:\n{content}")

    reply = "\n\n".join(sections).strip()
    # Deterministic choice of closing: derived from the reply text itself.
    digest = int(hashlib.sha256(reply.encode("utf-8")).hexdigest(), 16)
    reply += "\n\n" + closings[digest % len(closings)]
    return reply, used_paths