-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathanswer_synthesis.py
More file actions
110 lines (90 loc) · 3.54 KB
/
answer_synthesis.py
File metadata and controls
110 lines (90 loc) · 3.54 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
"""Offline answer shaping: convert retrieved markdown-ish text into short actionable guidance."""
from __future__ import annotations
import hashlib
import re
from retrieve import Retrieved
# Matches one list-item line: optional leading whitespace, then a "-", "*" or
# "•" marker (or digits followed by "."), then whitespace. Group 1 captures the
# item text that follows the marker.
_BULLET_LINE = re.compile(r"(?m)^\s*(?:[-*•]|\d+\.)\s+(.*)$")
# Lines that usually aren't helpful as user-facing steps. Entries are lowercase
# because they are compared with str.startswith against lowercased text.
_NOISE_PREFIXES = (
    "last updated:",
    "_last updated",
    "note:",
    "important:",
    "warning:",
)
def _clean_line(line: str) -> str:
s = line.strip()
s = re.sub(r"\s+", " ", s)
return s
def _strip_heading_noise(text: str) -> str:
# Remove markdown headings but keep their content lines handled separately.
text = re.sub(r"(?m)^#+\s+.*$", "", text)
return text
def extract_steps(text: str, *, max_steps: int = 8, max_chars_per_step: int = 260) -> list[str]:
    """Turn a support-article body into a short list of readable steps.

    Heading lines are stripped first. Bulleted or numbered list items are
    preferred; only when none survive filtering is the text split into
    sentences instead.

    Args:
        text: Markdown-ish article body.
        max_steps: Upper bound on the number of steps returned.
        max_chars_per_step: Entries longer than this are cut to
            ``max_chars_per_step - 1`` characters plus an ellipsis.

    Returns:
        The selected steps in document order (possibly empty).
    """

    def is_noise(candidate: str) -> bool:
        # str.startswith accepts the whole prefix tuple in one call.
        return candidate.lower().startswith(_NOISE_PREFIXES)

    def shorten(candidate: str) -> str:
        if len(candidate) <= max_chars_per_step:
            return candidate
        return candidate[: max_chars_per_step - 1] + "…"

    body = _strip_heading_noise(text)
    collected: list[str] = []

    # First choice: explicit bullet / numbered list items.
    for raw in body.splitlines():
        found = _BULLET_LINE.match(raw.strip())
        if found is None:
            continue
        item = _clean_line(found.group(1))
        if item and not is_noise(item):
            collected.append(shorten(item))
            if len(collected) >= max_steps:
                break

    if collected:
        return collected[:max_steps]

    # Fallback: no usable list items, so cut the flattened text into sentences
    # with a naive split after ".", "!" or "?".
    flat = _clean_line(re.sub(r"\s+", " ", body))
    for piece in re.split(r"(?<=[.!?])\s+", flat):
        sentence = _clean_line(piece)
        # Fragments under 40 characters are skipped as too short to be a step.
        if len(sentence) >= 40 and not is_noise(sentence):
            collected.append(shorten(sentence))
            if len(collected) >= max_steps:
                break

    return collected[:max_steps]
def synthesize_reply_from_hits(hits: list[Retrieved], *, max_sources: int = 2) -> tuple[str, list[str]]:
    """Build a user-facing reply from the leading retrieval hits.

    Args:
        hits: Retrieval results; each is read through ``hit.chunk.path``,
            ``hit.chunk.title`` and ``hit.chunk.text``.
        max_sources: Only ``hits[:max_sources]`` contribute to the reply.

    Returns:
        ``(user_response, source_paths_used)``. An empty ``hits`` list yields
        ``("", [])``. Otherwise the response holds one "From <title>:" section
        per used hit, followed by a closing sentence.
    """
    if not hits:
        return "", []

    used_paths: list[str] = []
    sections: list[str] = []
    for hit in hits[:max_sources]:
        chunk = hit.chunk
        used_paths.append(chunk.path)
        title = chunk.title.strip()
        steps = extract_steps(chunk.text)
        if steps:
            content = "\n".join(f"{n}. {step}" for n, step in enumerate(steps, start=1))
        else:
            # Last resort: a flattened excerpt capped at 700 characters.
            flat = re.sub(r"\s+", " ", chunk.text).strip()
            if len(flat) > 700:
                content = flat[:700] + "…"
            else:
                content = flat
        sections.append(f"From {title}:\n{content}")

    reply = "\n\n".join(sections).strip()

    # The closing is picked from a hash of the reply text, so the same reply
    # always gets the same closing while different replies vary.
    closings = (
        "If this doesn’t match what you see, contact official support for your product.",
        "If you still need help, use your product’s official support channel.",
        "For anything still unclear, reach out through the official support path for your product.",
    )
    digest = hashlib.sha256(reply.encode("utf-8")).digest()
    # Big-endian interpretation of the digest equals int(hexdigest, 16).
    closing = closings[int.from_bytes(digest, "big") % len(closings)]
    return reply + "\n\n" + closing, used_paths