From 87e69e6467217c0ec00e9e7860dd5d2723d0d2d5 Mon Sep 17 00:00:00 2001 From: bars0um Date: Fri, 10 Oct 2025 08:47:27 +0300 Subject: [PATCH 1/3] improve content extraction --- modules/web_search.py | 261 +++++++++++++++++++++++++++++++++--------- 1 file changed, 208 insertions(+), 53 deletions(-) mode change 100644 => 100755 modules/web_search.py diff --git a/modules/web_search.py b/modules/web_search.py old mode 100644 new mode 100755 index 597af4b289..c1f70e3d5d --- a/modules/web_search.py +++ b/modules/web_search.py @@ -5,7 +5,7 @@ import urllib.request from concurrent.futures import as_completed from datetime import datetime -from urllib.parse import quote_plus +from urllib.parse import quote_plus, urlparse, parse_qs, unquote import requests @@ -20,35 +20,201 @@ def get_current_timestamp(): def download_web_page(url, timeout=10): """ - Download a web page and convert its HTML content to structured Markdown text. + Download a web page and convert its HTML content to Markdown text, + handling Brotli/gzip and non-HTML content robustly. """ - import html2text + logger.info(f"download_web_page {url}") + # --- soft deps try: - headers = { - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36' - } - response = requests.get(url, headers=headers, timeout=timeout) - response.raise_for_status() # Raise an exception for bad status codes - - # Initialize the HTML to Markdown converter - h = html2text.HTML2Text() - h.body_width = 0 - h.ignore_images = True - h.ignore_links = True - - # Convert the HTML to Markdown - markdown_text = h.handle(response.text) - - return markdown_text + import html2text + except Exception: + logger.exception("html2text import failed") + html2text = None + + try: + from readability import Document + except Exception: + Document = None + + try: + import brotli as _brotli + have_brotli = True + except Exception: + _brotli = None + have_brotli = False + + import gzip, zlib, re, html as _html + + headers = { + "User-Agent": ( + "Mozilla/5.0 (Windows NT 10.0; Win64; x64) " + "AppleWebKit/537.36 (KHTML, like Gecko) " + "Chrome/124.0.0.0 Safari/537.36" + ), + "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "Accept-Language": "en-US,en;q=0.9", + # IMPORTANT: only advertise br if brotli is installed + "Accept-Encoding": "gzip, deflate" + (", br" if have_brotli else ""), + "Connection": "keep-alive", + "Upgrade-Insecure-Requests": "1", + } + + try: + resp = requests.get(url, headers=headers, timeout=timeout) + resp.raise_for_status() + + # --- bail out early if it's not HTML + ctype = resp.headers.get("Content-Type", "").lower() + if not any(t in ctype for t in ("text/html", "application/xhtml+xml")): + logger.warning("Non-HTML content-type %r at %s", ctype, url) + return "" + + # --- get raw bytes then decompress if server didn't/requests couldn't + raw = resp.content # bytes + enc_hdr = resp.headers.get("Content-Encoding", "").lower() + + # If requests didn't decode (it normally does gzip/deflate), handle manually. 
+        if "br" in enc_hdr and have_brotli:
+            try:
+                raw = _brotli.decompress(raw)
+            except Exception:
+                # it may already be decoded; ignore
+                pass
+        elif "gzip" in enc_hdr:
+            try:
+                raw = gzip.decompress(raw)
+            except Exception:
+                pass
+        elif "deflate" in enc_hdr:
+            try:
+                raw = zlib.decompress(raw, -zlib.MAX_WBITS)
+            except Exception:
+                pass
+
+        # --- decode text with a robust charset guess
+        # use HTTP charset if present
+        charset = None
+        if "charset=" in ctype:
+            charset = ctype.split("charset=")[-1].split(";")[0].strip()
+        if not charset:
+            # requests’ detector
+            charset = resp.apparent_encoding or "utf-8"
+        try:
+            html_text = raw.decode(charset, errors="replace")
+        except Exception:
+            html_text = raw.decode("utf-8", errors="replace")
+
+        # anti-bot shells (avoid empty output surprises)
+        if re.search(r"(cf-chl|Just a moment|enable JavaScript)", html_text, re.I):
+            logger.warning("Possible anti-bot/challenge page at %s", url)
+
+        # --- extract readable text (readability -> html2text -> fallback)
+        md_readability = ""
+        if Document is not None:
+            try:
+                doc = Document(html_text)
+                title = (doc.short_title() or "").strip()
+                main_html = doc.summary(html_partial=True)
+                main_text = re.sub(r"<[^>]+>", " ", main_html, flags=re.S)
+                main_text = re.sub(r"\s+", " ", main_text).strip()
+                if title:
+                    md_readability = f"# {title}\n\n{main_text}".strip()
+                else:
+                    md_readability = main_text
+            except Exception:
+                logger.exception("readability failed on %s", url)
+
+        md_html2text = ""
+        if html2text is not None:
+            try:
+                h = html2text.HTML2Text()
+                h.body_width = 0
+                h.ignore_images = True
+                h.ignore_links = True
+                h.single_line_break = True
+                md_html2text = (h.handle(html_text) or "").strip()
+            except Exception:
+                logger.exception("html2text failed on %s", url)
+
+        def _clean(s):
+            s = re.sub(r"<[^>]+>", " ", s, flags=re.S)
+            return _html.unescape(re.sub(r"\s+", " ", s)).strip()
+
+        # fallback: meta/title/headers/paragraphs + noscript
+        parts = []
+        t = re.search(r"<title[^>]*>(.*?)</title>", html_text, re.I | re.S)
+        if t:
+            parts.append(f"# {_clean(t.group(1))}")
+
+        for pat in [
+            r'<meta[^>]+property=["\']og:title["\'][^>]+content=["\'](.*?)["\']',
+            r'<meta[^>]+property=["\']og:description["\'][^>]+content=["\'](.*?)["\']',
+            r'<meta[^>]+name=["\']description["\'][^>]+content=["\'](.*?)["\']',
+            r'<meta[^>]+name=["\']twitter:description["\'][^>]+content=["\'](.*?)["\']',
+        ]:
+            m = re.search(pat, html_text, re.I | re.S)
+            if m:
+                parts.append(_clean(m.group(1)))
+
+        parts += [f"## {_clean(h)}" for h in re.findall(r"<h[12][^>]*>(.*?)</h[12]>", html_text, re.I | re.S)[:4] if _clean(h)]
+        parts += [_clean(p) for p in re.findall(r"<p[^>]*>(.*?)</p>", html_text, re.I | re.S)[:8] if _clean(p)]
+        for n in re.findall(r"<noscript[^>]*>(.*?)</noscript>", html_text, re.I | re.S):
+            c = _clean(n)
+            if c:
+                parts.append(c)
+        md_fallback = "\n\n".join([p for p in parts if p]).strip()
+
+        best = max([md_readability, md_html2text, md_fallback], key=lambda s: len(s or ""))
+        if not best.strip():
+            logger.warning("Empty content extracted from %s", url)
+        return best
+
     except requests.exceptions.RequestException as e:
         logger.error(f"Error downloading {url}: {e}")
         return ""
-    except Exception as e:
-        logger.error(f"An unexpected error occurred: {e}")
+    except Exception:
+        logger.exception("Unexpected error while downloading %s", url)
         return ""
 
+
+def _extract_results_from_duckduckgo(response_text, num_pages):
+    # 1) Grab the title anchors (they carry the real clickable href)
+    #    We capture both the inner text (title) and href.
+    anchor_pattern = re.compile(
+        r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*href="([^"]+)"[^>]*>(.*?)</a>',
+        re.DOTALL | re.IGNORECASE
+    )
+    matches = anchor_pattern.findall(response_text)
+
+    results = []
+    for href, title_html in matches:
+        # 2) Resolve DuckDuckGo redirect: ?uddg=
+        parsed = urlparse(href)
+        target_url = href
+        if parsed.netloc.endswith("duckduckgo.com"):
+            qs = parse_qs(parsed.query)
+            if "uddg" in qs and qs["uddg"]:
+                target_url = unquote(qs["uddg"][0])
+
+        # 3) Clean title
+        title_text = re.sub(r'<[^>]+>', '', title_html).strip()
+        title_text = html.unescape(title_text)
+
+        # 4) Basic normalization: add scheme if missing
+        if target_url.startswith("//"):
+            target_url = "https:" + target_url
+        elif not re.match(r'^https?://', target_url, flags=re.I):
+            target_url = "https://" + target_url
+
+        results.append((target_url, title_text))
+
+        if len(results) >= num_pages:
+            break
+
+    return results
+
 def perform_web_search(query, num_pages=3, max_workers=5, timeout=10):
     """Perform web search and return results with content"""
     try:
@@ -59,49 +225,37 @@ def perform_web_search(query, num_pages=3, max_workers=5, timeout=10):
             "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36"
         ]
 
-        response_text = ""
         req = urllib.request.Request(search_url, headers={'User-Agent': random.choice(agents)})
         with urllib.request.urlopen(req, timeout=timeout) as response:
-            response_text = response.read().decode('utf-8')
+            response_text = response.read().decode('utf-8', errors='replace')
 
-        # Extract results with regex
-        titles = re.findall(r'<a[^>]*class="[^"]*result__a[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
-        urls = re.findall(r'<a[^>]*class="[^"]*result__url[^"]*"[^>]*>(.*?)</a>', response_text, re.DOTALL)
+        # Extract (url, title) pairs from the proper anchors
+        download_tasks = _extract_results_from_duckduckgo(response_text, num_pages)
 
-        # Prepare download tasks
-        download_tasks = []
-        for i in range(min(len(titles), len(urls), num_pages)):
-            url = f"https://{urls[i].strip()}"
-            title = re.sub(r'<[^>]+>', '', titles[i]).strip()
-            title = html.unescape(title)
-            download_tasks.append((url, title, i))
+        if not download_tasks:
+            return []
 
-        search_results = [None] * len(download_tasks)  # Pre-allocate to maintain order
+        search_results = [None] * len(download_tasks)
 
         # Download pages in parallel
         with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
-            # Submit all download tasks
-            future_to_task = {
-                executor.submit(download_web_page, task[0]): task
-                for task in download_tasks
+            future_to_index = {
+                executor.submit(download_web_page, url, timeout): (i, url, title)
+                for i, (url, title) in enumerate(download_tasks)
             }
 
-            # Collect results as
they complete - for future in as_completed(future_to_task): - url, title, index = future_to_task[future] + for future in as_completed(future_to_index): + i, url, title = future_to_index[future] try: content = future.result() - search_results[index] = { - 'title': title, - 'url': url, - 'content': content - } except Exception: - search_results[index] = { - 'title': title, - 'url': url, - 'content': '' - } + content = "" + + search_results[i] = { + "title": title, + "url": url, + "content": content or "" + } return search_results @@ -109,7 +263,6 @@ def perform_web_search(query, num_pages=3, max_workers=5, timeout=10): logger.error(f"Error performing web search: {e}") return [] - def truncate_content_by_tokens(content, max_tokens=8192): """Truncate content to fit within token limit using binary search""" if len(shared.tokenizer.encode(content)) <= max_tokens: @@ -128,12 +281,13 @@ def truncate_content_by_tokens(content, max_tokens=8192): def add_web_search_attachments(history, row_idx, user_message, search_query, state): """Perform web search and add results as attachments""" + logger.info(f"add_web_search_attachments") if not search_query: logger.warning("No search query provided") return try: - logger.info(f"Using search query: {search_query}") + logger.info(f"Add Web Search - Using search query: {search_query}") # Perform web search num_pages = int(state.get('web_search_pages', 3)) @@ -144,6 +298,7 @@ def add_web_search_attachments(history, row_idx, user_message, search_query, sta return # Filter out failed downloads before adding attachments + # logger.info(f"search_results {search_results}") successful_results = [result for result in search_results if result['content'].strip()] if not successful_results: From 87a3565164863112b23f8d43997b0248fe888893 Mon Sep 17 00:00:00 2001 From: bars0um Date: Fri, 10 Oct 2025 09:19:24 +0300 Subject: [PATCH 2/3] reduce noisy logs --- modules/web_search.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/modules/web_search.py b/modules/web_search.py index c1f70e3d5d..cc991d2b94 100755 --- a/modules/web_search.py +++ b/modules/web_search.py @@ -23,7 +23,7 @@ def download_web_page(url, timeout=10): Download a web page and convert its HTML content to Markdown text, handling Brotli/gzip and non-HTML content robustly. 
""" - logger.info(f"download_web_page {url}") + logger.info(f"Downloading {url}") # --- soft deps try: @@ -281,14 +281,12 @@ def truncate_content_by_tokens(content, max_tokens=8192): def add_web_search_attachments(history, row_idx, user_message, search_query, state): """Perform web search and add results as attachments""" - logger.info(f"add_web_search_attachments") + logger.debug(f"add_web_search_attachments") if not search_query: logger.warning("No search query provided") return try: - logger.info(f"Add Web Search - Using search query: {search_query}") - # Perform web search num_pages = int(state.get('web_search_pages', 3)) search_results = perform_web_search(search_query, num_pages) @@ -298,7 +296,6 @@ def add_web_search_attachments(history, row_idx, user_message, search_query, sta return # Filter out failed downloads before adding attachments - # logger.info(f"search_results {search_results}") successful_results = [result for result in search_results if result['content'].strip()] if not successful_results: From e1ccd32e222f11cf69942235c23f3b129255d811 Mon Sep 17 00:00:00 2001 From: bars0um Date: Fri, 10 Oct 2025 09:29:33 +0300 Subject: [PATCH 3/3] reduce noisy logs --- modules/web_search.py | 1 - 1 file changed, 1 deletion(-) diff --git a/modules/web_search.py b/modules/web_search.py index cc991d2b94..7ebe76df0b 100755 --- a/modules/web_search.py +++ b/modules/web_search.py @@ -281,7 +281,6 @@ def truncate_content_by_tokens(content, max_tokens=8192): def add_web_search_attachments(history, row_idx, user_message, search_query, state): """Perform web search and add results as attachments""" - logger.debug(f"add_web_search_attachments") if not search_query: logger.warning("No search query provided") return