Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion src/tools/search/highlights.ts
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,15 @@ export function expandHighlights(
!result.highlights ||
result.highlights.length === 0
) {
return result; // No modification needed
if (result.content == null && result.references == null) {
return result;
}
/** Raw scraped content must never leave this function — without
* highlights to expand, strip it instead of passing it downstream */
const strippedResult = { ...result };
delete strippedResult.content;
delete strippedResult.references;
return strippedResult;
}

// Create a shallow copy with expanded highlights
Expand Down
37 changes: 35 additions & 2 deletions src/tools/search/search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,25 @@ const chunker = {
},
};

const DEFAULT_MAX_CONTENT_LENGTH = 50000;

/**
 * Resolves the per-source scraped content cap, in characters.
 *
 * Candidates are tried in order: the explicit `maxContentLength` argument,
 * the `SEARCH_MAX_CONTENT_LENGTH` env var, then the default (50,000 chars).
 *
 * A candidate is only accepted when it is a number of at least 1, and it is
 * rounded down to a whole number of characters. Fractional values below 1
 * are rejected rather than returned: used as a `slice` end index they
 * truncate to 0 and would silently wipe all scraped content.
 *
 * @param maxContentLength - Optional cap supplied by the caller's config.
 * @returns A whole-number cap >= 1 (or `Infinity` if passed explicitly).
 */
function resolveMaxContentLength(maxContentLength?: number): number {
  // `NaN >= 1` is false, so NaN falls through to the next candidate.
  if (maxContentLength != null && maxContentLength >= 1) {
    return Math.floor(maxContentLength);
  }
  // Number(undefined) is NaN and Number('') is 0; both are rejected below.
  const envValue = Number(process.env.SEARCH_MAX_CONTENT_LENGTH);
  if (Number.isFinite(envValue) && envValue >= 1) {
    return Math.floor(envValue);
  }
  return DEFAULT_MAX_CONTENT_LENGTH;
}

/**
 * Caps `content` at `maxLength` UTF-16 code units.
 *
 * Content that is not longer than `maxLength` is returned unchanged.
 * Otherwise the string is cut at `maxLength`, except when that cut would
 * fall between the two halves of a surrogate pair (e.g. an emoji). In that
 * case the cut moves back one code unit so the result never ends in a lone
 * high surrogate, which is not valid text once encoded.
 *
 * @param content - Text to cap.
 * @param maxLength - Maximum number of UTF-16 code units to keep.
 * @returns `content`, or a prefix of it no longer than `maxLength`.
 */
function truncateContent(content: string, maxLength: number): string {
  if (content.length > maxLength) {
    // `slice` truncates fractional indices; mirror that for the lookups below.
    const end = Math.trunc(maxLength);
    const lastKept = content.charCodeAt(end - 1);
    const firstDropped = content.charCodeAt(end);
    // Out-of-range lookups yield NaN, which fails every comparison here.
    const splitsSurrogatePair =
      lastKept >= 0xd800 &&
      lastKept <= 0xdbff &&
      firstDropped >= 0xdc00 &&
      firstDropped <= 0xdfff;
    return content.slice(0, splitsSurrogatePair ? end - 1 : end);
  }
  return content;
}

function createSourceUpdateCallback(sourceMap: Map<string, t.ValidSource>) {
return (link: string, update?: Partial<t.ValidSource>): void => {
const source = sourceMap.get(link);
Expand All @@ -83,12 +102,14 @@ const getHighlights = async ({
content,
reranker,
topResults = 5,
maxContentLength = DEFAULT_MAX_CONTENT_LENGTH,
logger,
}: {
content: string;
query: string;
reranker?: BaseReranker;
topResults?: number;
maxContentLength?: number;
logger?: t.Logger;
}): Promise<t.Highlight[] | undefined> => {
const logger_ = logger || createDefaultLogger();
Expand All @@ -103,7 +124,9 @@ const getHighlights = async ({
}

try {
const documents = await chunker.splitText(content);
const documents = await chunker.splitText(
truncateContent(content, maxContentLength)
);
if (Array.isArray(documents)) {
return await reranker.rerank(query, documents, topResults);
} else {
Expand Down Expand Up @@ -457,6 +480,7 @@ export const createSourceProcessor = (
logger,
} = config;

const maxContentLength = resolveMaxContentLength(config.maxContentLength);
const logger_ = logger || createDefaultLogger();
const scraper = scraperInstance;

Expand All @@ -475,7 +499,7 @@ export const createSourceProcessor = (
url,
references,
attribution,
content: chunker.cleanText(content),
content: truncateContent(chunker.cleanText(content), maxContentLength),
};
}

Expand All @@ -498,6 +522,7 @@ export const createSourceProcessor = (
query,
reranker,
content: result.content,
maxContentLength,
logger: logger_,
});
if (onGetHighlights) {
Expand Down Expand Up @@ -705,6 +730,14 @@ export const createSourceProcessor = (
updateSourcesWithContent(topStories, sourceMap);
}

if (topStories.length > numElements) {

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

P2 Badge Move the topStories cap before early returns

When a general search returns no organic links but does return more than numElements top stories while news is false, this new cap is never reached because processSources returns earlier when organicLinks.length === 0 && ... !news. formatResultsForLLM still formats every topStories entry, so this leaves the oversized news payload uncapped in exactly the no-organic fallback case; apply the cap before that early return or cap in the returned branch as well.

Useful? React with 👍 / 👎.

Copy link
Copy Markdown
Owner Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Addressed in 1b496c0 — the cap now runs at the top of processSources, before the no-organic and empty-links early returns (and before scraping, so entries the cap would discard are no longer fetched). Added regression tests for both early-return paths.

/** Merged news results can far exceed the requested source count;
* every entry here is formatted into the LLM output, so cap them
* like organic results (sliced after enrichment to keep the
* scraped/reranked entries, which come first) */
result.data.topStories = topStories.slice(0, numElements);
}

return result.data;
} catch (error) {
logger_.error('Error in processSources:', error);
Expand Down
Loading
Loading