Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 9 additions & 1 deletion src/tools/search/highlights.ts
Original file line number Diff line number Diff line change
Expand Up @@ -240,7 +240,15 @@ export function expandHighlights(
!result.highlights ||
result.highlights.length === 0
) {
return result; // No modification needed
if (result.content == null && result.references == null) {
return result;
}
/** Raw scraped content must never leave this function — without
* highlights to expand, strip it instead of passing it downstream */
const strippedResult = { ...result };
delete strippedResult.content;
delete strippedResult.references;
return strippedResult;
}

// Create a shallow copy with expanded highlights
Expand Down
44 changes: 41 additions & 3 deletions src/tools/search/search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,25 @@ const chunker = {
},
};

const DEFAULT_MAX_CONTENT_LENGTH = 50000;

/**
 * Determines the per-source cap (in characters) applied to scraped content.
 *
 * Precedence:
 *  1. `maxContentLength`, when it is provided and greater than zero;
 *  2. the `SEARCH_MAX_CONTENT_LENGTH` environment variable, when it parses to
 *     a finite number greater than zero;
 *  3. `DEFAULT_MAX_CONTENT_LENGTH` (50,000).
 *
 * @param maxContentLength - Optional cap supplied through configuration.
 * @returns The cap to use.
 */
function resolveMaxContentLength(maxContentLength?: number): number {
  // An explicit, positive configuration value always wins.
  if (maxContentLength != null && maxContentLength > 0) {
    return maxContentLength;
  }

  // `Number(undefined)` is NaN and `Number('')` is 0, so an unset or empty
  // variable fails the check below and falls through to the default.
  const fromEnv = Number(process.env.SEARCH_MAX_CONTENT_LENGTH);
  const envIsUsable = Number.isFinite(fromEnv) && fromEnv > 0;

  return envIsUsable ? fromEnv : DEFAULT_MAX_CONTENT_LENGTH;
}

/**
 * Caps `content` at `maxLength` UTF-16 code units.
 *
 * Content that already fits is returned unchanged. When the cut would fall
 * between the two halves of a surrogate pair (e.g. in the middle of an emoji),
 * the cut is moved back by one code unit so the result never ends in a lone
 * high surrogate; the result may therefore be one unit shorter than
 * `maxLength`, but never longer.
 *
 * @param content - Text to cap.
 * @param maxLength - Maximum number of UTF-16 code units to keep.
 * @returns `content` itself when within the cap, otherwise its capped prefix.
 */
function truncateContent(content: string, maxLength: number): string {
  if (!(content.length > maxLength)) {
    return content;
  }

  let end = maxLength;
  // `charCodeAt` yields NaN for out-of-range indices (e.g. when `end` is 0 or
  // negative), which fails both range checks and leaves `end` untouched.
  const lastKept = content.charCodeAt(end - 1);
  const firstDropped = content.charCodeAt(end);
  const splitsSurrogatePair =
    lastKept >= 0xd800 &&
    lastKept <= 0xdbff &&
    firstDropped >= 0xdc00 &&
    firstDropped <= 0xdfff;
  if (splitsSurrogatePair) {
    end -= 1;
  }

  return content.slice(0, end);
}

function createSourceUpdateCallback(sourceMap: Map<string, t.ValidSource>) {
return (link: string, update?: Partial<t.ValidSource>): void => {
const source = sourceMap.get(link);
Expand All @@ -83,12 +102,14 @@ const getHighlights = async ({
content,
reranker,
topResults = 5,
maxContentLength = DEFAULT_MAX_CONTENT_LENGTH,
logger,
}: {
content: string;
query: string;
reranker?: BaseReranker;
topResults?: number;
maxContentLength?: number;
logger?: t.Logger;
}): Promise<t.Highlight[] | undefined> => {
const logger_ = logger || createDefaultLogger();
Expand All @@ -103,7 +124,9 @@ const getHighlights = async ({
}

try {
const documents = await chunker.splitText(content);
const documents = await chunker.splitText(
truncateContent(content, maxContentLength)
);
if (Array.isArray(documents)) {
return await reranker.rerank(query, documents, topResults);
} else {
Expand Down Expand Up @@ -457,6 +480,7 @@ export const createSourceProcessor = (
logger,
} = config;

const maxContentLength = resolveMaxContentLength(config.maxContentLength);
const logger_ = logger || createDefaultLogger();
const scraper = scraperInstance;

Expand All @@ -475,7 +499,7 @@ export const createSourceProcessor = (
url,
references,
attribution,
content: chunker.cleanText(content),
content: truncateContent(chunker.cleanText(content), maxContentLength),
};
}

Expand All @@ -498,6 +522,7 @@ export const createSourceProcessor = (
query,
reranker,
content: result.content,
maxContentLength,
logger: logger_,
});
if (onGetHighlights) {
Expand Down Expand Up @@ -606,7 +631,20 @@ export const createSourceProcessor = (
images: [],
relatedSearches: [],
};
} else if (!result.data.organic) {
}

if (
result.data.topStories != null &&
result.data.topStories.length > numElements
) {
/** Merged news results can far exceed the requested source count;
* every entry is formatted into the LLM output, so cap them up
* front — before any early return below and before scraping
* entries the cap would discard */
result.data.topStories = result.data.topStories.slice(0, numElements);
}

if (!result.data.organic) {
return result.data;
}

Expand Down
Loading
Loading