From 219b8521f93e0ed6602091a5bd55b0cf55fc65d6 Mon Sep 17 00:00:00 2001
From: AmintaCCCP <libogege@sina.com>
Date: Sat, 27 Jun 2026 22:09:26 +0800
Subject: [PATCH 1/2] feat: optimize vector search with HyDE, semantic
 reranking, and structured embeddings
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Replace fake 'AI reranking' (keyword substring matching) with true LLM-based semantic reranking following the gist reranking pattern
- Add HyDE (Hypothetical Document Embedding) query preprocessing for better recall on short/Chinese/ambiguous queries, with 5s timeout and AbortSignal cleanup
- Restructure buildEmbeddingText with field labels (Repository:, Description:, Topics:, etc.) and dedup logic
- Add lightweight keyword boost for vector search results (name/description/tag exact match bonus)
- Make search threshold, topK, HyDE toggle, and reranking toggle user-configurable in VectorSearchSettings UI
- Add EMBEDDING_FORMAT_VERSION tracking to force re-index when embedding text format changes
- Use non-greedy matching when extracting the JSON ID array in both gist and semantic reranking ([\s\S]* → [\s\S]*?)
- Fix variable shadowing of translation function t in keyword boost lambdas
- Bump default threshold from 0.3 to 0.35

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 cloudflare-worker/src/index.ts                |   2 +-
 src/components/SearchBar.tsx                  |  68 +++++++--
 .../settings/VectorSearchSettings.tsx         | 138 +++++++++++++++++-
 src/services/aiService.ts                     |  91 +++++++++++-
 src/services/vectorSearchService.ts           |  58 ++++++--
 src/store/useAppStore.ts                      |   7 +-
 src/types/index.ts                            |   7 +
 7 files changed, 336 insertions(+), 35 deletions(-)
diff --git a/cloudflare-worker/src/index.ts b/cloudflare-worker/src/index.ts
index 67aa8483..043edf9e 100644
--- a/cloudflare-worker/src/index.ts
+++ b/cloudflare-worker/src/index.ts
@@ -68,7 +68,7 @@ export default {
 
       // POST /query — 向量相似度查询
       if (request.method === 'POST' && url.pathname === '/query') {
-        const { vector, topK = 20, threshold = 0.3 } = (await request.json()) as QueryRequest;
+        const { vector, topK = 20, threshold = 0.35 } = (await request.json()) as QueryRequest;
         if (!Array.isArray(vector) || vector.length === 0) {
           return jsonResponse({ success: false, error: 'vector array required' }, 400);
         }
diff --git a/src/components/SearchBar.tsx b/src/components/SearchBar.tsx
index dc0b6c8a..a0636222 100644
--- a/src/components/SearchBar.tsx
+++ b/src/components/SearchBar.tsx
@@ -539,17 +539,63 @@ export const SearchBar: React.FC = () => {
           const embeddingClient = new EmbeddingClient(activeEmbConfig);
           const vectorService = new VectorSearchService(vsConfig);
 
-          // 1. 前端调用 Embedding API 生成查询向量
+          // 1. HyDE 查询预处理：用 LLM 生成理想仓库描述再嵌入（可选，5 秒超时降级）
+          let embeddingQuery = searchQuery;
+          const hydeConfig = aiConfigs.find(config => config.id === activeAIConfig);
+          if (vsConfig.enableHyDE !== false && hydeConfig) {
+            const hydeAbort = new AbortController();
+            let hydeTimer: ReturnType<typeof setTimeout> | null = null;
+            try {
+              setSearchPhase(t('AI 分析查询...', 'AI analyzing query...'));
+              const { AIService } = await import('../services/aiService');
+              const hydeService = new AIService(hydeConfig, language);
+              embeddingQuery = await Promise.race([
+                hydeService.generateHyDEQuery(searchQuery, hydeAbort.signal),
+                new Promise<string>((resolve) => {
+                  hydeTimer = setTimeout(() => {
+                    hydeAbort.abort();
+                    resolve(searchQuery);
+                  }, 5000);
+                }),
+              ]);
+              if (embeddingQuery !== searchQuery) {
+                console.log('🔮 HyDE generated:', embeddingQuery.slice(0, 100));
+              }
+            } catch (hydeError) {
+              console.warn('HyDE failed, using raw query:', hydeError);
+              embeddingQuery = searchQuery;
+            } finally {
+              if (hydeTimer) clearTimeout(hydeTimer);
+            }
+          }
+
+          // 2. 前端调用 Embedding API 生成查询向量
           setSearchPhase(t('生成查询向量...', 'Generating query vector...'));
-          const queryVectors = await embeddingClient.embed([searchQuery], 'query');
+          const queryVectors = await embeddingClient.embed([embeddingQuery], 'query');
           if (queryVectors && queryVectors.length > 0) {
             // 2. 前端将查询向量发送到 Worker
             setSearchPhase(t('检索向量库...', 'Searching vector index...'));
-            const vectorResults = await vectorService.query(queryVectors[0], { topK: 30, threshold: 0.3 });
+            const vectorResults = await vectorService.query(queryVectors[0], {
+              topK: vsConfig.searchTopK ?? 30,
+              threshold: vsConfig.searchThreshold ?? 0.35,
+            });
 
             if (vectorResults.length > 0) {
-              // 3. 从本地仓库数据中取出匹配结果，按相似度排序
-              const scoreMap = new Map(vectorResults.map(r => [r.id, r.score]));
+              // 3. 轻量关键词加分：精确匹配的字段给予分数微调
+              const queryLower = searchQuery.toLowerCase();
+              const boostedResults = vectorResults.map(r => {
+                let bonus = 0;
+                const name = (r.metadata.full_name || '').toLowerCase();
+                const desc = (r.metadata.description || '').toLowerCase();
+                const tags = (r.metadata.tags || []).map(tag => tag.toLowerCase());
+                if (name.includes(queryLower)) bonus += 0.05;
+                if (desc.includes(queryLower)) bonus += 0.03;
+                if (tags.some(tag => tag.includes(queryLower))) bonus += 0.02;
+                return { ...r, score: r.score + bonus };
+              });
+
+              // 4. 从本地仓库数据中取出匹配结果，按相似度排序
+              const scoreMap = new Map(boostedResults.map(r => [r.id, r.score]));
               const scoredRepos = filtered
                 .filter(repo => scoreMap.has(String(repo.id)))
                 .map(repo => ({
@@ -560,20 +606,20 @@ export const SearchBar: React.FC = () => {
                 .map(item => item.repo);
 
               if (scoredRepos.length > 0) {
-                // 4. AI 校验：用 LLM 对向量搜索结果进行二次排序
+                // 5. AI 语义重排序：用 LLM 对向量搜索结果做真正的语义排序
                 let reranked = scoredRepos;
                 let rerankSucceeded = false;
                 const rerankConfig = aiConfigs.find(config => config.id === activeAIConfig);
-                if (rerankConfig) {
+                if (rerankConfig && vsConfig.enableReranking !== false) {
                   try {
-                    setSearchPhase(t('AI 校验排序...', 'AI reranking...'));
+                    setSearchPhase(t('AI 语义重排序...', 'AI semantic reranking...'));
                     const { AIService } = await import('../services/aiService');
                     const rerankService = new AIService(rerankConfig, language);
-                    reranked = await rerankService.searchRepositoriesWithReranking(scoredRepos, searchQuery);
+                    reranked = await rerankService.searchRepositoriesWithSemanticReranking(scoredRepos, searchQuery);
                     rerankSucceeded = true;
-                    console.log('🤖 AI reranked results:', reranked.length);
+                    console.log('🤖 AI semantically reranked results:', reranked.length);
                   } catch (rerankError) {
-                    console.warn('AI reranking failed, using vector order:', rerankError);
+                    console.warn('AI semantic reranking failed, using vector order:', rerankError);
                   }
                 }
 
diff --git a/src/components/settings/VectorSearchSettings.tsx b/src/components/settings/VectorSearchSettings.tsx
index 69060678..a278abd2 100644
--- a/src/components/settings/VectorSearchSettings.tsx
+++ b/src/components/settings/VectorSearchSettings.tsx
@@ -17,6 +17,7 @@ import {
   EmbeddingClient,
   VectorSearchService,
   indexAllRepos,
+  EMBEDDING_FORMAT_VERSION,
 } from '../../services/vectorSearchService';
 import { GitHubApiService } from '../../services/githubApi';
 import type { EmbeddingApiType, EmbeddingConfig } from '../../types';
@@ -79,6 +80,12 @@ export const VectorSearchSettings: React.FC<VectorSearchSettingsProps> = ({ t })
   const [formIndexMode, setFormIndexMode] = useState<'description' | 'readme'>(vectorSearchConfig.indexMode || 'readme');
   const [formReadmeMaxChars, setFormReadmeMaxChars] = useState(vectorSearchConfig.readmeMaxChars || 6000);
 
+  // Search parameters state
+  const [formSearchThreshold, setFormSearchThreshold] = useState(vectorSearchConfig.searchThreshold ?? 0.35);
+  const [formSearchTopK, setFormSearchTopK] = useState(vectorSearchConfig.searchTopK ?? 30);
+  const [formEnableHyDE, setFormEnableHyDE] = useState(vectorSearchConfig.enableHyDE ?? true);
+  const [formEnableReranking, setFormEnableReranking] = useState(vectorSearchConfig.enableReranking ?? true);
+
   // Test state
   const [testingEmbedding, setTestingEmbedding] = useState(false);
   const [embeddingTestResult, setEmbeddingTestResult] = useState<{ success: boolean; dimensions: number; error?: string } | null>(null);
@@ -145,10 +152,15 @@ export const VectorSearchSettings: React.FC<VectorSearchSettingsProps> = ({ t })
       embeddingConfigId: activeEmbeddingConfig || '',
       indexMode: formIndexMode,
       readmeMaxChars: formReadmeMaxChars,
+      searchThreshold: formSearchThreshold,
+      searchTopK: formSearchTopK,
+      enableHyDE: formEnableHyDE,
+      enableReranking: formEnableReranking,
+      embeddingFormatVersion: EMBEDDING_FORMAT_VERSION,
     });
     setWorkerSaved(true);
     setTimeout(() => setWorkerSaved(false), 2000);
-  }, [formWorkerUrl, formAuthToken, formIndexMode, formReadmeMaxChars, activeEmbeddingConfig, setVectorSearchConfig]);
+  }, [formWorkerUrl, formAuthToken, formIndexMode, formReadmeMaxChars, formSearchThreshold, formSearchTopK, formEnableHyDE, formEnableReranking, activeEmbeddingConfig, setVectorSearchConfig]);
 
   const handleTestEmbedding = useCallback(async () => {
     setTestingEmbedding(true);
@@ -319,7 +331,7 @@ export const VectorSearchSettings: React.FC<VectorSearchSettingsProps> = ({ t })
     } finally {
       setAbortController(null);
     }
-  }, [createClients, formIndexMode, formReadmeMaxChars, formDimensions, updateRepositoriesMetadata, setVectorSearchStatus, setVectorIndexingState]);
+  }, [createClients, formIndexMode, formReadmeMaxChars, formDimensions, updateRepositoriesMetadata, setVectorSearchStatus, setVectorIndexingState, vectorSearchConfig.embeddingFormatVersion]);
 
   const handleIncrementalIndex = useCallback(async () => {
     const clients = createClients();
@@ -351,6 +363,8 @@ export const VectorSearchSettings: React.FC<VectorSearchSettingsProps> = ({ t })
         indexMode: formIndexMode,
         readmeMaxChars: formReadmeMaxChars,
         incremental: true,
+        formatVersion: vectorSearchConfig.embeddingFormatVersion,
+        currentFormatVersion: EMBEDDING_FORMAT_VERSION,
         onRepoIndexed: (repoId) => {
           stampedRepoIds.push(repoId);
           if (stampedRepoIds.length % 32 === 0) {
@@ -406,7 +420,7 @@ export const VectorSearchSettings: React.FC<VectorSearchSettingsProps> = ({ t })
     } finally {
       setAbortController(null);
     }
-  }, [createClients, formIndexMode, formReadmeMaxChars, formDimensions, updateRepositoriesMetadata, setVectorSearchStatus, setVectorIndexingState]);
+  }, [createClients, formIndexMode, formReadmeMaxChars, formDimensions, updateRepositoriesMetadata, setVectorSearchStatus, setVectorIndexingState, vectorSearchConfig.embeddingFormatVersion]);
 
   const handleAbortIndexing = useCallback(() => {
     abortController?.abort();
@@ -924,10 +938,120 @@ export const VectorSearchSettings: React.FC<VectorSearchSettingsProps> = ({ t })
         )}
       </div>
 
-      {/* Section 5: Delete Index */}
-      <div className="border border-gray-200 dark:border-gray-700 rounded-lg p-4 space-y-3">
+      {/* Section 5: Search Parameters */}
+      <div className="border border-gray-200 dark:border-gray-700 rounded-lg p-4 space-y-4">
         <h3 className="font-medium text-gray-900 dark:text-gray-100 flex items-center gap-2">
           <span className="text-xs bg-gray-200 dark:bg-gray-700 px-2 py-0.5 rounded">⑤</span>
+          {t('搜索参数', 'Search Parameters')}
+        </h3>
+
+        {/* Similarity Threshold */}
+        <div className="space-y-1">
+          <label className="block text-sm font-medium text-gray-700 dark:text-gray-300">
+            {t('相似度阈值', 'Similarity Threshold')}
+          </label>
+          <div className="flex items-center gap-3">
+            <input
+              type="range"
+              min={0.1}
+              max={0.8}
+              step={0.05}
+              value={formSearchThreshold}
+              onChange={(e) => setFormSearchThreshold(parseFloat(e.target.value))}
+              className="flex-1"
+            />
+            <span className="text-sm font-mono text-gray-600 dark:text-gray-400 w-12 text-right">
+              {formSearchThreshold.toFixed(2)}
+            </span>
+          </div>
+          <p className="text-xs text-gray-500 dark:text-gray-400">
+            {t('越高越严格，结果越少但更精确；越低越宽松，召回更多但可能有噪音', 'Higher = stricter, fewer but more precise results; Lower = more recall but may include noise')}
+          </p>
+        </div>
+
+        {/* Top K */}
+        <div className="space-y-1">
+          <label className="block text-sm font-medium text-gray-700 dark:text-gray-300">
+            {t('返回结果数 (Top K)', 'Results Count (Top K)')}
+          </label>
+          <input
+            type="number"
+            value={formSearchTopK}
+            onChange={(e) => setFormSearchTopK(Math.max(5, Math.min(50, parseInt(e.target.value) || 30)))}
+            min={5}
+            max={50}
+            className="w-full px-3 py-2 text-sm border border-gray-300 dark:border-gray-600 rounded-md bg-white dark:bg-gray-800 text-gray-900 dark:text-gray-100 focus:ring-2 focus:ring-brand-indigo focus:border-transparent"
+          />
+          <p className="text-xs text-gray-500 dark:text-gray-400">
+            {t('向量检索返回的最大结果数，越多召回越广但 LLM 重排序成本越高', 'Max results from vector search. More = wider recall but higher LLM reranking cost')}
+          </p>
+        </div>
+
+        {/* HyDE Toggle */}
+        <div className="flex items-center justify-between">
+          <div>
+            <div className="text-sm font-medium text-gray-700 dark:text-gray-300">
+              {t('HyDE 查询预处理', 'HyDE Query Preprocessing')}
+            </div>
+            <p className="text-xs text-gray-500 dark:text-gray-400">
+              {t('让 AI 生成理想仓库描述再搜索，提升短查询和中文查询的召回率', 'AI generates ideal repo description before searching, improves recall for short/Chinese queries')}
+            </p>
+          </div>
+          <button
+            onClick={() => setFormEnableHyDE(!formEnableHyDE)}
+            className={`relative inline-flex h-6 w-11 items-center rounded-full transition-colors ${
+              formEnableHyDE ? 'bg-brand-indigo' : 'bg-gray-300 dark:bg-gray-600'
+            }`}
+          >
+            <span
+              className={`inline-block h-4 w-4 transform rounded-full bg-white transition-transform ${
+                formEnableHyDE ? 'translate-x-6' : 'translate-x-1'
+              }`}
+            />
+          </button>
+        </div>
+
+        {/* Reranking Toggle */}
+        <div className="flex items-center justify-between">
+          <div>
+            <div className="text-sm font-medium text-gray-700 dark:text-gray-300">
+              {t('LLM 语义重排序', 'LLM Semantic Reranking')}
+            </div>
+            <p className="text-xs text-gray-500 dark:text-gray-400">
+              {t('用 LLM 对向量搜索结果做语义排序，显著提升排序质量', 'LLM reranks vector results by semantic relevance, significantly improves ranking quality')}
+            </p>
+          </div>
+          <button
+            onClick={() => setFormEnableReranking(!formEnableReranking)}
+            className={`relative inline-flex h-6 w-11 items-center rounded-full transition-colors ${
+              formEnableReranking ? 'bg-brand-indigo' : 'bg-gray-300 dark:bg-gray-600'
+            }`}
+          >
+            <span
+              className={`inline-block h-4 w-4 transform rounded-full bg-white transition-transform ${
+                formEnableReranking ? 'translate-x-6' : 'translate-x-1'
+              }`}
+            />
+          </button>
+        </div>
+
+        {/* Save */}
+        <button
+          onClick={handleSaveWorkerConfig}
+          className={`px-4 py-2 text-sm rounded-lg transition-colors ${
+            workerSaved
+              ? 'bg-green-500 text-white'
+              : 'bg-gray-200 dark:bg-gray-700 text-gray-700 dark:text-gray-300 hover:bg-gray-300 dark:hover:bg-gray-600'
+          }`}
+        >
+          {workerSaved ? `✓ ${t('已保存', 'Saved')}` : t('保存搜索参数', 'Save Search Parameters')}
+        </button>
+      </div>
+
+      {/* Section 6: Delete Index */}
+      <div className="border border-gray-200 dark:border-gray-700 rounded-lg p-4 space-y-3">
+        <h3 className="font-medium text-gray-900 dark:text-gray-100 flex items-center gap-2">
+          <span className="text-xs bg-gray-200 dark:bg-gray-700 px-2 py-0.5 rounded">⑥</span>
           {t('删除索引', 'Delete Index')}
         </h3>
         <p className="text-sm text-gray-500 dark:text-gray-400">
@@ -961,14 +1085,14 @@ export const VectorSearchSettings: React.FC<VectorSearchSettingsProps> = ({ t })
         </p>
       </div>
 
-      {/* Section 6: Deploy Guide */}
+      {/* Section 7: Deploy Guide */}
       <div className="border border-gray-200 dark:border-gray-700 rounded-lg overflow-hidden">
         <button
           onClick={() => setShowDeployGuide(!showDeployGuide)}
           className="w-full flex items-center justify-between p-4 hover:bg-gray-50 dark:hover:bg-gray-800/50 transition-colors"
         >
           <h3 className="font-medium text-gray-900 dark:text-gray-100 flex items-center gap-2">
-            <span className="text-xs bg-gray-200 dark:bg-gray-700 px-2 py-0.5 rounded">⑥</span>
+            <span className="text-xs bg-gray-200 dark:bg-gray-700 px-2 py-0.5 rounded">⑦</span>
             {t('部署指南', 'Deploy Guide')}
           </h3>
           {showDeployGuide ? <ChevronDown className="w-4 h-4 text-gray-500" /> : <ChevronRight className="w-4 h-4 text-gray-500" />}
diff --git a/src/services/aiService.ts b/src/services/aiService.ts
index 0d7cb8ed..f1e4406f 100644
--- a/src/services/aiService.ts
+++ b/src/services/aiService.ts
@@ -624,7 +624,7 @@ AI Summary: ${gist.ai_summary || 'None'}`;
     });
 
     try {
-      const jsonMatch = content.match(/\[[\s\S]*\]/);
+      const jsonMatch = content.match(/\[[\s\S]*?\]/);
       const ids = JSON.parse(jsonMatch ? jsonMatch[0] : content);
       if (!Array.isArray(ids)) return gists;
       const gistById = new Map(gists.map(gist => [gist.id, gist]));
@@ -646,6 +646,70 @@ AI Summary: ${gist.ai_summary || 'None'}`;
     }
   }
 
+  /**
+   * 对仓库列表做真正的语义重排序（参照 gist 重排序模式）
+   * 将候选仓库摘要发送给 LLM，让其按相关性排序返回 ID 列表
+   * @param repositories 候选仓库列表（通常是向量搜索的 top-K 结果）
+   * @param query 用户搜索查询
+   * @returns 按语义相关性排序的仓库列表
+   */
+  async searchRepositoriesWithSemanticReranking(repositories: Repository[], query: string): Promise<Repository[]> {
+    if (repositories.length === 0) return [];
+
+    // 限制候选数量，控制 token 消耗
+    // 注意：searchTopK 配置与此上限独立；若 searchTopK > 50，超出部分不会被重排序
+    const candidates = repositories.slice(0, 50);
+
+    const repoSummaries = candidates.map((repo, index) => {
+      const stars = repo.stargazers_count >= 1000
+        ? `${(repo.stargazers_count / 1000).toFixed(0)}k`
+        : String(repo.stargazers_count || 0);
+      const tags = (repo.ai_tags || []).slice(0, 5).join(', ');
+      const desc = (repo.ai_summary || repo.description || '').slice(0, 150);
+      const parts = [`${index + 1}. ID: ${repo.id} | ${repo.full_name}`];
+      if (desc) parts.push(`   ${desc}`);
+      const meta = [repo.language, `★${stars}`];
+      if (tags) meta.push(`Tags: ${tags}`);
+      parts.push(`   ${meta.join(' | ')}`);
+      return parts.join('\n');
+    }).join('\n\n');
+
+    const system = this.language === 'zh'
+      ? '你是 GitHub 仓库搜索排序助手。根据用户查询，从候选仓库中选出最相关的，按相关性从高到低返回 ID 数组 JSON。只输出 JSON 数组，不要输出额外文字。'
+      : 'You are a GitHub repository search reranking assistant. Given a user query and candidate repositories, return a JSON array of repository IDs ordered from most to least relevant. Output only the JSON array, no extra text.';
+
+    const content = await this.requestText({
+      system,
+      user: `Query: ${query}\n\nRepositories:\n${this.sanitizeForPrompt(repoSummaries)}`,
+      temperature: 0.1,
+      maxTokens: 800,
+    });
+
+    try {
+      const jsonMatch = content.match(/\[[\s\S]*?\]/);
+      const ids = JSON.parse(jsonMatch ? jsonMatch[0] : content);
+      if (!Array.isArray(ids)) return repositories;
+
+      const repoById = new Map(repositories.map(r => [String(r.id), r]));
+      const seen = new Set<string>();
+      const ranked = ids
+        .map((id: unknown) => String(id))
+        .filter(id => {
+          if (seen.has(id)) return false;
+          seen.add(id);
+          return true;
+        })
+        .map(id => repoById.get(id))
+        .filter((r): r is Repository => !!r);
+      const rankedIds = new Set(ranked.map(r => r.id));
+      // 未被 LLM 排到的仓库追加到末尾（保留原始顺序）
+      return [...ranked, ...repositories.filter(r => !rankedIds.has(r.id))];
+    } catch (error) {
+      logger.warn('ai', 'Failed to parse semantic reranking result', error);
+      return repositories;
+    }
+  }
+
   private createAnalysisRetryPrompt(originalPrompt: string, previousContent: string, invalidReason: string): string {
     const previousOutput = this.sanitizeForPrompt(previousContent).slice(0, 1200);
 
@@ -1091,6 +1155,31 @@ ${repoInfo}
     return this.performBasicSearch(repositories, query);
   }
 
+  /**
+   * HyDE (Hypothetical Document Embedding) query preprocessing.
+   * Asks the LLM for a 2-3 sentence description of the ideal repository for
+   * `query`; the caller embeds that description instead of the raw query.
+   * @param query the user's original search query
+   * @param signal optional AbortSignal passed through to the LLM request
+   * @returns the cleaned description, or `query` if the LLM output is empty
+   */
+  async generateHyDEQuery(query: string, signal?: AbortSignal): Promise<string> {
+    const system = this.language === 'zh'
+      ? '你是一个搜索助手。根据用户的搜索意图，生成一段 2-3 句话的理想 GitHub 仓库描述，包含相关技术术语、编程语言和使用场景。只输出描述文本，不要输出其他内容。'
+      : 'You are a search assistant. Given a user search query, generate a 2-3 sentence description of the ideal GitHub repository that would perfectly match this query. Include relevant technical terms, programming languages, and use cases. Output only the description, no extra text.';
+
+    const content = await this.requestText({
+      system,
+      user: `Search query: "${query}"`,
+      temperature: 0.3,
+      maxTokens: 200,
+      signal,
+    });
+
+    // Trim BEFORE stripping quotes: `$` only matches at the very end, so a quote followed by a trailing newline would otherwise survive.
+    return content.trim().replace(/^["']|["']$/g, '').trim() || query;
+  }
+
   /**
    * Search repositories using AI semantic search with fallback to enhanced basic search.
    * Attempts to call the configured AI service to parse search intent and extract
diff --git a/src/services/vectorSearchService.ts b/src/services/vectorSearchService.ts
index cb31f5cf..deb69a02 100644
--- a/src/services/vectorSearchService.ts
+++ b/src/services/vectorSearchService.ts
@@ -257,7 +257,7 @@ export class VectorSearchService {
     options: { topK?: number; threshold?: number } = {},
     signal?: AbortSignal,
   ): Promise<VectorQueryResult[]> {
-    const { topK = 20, threshold = 0.3 } = options;
+    const { topK = 20, threshold = 0.35 } = options;
     const result = await this.request<{ matches: VectorQueryResult[] }>('/query', {
       method: 'POST',
       body: JSON.stringify({ vector, topK, threshold }),
@@ -318,6 +318,13 @@ export class VectorSearchService {
 // 工具函数
 // ============================================================
 
+/**
+ * 嵌入文本格式版本。
+ * buildEmbeddingText 的输出格式变化时必须递增，
+ * 使增量索引能检测到格式变化并强制重新索引所有向量。
+ */
+export const EMBEDDING_FORMAT_VERSION = 2;
+
 /**
  * 拼接仓库文本用于 embedding
  * @param repo 仓库数据
@@ -325,18 +332,33 @@ export class VectorSearchService {
  * @param maxChars README 最大字符数，默认 6000
  */
 export function buildEmbeddingText(repo: Repository, readmeContent?: string, maxChars = 6000): string {
-  const parts = [
-    repo.full_name,
-    repo.description || '',
-    repo.custom_description || '',
-    repo.ai_summary || '',
-    (repo.topics || []).join(', '),
-    (repo.ai_tags || []).join(', '),
-    (repo.custom_tags || []).join(', '),
-    repo.language || '',
-  ];
+  const parts: string[] = [];
+
+  // 结构化字段标签，帮助 embedding 模型理解字段角色和权重
+  if (repo.full_name) parts.push(`Repository: ${repo.full_name}`);
+
+  // 去重：description 和 ai_summary 内容重叠时，跳过较短的 description
+  const description = repo.description || '';
+  const aiSummary = repo.ai_summary || '';
+  const customDesc = repo.custom_description || '';
+
+  if (description && !aiSummary.includes(description)) {
+    parts.push(`Description: ${description}`);
+  }
+  if (customDesc) parts.push(`About: ${customDesc}`);
+  if (aiSummary) parts.push(`Summary: ${aiSummary}`);
+
+  // 合并 topics 和 tags，去重
+  const allTopics = [...new Set([
+    ...(repo.topics || []),
+    ...(repo.ai_tags || []),
+    ...(repo.custom_tags || []),
+  ])];
+  if (allTopics.length > 0) parts.push(`Topics: ${allTopics.join(', ')}`);
+
+  if (repo.language) parts.push(`Language: ${repo.language}`);
+
   // README 内容提供最丰富的语义信息
-  // 跳过常见的装饰性徽章/图片头部
   if (readmeContent) {
     const cleaned = readmeContent
       .replace(/\[!\[.*?\]\(.*?\)\]\(.*?\)/g, '') // 移除链接徽章 [![...](...)](...) — 必须在图片之前
@@ -345,9 +367,9 @@ export function buildEmbeddingText(repo: Repository, readmeContent?: string, max
       .replace(/\n{3,}/g, '\n\n') // 压缩多余空行
       .trim();
     const truncated = cleaned.slice(0, maxChars);
-    if (truncated) parts.push(truncated);
+    if (truncated) parts.push(`README:\n${truncated}`);
   }
-  return parts.filter(Boolean).join('\n');
+  return parts.join('\n');
 }
 
 /**
@@ -469,6 +491,10 @@ export async function indexAllRepos(
     readmeMaxChars?: number;
     incremental?: boolean;
     onRepoIndexed?: (repoId: number) => void;
+    /** 当前存储的格式版本号 */
+    formatVersion?: number;
+    /** 最新格式版本号（EMBEDDING_FORMAT_VERSION） */
+    currentFormatVersion?: number;
   } = {}
 ): Promise<{ indexed: number; skipped: number; errors: number; error?: string; indexedRepoIds: number[] }> {
   const { batchSize = 32, onProgress, signal, readmeFetcher, indexMode = 'readme', readmeMaxChars = 6000, incremental, onRepoIndexed } = options;
@@ -481,8 +507,12 @@ export async function indexAllRepos(
   let indexable = repos.filter((r) => r.analyzed_at && !r.analysis_failed);
   // 增量模式下跳过已索引且内容未更新的仓库
   if (incremental) {
+    // 嵌入文本格式版本变化时，强制重新索引所有向量以避免混合格式
+    const formatVersionChanged = options.formatVersion !== undefined && options.currentFormatVersion !== undefined
+      && options.formatVersion < options.currentFormatVersion;
     indexable = indexable.filter((r) => {
       if (!r.vector_indexed_at) return true; // 从未索引
+      if (formatVersionChanged) return true; // 格式版本升级，需要重新索引
       // 取 last_edited 与 analyzed_at 中较新者作为内容时间，更新后需要重新索引
       const contentTime = [r.last_edited, r.analyzed_at]
         .filter((t): t is string => !!t)
diff --git a/src/store/useAppStore.ts b/src/store/useAppStore.ts
index ec72b779..00bbf71e 100644
--- a/src/store/useAppStore.ts
+++ b/src/store/useAppStore.ts
@@ -2314,12 +2314,17 @@ export const useAppStore = create<AppState & AppActions>()(
   if (state && !(state as Record<string, unknown>).vectorSearchConfig) {
     (state as Record<string, unknown>).vectorSearchConfig = { enabled: false, workerUrl: '', authToken: '', embeddingConfigId: '', indexMode: 'readme', readmeMaxChars: 6000 };
   }
-  // 迁移：为旧配置添加 indexMode 和 readmeMaxChars
+  // 迁移：为旧配置添加缺失字段
   if (state) {
     const vsc = (state as Record<string, unknown>).vectorSearchConfig as Record<string, unknown> | undefined;
     if (vsc && typeof vsc === 'object') {
       if (vsc.indexMode !== 'description' && vsc.indexMode !== 'readme') vsc.indexMode = 'readme';
       if (typeof vsc.readmeMaxChars !== 'number' || vsc.readmeMaxChars <= 0) vsc.readmeMaxChars = 6000;
+      if (typeof vsc.searchThreshold !== 'number') vsc.searchThreshold = 0.35;
+      if (typeof vsc.searchTopK !== 'number') vsc.searchTopK = 30;
+      if (typeof vsc.enableHyDE !== 'boolean') vsc.enableHyDE = true;
+      if (typeof vsc.enableReranking !== 'boolean') vsc.enableReranking = true;
+      if (typeof vsc.embeddingFormatVersion !== 'number') vsc.embeddingFormatVersion = 1;
     }
   }
 
diff --git a/src/types/index.ts b/src/types/index.ts
index 91e05ded..a925ce4d 100644
--- a/src/types/index.ts
+++ b/src/types/index.ts
@@ -235,6 +235,13 @@ export interface VectorSearchConfig {
   embeddingConfigId: string;
   indexMode: VectorIndexMode;
   readmeMaxChars: number;  // README 截取字符数，默认 6000
+  // 搜索参数（可选，有默认值）
+  searchThreshold?: number;   // 相似度阈值，默认 0.35
+  searchTopK?: number;        // 返回结果数，默认 30
+  enableHyDE?: boolean;       // 是否启用 HyDE 查询预处理，默认 true
+  enableReranking?: boolean;  // 是否启用 LLM 语义重排序，默认 true
+  // 嵌入文本格式版本，buildEmbeddingText 格式变化时递增
+  embeddingFormatVersion?: number;
 }
 
 export interface VectorSearchStatus {

From 6dbfb7f515932e5c5515a183a6ba4bf3e4ecd65c Mon Sep 17 00:00:00 2001
From: AmintaCCCP <libogege@sina.com>
Date: Sat, 27 Jun 2026 22:44:44 +0800
Subject: [PATCH 2/2] fix: address Gemini Code Assist and CodeRabbit review
 findings

- Fix HyDE AbortError: catch abort on generateHyDEQuery to prevent unhandled rejection
- Fix r.metadata null safety: use optional chaining for keyword boost
- Fix formatVersionChanged: default missing version to v1 so older vectors get reindexed
- Fix applyFilters overwriting LLM rerank order: preserve rerankOrder map and reapply after filtering
- Fix signal forwarding: pass AbortSignal through semantic reranking to requestText
- Fix incremental index button: keep enabled when format version needs upgrade
- Fix embeddingFormatVersion not persisted: update store after both rebuild and incremental index
- Add accessibility: htmlFor/id for inputs, role/aria-checked for toggle switches

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 src/components/SearchBar.tsx                  | 21 +++++++++++-----
 .../settings/VectorSearchSettings.tsx         | 25 +++++++++++++++----
 src/services/aiService.ts                     |  3 ++-
 src/services/vectorSearchService.ts           |  4 +--
 4 files changed, 39 insertions(+), 14 deletions(-)

diff --git a/src/components/SearchBar.tsx b/src/components/SearchBar.tsx
index a0636222..f02782d0 100644
--- a/src/components/SearchBar.tsx
+++ b/src/components/SearchBar.tsx
@@ -550,7 +550,7 @@ export const SearchBar: React.FC = () => {
               const { AIService } = await import('../services/aiService');
               const hydeService = new AIService(hydeConfig, language);
               embeddingQuery = await Promise.race([
-                hydeService.generateHyDEQuery(searchQuery, hydeAbort.signal),
+                hydeService.generateHyDEQuery(searchQuery, hydeAbort.signal).catch(() => searchQuery),
                 new Promise<string>((resolve) => {
                   hydeTimer = setTimeout(() => {
                     hydeAbort.abort();
@@ -585,9 +585,9 @@ export const SearchBar: React.FC = () => {
               const queryLower = searchQuery.toLowerCase();
               const boostedResults = vectorResults.map(r => {
                 let bonus = 0;
-                const name = (r.metadata.full_name || '').toLowerCase();
-                const desc = (r.metadata.description || '').toLowerCase();
-                const tags = (r.metadata.tags || []).map(tag => tag.toLowerCase());
+                const name = (r.metadata?.full_name || '').toLowerCase();
+                const desc = (r.metadata?.description || '').toLowerCase();
+                const tags = (r.metadata?.tags || []).map(tag => tag.toLowerCase());
                 if (name.includes(queryLower)) bonus += 0.05;
                 if (desc.includes(queryLower)) bonus += 0.03;
                 if (tags.some(tag => tag.includes(queryLower))) bonus += 0.02;
@@ -623,9 +623,18 @@ export const SearchBar: React.FC = () => {
                   }
                 }
 
-                // If AI reranking succeeded, preserve its order; otherwise sort by vector score
+                // 保存 LLM 重排序顺序，applyFilters 可能按 UI 排序覆盖它
+                const rerankOrder = rerankSucceeded
+                  ? new Map(reranked.map((repo, index) => [String(repo.id), index]))
+                  : null;
                 const finalFiltered = applyFilters([...reranked]);
-                if (!rerankSucceeded) {
+                if (rerankOrder) {
+                  // 恢复 LLM 语义排序顺序
+                  finalFiltered.sort((a, b) =>
+                    (rerankOrder.get(String(a.id)) ?? Number.MAX_SAFE_INTEGER)
+                    - (rerankOrder.get(String(b.id)) ?? Number.MAX_SAFE_INTEGER)
+                  );
+                } else {
                   finalFiltered.sort((a, b) => (scoreMap.get(String(b.id)) ?? 0) - (scoreMap.get(String(a.id)) ?? 0));
                 }
                 console.log('🎯 Vector search results:', finalFiltered.length);
diff --git a/src/components/settings/VectorSearchSettings.tsx b/src/components/settings/VectorSearchSettings.tsx
index a278abd2..2350457e 100644
--- a/src/components/settings/VectorSearchSettings.tsx
+++ b/src/components/settings/VectorSearchSettings.tsx
@@ -232,6 +232,9 @@ export const VectorSearchSettings: React.FC<VectorSearchSettingsProps> = ({ t })
     return contentTime > r.vector_indexed_at;
   }).length;
 
+  // 嵌入文本格式版本升级时，即使无内容更新也需要触发增量索引来重建所有向量
+  const formatVersionNeedsReindex = (vectorSearchConfig.embeddingFormatVersion ?? 1) < EMBEDDING_FORMAT_VERSION;
+
   const createClients = useCallback(() => {
     if (!activeConfig) return null;
     const embeddingClient = new EmbeddingClient({
@@ -319,6 +322,8 @@ export const VectorSearchSettings: React.FC<VectorSearchSettingsProps> = ({ t })
         dimensions: formDimensions,
         lastSyncAt: new Date().toISOString(),
       });
+      // 索引成功后更新格式版本号，避免下次增量索引重复触发全量重建
+      setVectorSearchConfig({ embeddingFormatVersion: EMBEDDING_FORMAT_VERSION });
     } catch (err) {
       if (err instanceof Error && err.message === 'Aborted') {
         setVectorIndexingState({ isIndexing: false, phase: null, result: null });
@@ -331,7 +336,7 @@ export const VectorSearchSettings: React.FC<VectorSearchSettingsProps> = ({ t })
     } finally {
       setAbortController(null);
     }
-  }, [createClients, formIndexMode, formReadmeMaxChars, formDimensions, updateRepositoriesMetadata, setVectorSearchStatus, setVectorIndexingState, vectorSearchConfig.embeddingFormatVersion]);
+  }, [createClients, formIndexMode, formReadmeMaxChars, formDimensions, updateRepositoriesMetadata, setVectorSearchStatus, setVectorIndexingState, vectorSearchConfig.embeddingFormatVersion, setVectorSearchConfig]);
 
   const handleIncrementalIndex = useCallback(async () => {
     const clients = createClients();
@@ -395,6 +400,8 @@ export const VectorSearchSettings: React.FC<VectorSearchSettingsProps> = ({ t })
         // 状态更新失败不应回滚已成功的索引结果
         console.warn('Failed to update vector search status:', statusErr);
       }
+      // 索引成功后更新格式版本号，避免下次增量索引重复触发全量重建
+      setVectorSearchConfig({ embeddingFormatVersion: EMBEDDING_FORMAT_VERSION });
     } catch (err) {
       if (err instanceof Error && err.message === 'Aborted') {
         setVectorIndexingState({ isIndexing: false, phase: null, result: null });
@@ -420,7 +427,7 @@ export const VectorSearchSettings: React.FC<VectorSearchSettingsProps> = ({ t })
     } finally {
       setAbortController(null);
     }
-  }, [createClients, formIndexMode, formReadmeMaxChars, formDimensions, updateRepositoriesMetadata, setVectorSearchStatus, setVectorIndexingState, vectorSearchConfig.embeddingFormatVersion]);
+  }, [createClients, formIndexMode, formReadmeMaxChars, formDimensions, updateRepositoriesMetadata, setVectorSearchStatus, setVectorIndexingState, vectorSearchConfig.embeddingFormatVersion, setVectorSearchConfig]);
 
   const handleAbortIndexing = useCallback(() => {
     abortController?.abort();
@@ -878,7 +885,7 @@ export const VectorSearchSettings: React.FC<VectorSearchSettingsProps> = ({ t })
           </button>
           <button
             onClick={handleIncrementalIndex}
-            disabled={isIndexing || !isConfigComplete || unindexedCount === 0}
+            disabled={isIndexing || !isConfigComplete || (unindexedCount === 0 && !formatVersionNeedsReindex)}
             className="flex items-center gap-2 px-4 py-2 text-sm bg-gray-200 dark:bg-gray-700 text-gray-700 dark:text-gray-300 rounded-lg hover:bg-gray-300 dark:hover:bg-gray-600 disabled:opacity-50 disabled:cursor-not-allowed"
           >
             {isIndexing ? <Loader2 className="w-4 h-4 animate-spin" /> : <RefreshCw className="w-4 h-4" />}
@@ -947,11 +954,12 @@ export const VectorSearchSettings: React.FC<VectorSearchSettingsProps> = ({ t })
 
         {/* Similarity Threshold */}
         <div className="space-y-1">
-          <label className="block text-sm font-medium text-gray-700 dark:text-gray-300">
+          <label htmlFor="search-threshold" className="block text-sm font-medium text-gray-700 dark:text-gray-300">
             {t('相似度阈值', 'Similarity Threshold')}
           </label>
           <div className="flex items-center gap-3">
             <input
+              id="search-threshold"
               type="range"
               min={0.1}
               max={0.8}
@@ -971,10 +979,11 @@ export const VectorSearchSettings: React.FC<VectorSearchSettingsProps> = ({ t })
 
         {/* Top K */}
         <div className="space-y-1">
-          <label className="block text-sm font-medium text-gray-700 dark:text-gray-300">
+          <label htmlFor="search-topk" className="block text-sm font-medium text-gray-700 dark:text-gray-300">
             {t('返回结果数 (Top K)', 'Results Count (Top K)')}
           </label>
           <input
+            id="search-topk"
             type="number"
             value={formSearchTopK}
             onChange={(e) => setFormSearchTopK(Math.max(5, Math.min(50, parseInt(e.target.value) || 30)))}
@@ -998,6 +1007,9 @@ export const VectorSearchSettings: React.FC<VectorSearchSettingsProps> = ({ t })
             </p>
           </div>
           <button
+            role="switch"
+            aria-checked={formEnableHyDE}
+            aria-label={t('HyDE 查询预处理', 'HyDE Query Preprocessing')}
             onClick={() => setFormEnableHyDE(!formEnableHyDE)}
             className={`relative inline-flex h-6 w-11 items-center rounded-full transition-colors ${
               formEnableHyDE ? 'bg-brand-indigo' : 'bg-gray-300 dark:bg-gray-600'
@@ -1022,6 +1034,9 @@ export const VectorSearchSettings: React.FC<VectorSearchSettingsProps> = ({ t })
             </p>
           </div>
           <button
+            role="switch"
+            aria-checked={formEnableReranking}
+            aria-label={t('LLM 语义重排序', 'LLM Semantic Reranking')}
             onClick={() => setFormEnableReranking(!formEnableReranking)}
             className={`relative inline-flex h-6 w-11 items-center rounded-full transition-colors ${
               formEnableReranking ? 'bg-brand-indigo' : 'bg-gray-300 dark:bg-gray-600'
diff --git a/src/services/aiService.ts b/src/services/aiService.ts
index f1e4406f..d1f2d844 100644
--- a/src/services/aiService.ts
+++ b/src/services/aiService.ts
@@ -653,7 +653,7 @@ AI Summary: ${gist.ai_summary || 'None'}`;
    * @param query 用户搜索查询
    * @returns 按语义相关性排序的仓库列表
    */
-  async searchRepositoriesWithSemanticReranking(repositories: Repository[], query: string): Promise<Repository[]> {
+  async searchRepositoriesWithSemanticReranking(repositories: Repository[], query: string, signal?: AbortSignal): Promise<Repository[]> {
     if (repositories.length === 0) return [];
 
     // 限制候选数量，控制 token 消耗
@@ -683,6 +683,7 @@ AI Summary: ${gist.ai_summary || 'None'}`;
       user: `Query: ${query}\n\nRepositories:\n${this.sanitizeForPrompt(repoSummaries)}`,
       temperature: 0.1,
       maxTokens: 800,
+      signal,
     });
 
     try {
diff --git a/src/services/vectorSearchService.ts b/src/services/vectorSearchService.ts
index deb69a02..37135aaa 100644
--- a/src/services/vectorSearchService.ts
+++ b/src/services/vectorSearchService.ts
@@ -508,8 +508,8 @@ export async function indexAllRepos(
   // 增量模式下跳过已索引且内容未更新的仓库
   if (incremental) {
     // 嵌入文本格式版本变化时，强制重新索引所有向量以避免混合格式
-    const formatVersionChanged = options.formatVersion !== undefined && options.currentFormatVersion !== undefined
-      && options.formatVersion < options.currentFormatVersion;
+    // 缺失版本号视为 v1（旧格式），仍需触发升级
+    const formatVersionChanged = (options.formatVersion ?? 1) < (options.currentFormatVersion ?? EMBEDDING_FORMAT_VERSION);
     indexable = indexable.filter((r) => {
       if (!r.vector_indexed_at) return true; // 从未索引
       if (formatVersionChanged) return true; // 格式版本升级，需要重新索引