diff --git a/OPTIMIZATIONS.md b/OPTIMIZATIONS.md new file mode 100644 index 0000000..13f267c --- /dev/null +++ b/OPTIMIZATIONS.md @@ -0,0 +1,134 @@ +# DeepSearch 搜尋邏輯與UI優化 + +## 概述 + +本次優化主要針對兩個方面: +1. **智能搜尋邏輯** - 從固定搜尋深度改為AI驅動的適應性搜尋 +2. **用戶界面改善** - 將所有功能整合到單一頁面,提升用戶體驗 + +## 🚀 主要新功能 + +### 1. 適應性搜尋 (Adaptive Search) + +#### 原有問題 +- 搜尋深度固定,無法根據結果質量動態調整 +- 可能進行不必要的重複搜尋 +- 無法根據實際需求停止搜尋 + +#### 新解決方案 +- **智能決策**: LLM分析每次搜尋結果,決定是否繼續 +- **JSON控制**: AI輸出結構化決策,包含原因和信心度 +- **動態停止**: 根據覆蓋度、質量和收益遞減自動停止 + +#### 技術實現 +```python +def adaptive_deep_search(self, topic: str, keywords: List[str], max_iterations: int = 5) +``` + +#### AI決策標準 +1. **覆蓋度** - 是否獲得主題的全面信息 +2. **質量** - 結果是否相關且高質量 +3. **收益遞減** - 新搜尋是否提供顯著不同的信息 +4. **完整性** - 是否有足夠信息回答研究問題 + +#### JSON輸出格式 +```json +{ + "continue": true/false, + "reason": "簡短解釋決策原因", + "confidence": 1-10, + "suggested_focus": ["建議的下一個關注領域"] +} +``` + +### 2. 搜尋控制日誌 + +新增完整的搜尋決策追蹤: +- 每次迭代的決策記錄 +- AI信心度趨勢分析 +- 搜尋停止原因統計 +- JSON格式導出,便於分析 + +### 3. 用戶界面優化 + +#### 原有問題 +- 功能分散在多個頁面 +- 缺乏即時狀態反饋 +- 用戶體驗不夠流暢 + +#### 新界面特色 +- **單頁面設計** - 所有功能在同一頁面 +- **響應式布局** - 左側控制面板,右側結果展示 +- **即時狀態** - 實時狀態指示器 +- **視覺改善** - 現代化設計與圖標 +- **智能配置** - 適應性搜尋作為推薦選項 + +#### 新增UI元素 +- 🔍 DeepSearch標題與圖標 +- ⚙️ 集中式設置面板 +- 🤖 適應性搜尋開關 +- 📊 實時進度展示 +- 📥 結果下載區域 + +## 🔧 配置選項 + +### 傳統固定搜尋 +- **廣度**: 1-8 平行搜尋路徑 +- **深度**: 1-5 迭代搜尋次數 + +### 智能適應性搜尋 (推薦) +- **最大迭代**: 2-10 次 +- **自動停止**: AI決定何時停止 +- **決策透明**: 完整的決策日誌 + +## 📁 輸出文件 + +研究完成後會生成: +1. `report.md` - 最終研究報告 +2. `search_logs.json` - 完整搜尋日誌 +3. `search_control_log_*.json` - AI決策記錄 (新增) +4. `content_quality_report.md` - 內容質量分析 +5. `extracted_webpages/` - 提取的網頁內容 + +## 🎯 使用建議 + +### 新用戶 +1. 啟用 "Use Adaptive Search" (推薦) +2. 設置最大迭代數為 5 +3. 保持 "Extract Full Webpage Content" 開啟 + +### 高級用戶 +- 可根據需求調整最大迭代數 +- 關注搜尋控制日誌以了解AI決策過程 +- 比較適應性搜尋與固定搜尋的效果 + +## 🔬 技術細節 + +### 適應性搜尋流程 +1. **初始搜尋** - 基於用戶問題生成關鍵字 +2. **結果分析** - LLM評估搜尋結果質量 +3. **決策制定** - AI決定是否繼續搜尋 +4. **關鍵字精煉** - 如果繼續,生成新的搜尋詞 +5. 
**迭代重複** - 直到AI決定停止或達到最大次數 + +### fallback機制 +- JSON解析失敗時的文本分析 +- 網絡錯誤時的啟發式決策 +- 確保系統穩定運行 + +## 🚀 性能優化 + +- 減少不必要的搜尋迭代 +- 智能關鍵字生成 +- 更高效的結果篩選 +- 改善的用戶界面響應性 + +## 🔄 向後兼容 + +- 保留原有的固定深度搜尋選項 +- 所有現有功能依然可用 +- 平滑的遷移路徑 + +--- + +*這些優化讓DeepSearch更智能、更高效、更易用,同時為用戶提供了更好的控制和透明度。* \ No newline at end of file diff --git a/src/gradio_interface.py b/src/gradio_interface.py index a5b1f7a..6683391 100644 --- a/src/gradio_interface.py +++ b/src/gradio_interface.py @@ -79,6 +79,8 @@ def generate_questions(topic): try: CURRENT_TOPIC = topic + # Store topic in agent's log data + AGENT.log_data["topic"] = topic redirect_output = io.StringIO() with redirect_stdout(redirect_output): CURRENT_QUESTIONS = AGENT.generate_initial_questions(topic) @@ -143,7 +145,7 @@ def process_answers(answer1, answer2, answer3): return f"Error processing answers: {str(e)}\n\nTraceback: {traceback.format_exc()}" -def perform_research(breadth, depth, extract_content, progress=gr.Progress()): +def perform_research(breadth, depth, extract_content, use_adaptive_search, max_iterations, progress=gr.Progress()): global AGENT, CURRENT_TOPIC, CURRENT_ANSWERS, CURRENT_SEARCH_RESULTS if not AGENT or not CURRENT_TOPIC or not CURRENT_ANSWERS: @@ -172,8 +174,14 @@ def perform_research(breadth, depth, extract_content, progress=gr.Progress()): keyword_display += f"{i}. 
{display_keyword}\n" progress(0.2, "Starting deep search...") - CURRENT_SEARCH_RESULTS = AGENT.deep_search( - CURRENT_TOPIC, keywords, depth, extract_content=extract_content) + + # Use adaptive search if enabled + if use_adaptive_search: + CURRENT_SEARCH_RESULTS = AGENT.adaptive_deep_search( + CURRENT_TOPIC, keywords, max_iterations, extract_content=extract_content) + else: + CURRENT_SEARCH_RESULTS = AGENT.deep_search( + CURRENT_TOPIC, keywords, depth, extract_content=extract_content) # Restore stdout sys.stdout = old_stdout @@ -380,11 +388,32 @@ def generate_final_report(progress=gr.Progress()): # Build Gradio Interface - def create_interface(): custom_css = """ #header {text-align: center; margin-bottom: 1rem;} - .gradio-container {max-width: 900px; margin: 0 auto;} + .gradio-container {max-width: 1200px; margin: 0 auto;} + .section-header { + background: linear-gradient(90deg, #667eea 0%, #764ba2 100%); + color: white; + padding: 10px; + border-radius: 8px; + margin: 10px 0; + text-align: center; + font-weight: bold; + } + .status-box { + background-color: #f8f9fa; + border: 1px solid #dee2e6; + border-radius: 6px; + padding: 10px; + margin: 5px 0; + } + .research-controls { + background-color: #f1f3f4; + padding: 15px; + border-radius: 8px; + margin: 10px 0; + } """ theme = gr.themes.Soft( @@ -394,241 +423,331 @@ def create_interface(): ) with gr.Blocks(title="DeepSearch Research Agent", css=custom_css, theme=theme) as demo: - gr.Markdown("# DeepSearch Research Agent", elem_id="header") + # Header + gr.Markdown("# 🔍 DeepSearch Research Agent", elem_id="header") gr.Markdown( - "This web interface allows you to perform deep research on any topic using advanced search techniques and AI assistance.") - - gr.Markdown("## 1. Initialize") + "**Advanced AI-powered research tool with adaptive search capabilities**\n" + "This tool performs comprehensive research using intelligent search strategies that adapt based on results." 
+ ) + + # Status indicator with gr.Row(): with gr.Column(): - provider_dropdown = gr.Dropdown( - choices=PROVIDERS, - label="AI Provider", - value="ollama" - ) - model_textbox = gr.Textbox( - label="Model Name (leave empty for default)", - placeholder="e.g., deepseek-r1, gpt-4o" - ) - # Add Ollama host configuration for Docker - ollama_host_textbox = gr.Textbox( - label="Ollama Host (only for Ollama provider)", - placeholder="e.g., host.docker.internal:11434", - value=OLLAMA_HOST - ) - - def update_ollama_host(host): - global OLLAMA_HOST - if host and host.strip(): - OLLAMA_HOST = host.strip() - return f"Ollama host set to: {OLLAMA_HOST}" - - ollama_host_textbox.change( - update_ollama_host, - inputs=[ollama_host_textbox], - outputs=[] - ) - - init_button = gr.Button("Initialize Agent") - with gr.Column(): - init_output = gr.Textbox( - label="Initialization Status", interactive=False - ) - - gr.Markdown("## 2. Define Research Topic") + status_display = gr.Markdown("**Status:** Ready to start", elem_classes=["status-box"]) + + # Main content in tabs for better organization but all on same page with gr.Row(): - with gr.Column(): + # Left column - Setup and Controls + with gr.Column(scale=1): + gr.Markdown("### 🚀 Setup", elem_classes=["section-header"]) + + # AI Provider setup + with gr.Group(): + provider_dropdown = gr.Dropdown( + choices=PROVIDERS, + label="AI Provider", + value="ollama" + ) + model_textbox = gr.Textbox( + label="Model Name (optional)", + placeholder="e.g., deepseek-r1, gpt-4o", + scale=2 + ) + ollama_host_textbox = gr.Textbox( + label="Ollama Host (Docker only)", + placeholder="host.docker.internal:11434", + value=OLLAMA_HOST, + visible=True + ) + init_button = gr.Button("🔧 Initialize Agent", variant="primary") + init_output = gr.Textbox( + label="Initialization Status", + interactive=False, + max_lines=2 + ) + + # Research Topic + gr.Markdown("### 📋 Research Topic", elem_classes=["section-header"]) + with gr.Group(): topic_textbox = gr.Textbox( 
label="Research Topic", - placeholder="Enter the topic you want to research" + placeholder="Enter your research topic here...", + lines=2 ) - generate_questions_button = gr.Button("Generate Questions") - - # System output (can be hidden if desired) - questions_output = gr.Textbox( - label="System Output", interactive=False, visible=True) - - # Three fixed question-answer blocks - with gr.Row(visible=True) as questions_container: - with gr.Column(): - # Question 1 - question1_label = gr.Markdown("Question 1", visible=False) - question1_text = gr.Markdown("", visible=False) - answer1 = gr.Textbox( - label="Your Answer", - placeholder="Type your answer here", - visible=False, - interactive=True + generate_questions_button = gr.Button("📝 Generate Focus Questions", variant="secondary") + + # Research Configuration + gr.Markdown("### ⚙️ Research Settings", elem_classes=["section-header"]) + with gr.Group(elem_classes=["research-controls"]): + with gr.Row(): + breadth_slider = gr.Slider( + minimum=1, + maximum=8, + value=3, + step=1, + label="Search Breadth", + info="Number of parallel search paths" + ) + depth_slider = gr.Slider( + minimum=1, + maximum=5, + value=2, + step=1, + label="Fixed Depth", + info="For traditional search mode" + ) + + extract_content_checkbox = gr.Checkbox( + label="📄 Extract Full Webpage Content", + value=True, + info="Deeper analysis but slower" ) - - # Question 2 - question2_label = gr.Markdown("Question 2", visible=False) - question2_text = gr.Markdown("", visible=False) - answer2 = gr.Textbox( - label="Your Answer", - placeholder="Type your answer here", - visible=False, - interactive=True + + use_adaptive_search_checkbox = gr.Checkbox( + label="🤖 Use Adaptive Search (Recommended)", + value=True, + info="AI decides when to stop searching" ) - + + max_iterations_slider = gr.Slider( + minimum=2, + maximum=10, + value=5, + step=1, + label="Max Iterations", + info="Maximum search rounds for adaptive mode" + ) + + research_button = gr.Button("🔬 
Start Research", variant="primary", size="lg") + generate_report_button = gr.Button("📊 Generate Final Report", variant="primary", size="lg") + + # Right column - Results and Interaction + with gr.Column(scale=2): + # Questions and Answers Section + gr.Markdown("### 💭 Focus Questions", elem_classes=["section-header"]) + + questions_output = gr.Textbox( + label="Question Generation Status", + interactive=False, + visible=True, + max_lines=3 + ) + + # Dynamic Questions Display + with gr.Group(visible=True) as questions_container: + # Question 1 + with gr.Group(visible=False) as question1_group: + question1_text = gr.Markdown("", visible=False) + answer1 = gr.Textbox( + label="Your Answer", + placeholder="Type your answer here...", + visible=False, + interactive=True, + lines=2 + ) + + # Question 2 + with gr.Group(visible=False) as question2_group: + question2_text = gr.Markdown("", visible=False) + answer2 = gr.Textbox( + label="Your Answer", + placeholder="Type your answer here...", + visible=False, + interactive=True, + lines=2 + ) + # Question 3 - question3_label = gr.Markdown("Question 3", visible=False) - question3_text = gr.Markdown("", visible=False) - answer3 = gr.Textbox( - label="Your Answer", - placeholder="Type your answer here", - visible=False, - interactive=True + with gr.Group(visible=False) as question3_group: + question3_text = gr.Markdown("", visible=False) + answer3 = gr.Textbox( + label="Your Answer", + placeholder="Type your answer here...", + visible=False, + interactive=True, + lines=2 + ) + + process_answers_button = gr.Button("✅ Submit Answers", visible=False, variant="secondary") + answers_output = gr.Textbox( + label="Answer Processing Status", + interactive=False, + max_lines=2 ) - - # Process answers button - process_answers_button = gr.Button("Submit Answers", visible=False) - answers_output = gr.Textbox( - label="Status", interactive=False) - - # Store questions in state - questions_state = gr.State([]) - - # Function to update the 
questions display - def update_questions_display(questions_data, output_text): - if not questions_data or len(questions_data) == 0: - return [ - gr.update(visible=False), gr.update( - visible=False), gr.update(visible=False), - gr.update(visible=False), gr.update( - visible=False), gr.update(visible=False), - gr.update(visible=False), gr.update( - visible=False), gr.update(visible=False), - gr.update(visible=False), questions_data - ] - - # Make sure we have up to 3 questions - questions = questions_data[:3] - while len(questions) < 3: - questions.append({"question": "", "options": []}) - - # Get the first 3 questions - q1 = questions[0] - q2 = questions[1] - q3 = questions[2] - - # Format questions with options - def format_question_with_options(q): - if not q["question"]: - return "" - - question_text = q["question"] - - if q["options"]: - options_text = "\n\n" - for i, option in enumerate(q["options"]): - options_text += f"{chr(97+i)}) {option}\n" - question_text += options_text - - return question_text - - q1_text = format_question_with_options(q1) - q2_text = format_question_with_options(q2) - q3_text = format_question_with_options(q3) - - # Determine visibility based on if questions exist - q1_visible = bool(q1["question"]) - q2_visible = bool(q2["question"]) - q3_visible = bool(q3["question"]) - - # Show the process button if we have at least one question - process_visible = q1_visible - + + # Research Progress Section + gr.Markdown("### 🔬 Research Progress", elem_classes=["section-header"]) + + with gr.Row(): + with gr.Column(): + research_output = gr.Textbox( + label="Live Research Log", + interactive=False, + max_lines=15, + info="Real-time search progress and AI decisions" + ) + with gr.Column(): + keywords_output = gr.Textbox( + label="Generated Keywords", + interactive=False, + max_lines=8, + info="Search terms being used" + ) + + # Final Report Section + gr.Markdown("### 📑 Research Report", elem_classes=["section-header"]) + + with gr.Row(): + with 
gr.Column(scale=3): + report_markdown = gr.Markdown( + "Report will appear here after research is complete...", + label="Final Research Report" + ) + with gr.Column(scale=1): + report_status = gr.Textbox( + label="Report Status", + interactive=False, + max_lines=3 + ) + report_download = gr.File( + label="📥 Download Results", + visible=False + ) + + # Store questions in state + questions_state = gr.State([]) + + # Enhanced question display function + def update_questions_display(questions_data, output_text): + if not questions_data or len(questions_data) == 0: return [ - gr.update(visible=q1_visible), gr.update( - value=q1_text, visible=q1_visible), gr.update(visible=q1_visible, interactive=True), - gr.update(visible=q2_visible), gr.update( - value=q2_text, visible=q2_visible), gr.update(visible=q2_visible, interactive=True), - gr.update(visible=q3_visible), gr.update( - value=q3_text, visible=q3_visible), gr.update(visible=q3_visible, interactive=True), - gr.update(visible=process_visible), questions_data + gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), + gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), + gr.update(visible=False), gr.update(visible=False), gr.update(visible=False), + gr.update(visible=False), questions_data ] - gr.Markdown("## 3. 
Perform Research") - with gr.Row(): - with gr.Column(): - breadth_slider = gr.Slider( - minimum=1, - maximum=8, - value=3, - step=1, - label="Research Breadth (number of parallel search paths)" - ) - depth_slider = gr.Slider( - minimum=1, - maximum=5, - value=2, - step=1, - label="Research Depth (number of iterative searches)" - ) - extract_content_checkbox = gr.Checkbox( - label="Extract webpage content", - value=True - ) - research_button = gr.Button("Conduct Research") - with gr.Column(): - research_output = gr.Textbox( - label="Research Progress", interactive=False, max_lines=20 - ) - keywords_output = gr.Textbox( - label="Generated Keywords", interactive=False - ) + # Ensure we have up to 3 questions + questions = questions_data[:3] + while len(questions) < 3: + questions.append({"question": "", "options": []}) + + # Format questions with better styling + def format_question_with_options(q, num): + if not q["question"]: + return "" + + question_text = f"**Question {num}:** {q['question']}" + + if q["options"]: + options_text = "\n\n**Options:**\n" + for i, option in enumerate(q["options"]): + options_text += f"**{chr(97+i)}.** {option}\n" + question_text += options_text + + return question_text + + q1_text = format_question_with_options(questions[0], 1) + q2_text = format_question_with_options(questions[1], 2) + q3_text = format_question_with_options(questions[2], 3) + + # Determine visibility + q1_visible = bool(questions[0]["question"]) + q2_visible = bool(questions[1]["question"]) + q3_visible = bool(questions[2]["question"]) + process_visible = q1_visible + + return [ + gr.update(visible=q1_visible), gr.update(value=q1_text, visible=q1_visible), gr.update(visible=q1_visible), + gr.update(visible=q2_visible), gr.update(value=q2_text, visible=q2_visible), gr.update(visible=q2_visible), + gr.update(visible=q3_visible), gr.update(value=q3_text, visible=q3_visible), gr.update(visible=q3_visible), + gr.update(visible=process_visible), questions_data + ] + + # 
Enhanced status updates + def update_status(message): + return f"**Status:** {message}" + + # Event handlers with status updates + def init_agent_with_status(provider, model): + success, message = initialize_agent(provider, model) + status = "✅ Agent Ready" if success else "❌ Initialization Failed" + return message, update_status(status) + + def generate_questions_with_status(topic): + if not topic.strip(): + return "Please enter a research topic first", [], update_status("⚠️ Topic Required") + + output, questions = generate_questions(topic) + status = "✅ Questions Generated" if questions else "❌ Question Generation Failed" + return output, questions, update_status(status) + + def process_answers_with_status(answer1, answer2, answer3): + result = process_answers(answer1, answer2, answer3) + if "Successfully recorded" in result: + status = "✅ Answers Recorded" + else: + status = "❌ Answer Processing Failed" + return result, update_status(status) + + def perform_research_with_status(breadth, depth, extract_content, use_adaptive_search, max_iterations): + max_iter = int(max_iterations) + + output, keywords = perform_research(breadth, depth, extract_content, use_adaptive_search, max_iter) + if "Error" not in output: + status = "✅ Research Complete" + else: + status = "❌ Research Failed" + return output, keywords, update_status(status) + + def generate_report_with_status(): + report, download, status_msg = generate_final_report() + if download: + status = "✅ Report Generated" + download_visible = True + else: + status = "❌ Report Generation Failed" + download_visible = False + return report, download if download_visible else None, status_msg, update_status(status), gr.update(visible=download_visible) - gr.Markdown("## 4. 
Generate Report") - generate_report_button = gr.Button("Generate Final Report") - with gr.Row(): - with gr.Column(scale=2): - report_markdown = gr.Markdown( - label="Final Research Report") - with gr.Column(scale=1): - report_status = gr.Textbox( - label="Report Status", interactive=False) - report_download = gr.File(label="Download Results") - - # Connect components + # Connect all components init_button.click( - initialize_agent, + init_agent_with_status, inputs=[provider_dropdown, model_textbox], - outputs=[init_output] + outputs=[init_output, status_display] ) - # Update the question display workflow with proper state management generate_questions_button.click( - generate_questions, + generate_questions_with_status, inputs=[topic_textbox], - outputs=[questions_output, questions_state] + outputs=[questions_output, questions_state, status_display] ).then( update_questions_display, inputs=[questions_state, questions_output], outputs=[ - question1_label, question1_text, answer1, - question2_label, question2_text, answer2, - question3_label, question3_text, answer3, + question1_group, question1_text, answer1, + question2_group, question2_text, answer2, + question3_group, question3_text, answer3, process_answers_button, questions_state ] ) process_answers_button.click( - process_answers, + process_answers_with_status, inputs=[answer1, answer2, answer3], - outputs=[answers_output] + outputs=[answers_output, status_display] ) research_button.click( - perform_research, - inputs=[breadth_slider, depth_slider, extract_content_checkbox], - outputs=[research_output, keywords_output] + perform_research_with_status, + inputs=[breadth_slider, depth_slider, extract_content_checkbox, use_adaptive_search_checkbox, max_iterations_slider], + outputs=[research_output, keywords_output, status_display] ) generate_report_button.click( - generate_final_report, + generate_report_with_status, inputs=[], - outputs=[report_markdown, report_download, report_status] + 
def adaptive_deep_search(self, topic: str, keywords: List[str], max_iterations: int = 5, extract_content: bool = True) -> List[Dict[str, Any]]:
    """Run an iterative search in which the LLM decides after each round whether to continue.

    Every round searches ``"<topic> <keyword>"`` for each current keyword,
    records the results in ``self.log_data["search_results"]``, then asks
    ``self._analyze_search_results_and_plan`` for a control decision, which is
    appended to ``self.log_data["search_controls"]``. When the decision is to
    continue and rounds remain, the keywords are replaced by the output of
    ``self._generate_refined_keywords``.

    Args:
        topic: Research topic; prepended to every search query.
        keywords: Initial keywords. An entry may wrap the actual search term
            in ``<search_words>...</search_words>`` tags.
        max_iterations: Upper bound on the number of search rounds.
        extract_content: When true, each successful result set is handed to
            ``self._extract_webpage_content``.

    Returns:
        Every result item collected across all rounds, in search order.
    """
    import random
    import time

    # The tags must be matched explicitly: a bare r'(.*?)' matches the empty
    # string and would turn every query into just "<topic> ".
    tag_pattern = re.compile(r'<search_words>(.*?)</search_words>', re.DOTALL)

    all_results: List[Dict[str, Any]] = []
    current_keywords = list(keywords)
    iteration = 0

    console.print(f"[bold magenta]Starting adaptive research on topic: {topic}[/]")
    console.print(f"[dim]Initial keywords: {len(current_keywords)}[/dim]")

    while iteration < max_iterations:
        iteration += 1
        console.print(f"[bold magenta]Research iteration {iteration}[/]")
        last_index = len(current_keywords) - 1

        for i, keyword in enumerate(current_keywords):
            tagged = tag_pattern.search(keyword)
            display_keyword = tagged.group(1).strip() if tagged else keyword
            search_query = f"{topic} {display_keyword}"

            console.print(f"  [bold]Searching {i+1}/{len(current_keywords)}:[/] {display_keyword}")

            with Progress(
                SpinnerColumn(),
                TextColumn("[bold yellow]Searching...[/]"),
                transient=True,
            ) as progress:
                progress.add_task("searching", total=None)
                results = search(search_query, limit=10)

            if results.get('success'):
                data = results.get('data') or []
                console.print(f"  [green]Found {len(data)} results[/]")

                # Optionally fetch the pages behind the hits (0-based round index).
                if extract_content:
                    self._extract_webpage_content(
                        data, topic, display_keyword, iteration - 1)

                all_results.extend(data)
                self.log_data["search_results"].append({
                    "keyword": display_keyword,
                    "search_query": search_query,
                    "iteration": iteration,
                    "results": data,
                })
            else:
                console.print(f"  [red]Error: {results.get('error', 'unknown error')}[/red]")

            # Random pause between searches; not needed after the last keyword.
            if i < last_index:
                time.sleep(random.uniform(1, 3))

        # Let the LLM look at everything gathered so far and decide what to do.
        console.print("[bold cyan]Analyzing search results and planning next steps...[/]")
        search_control = self._analyze_search_results_and_plan(
            topic, all_results, iteration, max_iterations)

        log_entry = {
            "iteration": iteration,
            "control": search_control,
            "total_results": len(all_results),
        }
        self.log_data.setdefault("search_controls", []).append(log_entry)

        if not search_control["continue"]:
            console.print(f"[bold red]⏹ Stopping research[/]: {search_control['reason']}")
            console.print(f"[dim]Confidence: {search_control['confidence']}/10[/dim]")
            break

        console.print(f"[bold green]✓ Continuing research[/]: {search_control['reason']}")
        console.print(f"[dim]Confidence: {search_control['confidence']}/10[/dim]")

        if iteration < max_iterations:
            new_keywords = self._generate_refined_keywords(
                topic, all_results, len(current_keywords))
            if not new_keywords:
                # Nothing to search next round: record why we stop so the
                # exported control log does not claim the iteration cap was hit.
                log_entry["control"] = {
                    **search_control,
                    "continue": False,
                    "reason": "No refined keywords were generated",
                }
                console.print("[bold red]⏹ Stopping research[/]: No refined keywords were generated")
                break
            current_keywords = list(new_keywords)

            # Pause between rounds.
            time.sleep(random.uniform(2, 5))

    console.print(f"[bold magenta]Research completed after {iteration} iterations with {len(all_results)} total results[/]")
    return all_results


def _analyze_search_results_and_plan(self, topic: str, all_results: List[Dict], iteration: int, max_iterations: int) -> Dict[str, Any]:
    """Ask the LLM whether the search should continue and return its decision.

    Args:
        topic: Research topic, quoted in the prompt.
        all_results: Every result item collected so far; the most recent
            20 are summarised for the prompt.
        iteration: Number of the round that just finished (1-based).
        max_iterations: Configured cap on rounds, shown to the LLM and used
            by the heuristic fallback.

    Returns:
        A dict with at least ``"continue"`` (bool), ``"reason"`` (str) and
        ``"confidence"`` (int in 1..10). Extra keys returned by the LLM
        (e.g. ``"suggested_focus"``) are preserved.
    """
    results_summary = self._prepare_results_summary(all_results, limit=20)

    # The JSON example deliberately contains no "//" comment: models tend to
    # echo the example verbatim, and a comment makes the reply invalid JSON.
    prompt = f"""Analyze the search results for the topic "{topic}" and decide whether to continue searching.

Current Status:
- Iteration: {iteration}/{max_iterations}
- Total results found: {len(all_results)}
- Recent results summary: {results_summary}

Your task is to evaluate:
1. Coverage: Are we getting comprehensive information about the topic?
2. Quality: Are the results relevant and high-quality?
3. Diminishing Returns: Are new searches providing significantly different information?
4. Completeness: Do we have enough information to answer the research questions thoroughly?

Respond with a JSON object containing:
{{
    "continue": true/false,
    "reason": "Brief explanation of the decision (max 100 characters)",
    "confidence": 1-10,
    "suggested_focus": ["area1", "area2"]
}}
(suggested_focus: if continuing, what areas to focus on next)

Guidelines:
- Continue if: coverage is incomplete, results are highly relevant, or new valuable information is being found
- Stop if: topic is well-covered, results are becoming repetitive, or quality is declining significantly
- Consider the iteration number - don't continue unnecessarily but ensure thorough coverage
"""

    try:
        return self._parse_search_control_response(self._call_llm(prompt))
    except Exception as e:
        # Best-effort boundary: any LLM/transport/parsing failure falls back
        # to the deterministic heuristic below instead of aborting the run.
        console.print(f"[yellow]Error in LLM analysis: {str(e)}. Using heuristic decision.[/yellow]")

    # Fallback heuristic decision
    if iteration >= max_iterations:
        return {"continue": False, "reason": "Maximum iterations reached", "confidence": 10}
    elif len(all_results) < 5:
        return {"continue": True, "reason": "Insufficient results found", "confidence": 8}
    elif iteration == 1:
        return {"continue": True, "reason": "First iteration completed", "confidence": 7}
    else:
        return {"continue": False, "reason": "Heuristic stopping point", "confidence": 5}


def _parse_search_control_response(self, response: str) -> Dict[str, Any]:
    """Turn a raw LLM reply into a control decision.

    Tries, in order: a well-formed JSON object containing ``"continue"``;
    a JSON-like ``continue: true|false`` field in otherwise malformed output;
    whole-word continue/stop cues in free text.

    Raises:
        ValueError: If the reply is not a non-empty string.
    """
    if not isinstance(response, str) or not response.strip():
        raise ValueError("LLM returned an empty response")

    # 1) Strict JSON. raw_decode copes with nested objects and with braces
    #    inside string values, which a "{[^}]*}" regex cannot.
    decoder = json.JSONDecoder()
    for brace in re.finditer(r'\{', response):
        try:
            candidate, _ = decoder.raw_decode(response, brace.start())
        except json.JSONDecodeError:
            continue
        if isinstance(candidate, dict) and "continue" in candidate:
            decision = self._normalize_search_control(candidate)
            if decision is not None:
                return decision

    # 2) Malformed JSON: read the field directly. The lookahead skips an
    #    echoed template such as '"continue": true/false'.
    field = re.search(
        r'"?\bcontinue\b"?\s*[:=]\s*"?(true|false|yes|no)\b(?!\s*/)',
        response, re.IGNORECASE)
    if field:
        should_continue = field.group(1).lower() in ("true", "yes")
    else:
        # 3) Free text: whole words only, so "furthermore" is not read as "more".
        should_continue = re.search(
            r'\b(continue|more|keep|additional)\b', response, re.IGNORECASE) is not None

    if should_continue:
        return {
            "continue": True,
            "reason": "LLM suggested continuing (text analysis)",
            "confidence": 6
        }
    return {
        "continue": False,
        "reason": "LLM suggested stopping (text analysis)",
        "confidence": 6
    }


def _normalize_search_control(self, raw: Dict[str, Any]) -> "Dict[str, Any] | None":
    """Coerce a decoded JSON decision to the types the search loop relies on.

    Returns ``None`` when the ``"continue"`` value cannot be interpreted as a
    boolean, so the caller can try a different parsing strategy.
    """
    flag = raw.get("continue")
    if isinstance(flag, str):
        # bool("false") is True, so string values must be mapped explicitly.
        token = flag.strip().lower()
        if token in ("true", "yes", "y", "1"):
            flag = True
        elif token in ("false", "no", "n", "0"):
            flag = False
        else:
            return None
    elif isinstance(flag, (bool, int, float)):
        flag = bool(flag)
    else:
        return None

    try:
        confidence = int(float(raw.get("confidence", 5)))
    except (TypeError, ValueError, OverflowError):
        # A non-numeric confidence must not discard an otherwise valid decision.
        confidence = 5

    decision = dict(raw)
    decision["continue"] = flag
    decision["reason"] = str(raw.get("reason") or "")
    decision["confidence"] = max(1, min(10, confidence))
    return decision


def _prepare_results_summary(self, results: List[Dict], limit: int = 20) -> str:
    """Build a short numbered list of the most recent results for the LLM prompt.

    Args:
        results: Result items; ``title`` and ``url`` keys are read when present.
        limit: Maximum number of trailing items to include.

    Returns:
        One line per item in the form ``"<n>. <title[:60]>... (<host>)"``,
        or ``"No results found"`` when there is nothing to list.
    """
    # results[-0:] would be the whole list, so a non-positive limit is handled apart.
    recent_results = results[-limit:] if results and limit > 0 else []
    if not recent_results:
        return "No results found"

    summary_items = []
    for position, result in enumerate(recent_results, start=1):
        title = str(result.get('title') or 'No title')[:60]
        url = str(result.get('url') or '').strip()
        # Host part with or without a scheme; never index past the split.
        host = url.split('://', 1)[-1].lstrip('/').split('/', 1)[0]
        summary_items.append(f"{position}. {title}... ({host or 'unknown'})")

    return "\n".join(summary_items)


def export_search_control_log(self, output_dir: str) -> str:
    """Write the adaptive-search control decisions to a timestamped JSON file.

    Args:
        output_dir: Directory for the log; created if it does not exist.

    Returns:
        Path of the written ``search_control_log_<timestamp>.json`` file.
    """
    from pathlib import Path

    Path(output_dir).mkdir(parents=True, exist_ok=True)

    controls = self.log_data.get("search_controls", [])
    search_records = self.log_data.get("search_results", [])

    # Each record holds the hits of one query, so count the hits themselves
    # rather than the number of records.
    total_results_found = sum(
        len(record["results"])
        if isinstance(record, dict) and isinstance(record.get("results"), list)
        else 1
        for record in search_records
    )

    control_log = {
        "research_topic": self.log_data.get("topic", "Unknown"),
        "search_method": "adaptive_search" if controls else "fixed_search",
        "total_iterations": len(controls),
        "total_results_found": total_results_found,
        "decisions": controls,
        "summary": self._generate_search_control_summary()
    }

    timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"search_control_log_{timestamp}.json"
    filepath = os.path.join(output_dir, filename)

    with open(filepath, 'w', encoding='utf-8') as f:
        json.dump(control_log, f, indent=2, ensure_ascii=False)

    console.print(f"[green]Search control log exported to: {filepath}[/green]")
    return filepath


def _generate_search_control_summary(self) -> Dict[str, Any]:
    """Summarise the control decisions stored in ``self.log_data["search_controls"]``.

    Returns:
        ``{"message": ...}`` when no decisions were recorded; otherwise a dict
        with the iteration count, whether the last decision was to stop, the
        stop reason, the mean confidence (2 decimals), the per-iteration
        confidences and the cumulative result counts.
    """
    controls = self.log_data.get("search_controls", [])
    if not controls:
        return {"message": "No adaptive search control data available"}

    decisions = [(entry.get("control") or {}) for entry in controls]
    last_decision = decisions[-1]

    # A final "continue" decision means the loop ran out of iterations.
    stopped_early = not last_decision.get("continue", True)
    if stopped_early:
        stop_reason = last_decision.get("reason", "AI decided to stop")
    else:
        stop_reason = "Maximum iterations reached"

    confidences = [
        value if isinstance(value, (int, float)) else 5
        for value in (decision.get("confidence", 5) for decision in decisions)
    ]
    avg_confidence = sum(confidences) / len(confidences)

    return {
        "total_iterations": len(controls),
        "stopped_early": stopped_early,
        "stop_reason": stop_reason,
        "average_confidence": round(avg_confidence, 2),
        "confidence_trend": confidences,
        "results_per_iteration": [entry.get("total_results", 0) for entry in controls]
    }