Skip to content

Commit

Permalink
fix: search graph
Browse files (browse the repository at this point in the history)
  • Loading branch information
VinciGit00 committed Jan 3, 2025
1 parent a9569ac commit d4b2679
Show file tree
Hide file tree
Showing 3 changed files with 16 additions and 3 deletions.
12 changes: 11 additions & 1 deletion scrapegraphai/nodes/merge_answers_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,17 @@ def execute(self, state: dict) -> dict:

merge_chain = prompt_template | self.llm_model | output_parser
answer = merge_chain.invoke({"user_prompt": user_prompt})
- answer["sources"] = state.get("urls", [])
+
+ # Get the URLs from the state, ensuring we get the actual URLs used for scraping
+ urls = []
+ if "urls" in state:
+ urls = state["urls"]
+ elif "considered_urls" in state:
+ urls = state["considered_urls"]
+
+ # Only add sources if we actually have URLs
+ if urls:
+ answer["sources"] = urls

state.update({self.output[0]: answer})
return state
3 changes: 3 additions & 0 deletions scrapegraphai/nodes/search_internet_node.py
Original file line number Diff line number Diff line change
Expand Up @@ -99,5 +99,8 @@ def execute(self, state: dict) -> dict:
if len(answer) == 0:
raise ValueError("Zero results found for the search query.")

+ # Store both the URLs and considered_urls in the state
state.update({self.output[0]: answer})
+ state["considered_urls"] = answer # Add this as a backup
+
return state
4 changes: 2 additions & 2 deletions scrapegraphai/utils/research_web.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ def search_on_web(query: str, search_engine: str = "Google",
research = DuckDuckGoSearchResults(max_results=max_results)
res = research.run(query)
links = re.findall(r'https?://[^\s,\]]+', res)
- return links
+ return links[:max_results]

elif search_engine.lower() == "bing":
headers = {
Expand All @@ -66,7 +66,7 @@ def search_on_web(query: str, search_engine: str = "Google",
response = requests.get(url, params=params)

data = response.json()
- limited_results = data["results"][:max_results]
+ limited_results = [result['url'] for result in data["results"][:max_results]]
return limited_results

else:
Expand Down

0 comments on commit d4b2679

Please sign in to comment.