diff --git a/README.md b/README.md
index dd1ba4d..25815c8 100644
--- a/README.md
+++ b/README.md
@@ -74,3 +74,18 @@ j = search("proxy test", num_results=100, lang="en", proxy=proxy, ssl_verify=Fal
 for i in j:
     print(i)
 ```
+
+The asyncio implementation ignores the `ssl_verify` argument, which httpx does not appear to accept.
+A simple example:
+```python
+import asyncio
+from googlesearch import asearch
+
+async def main():
+    proxy = 'http://API:@proxy.host.com:8080'
+    r = asearch("hello world", advanced=True, proxy=proxy)
+    async for i in r:
+        print(i)
+
+asyncio.run(main())
+```
\ No newline at end of file
diff --git a/googlesearch/__init__.py b/googlesearch/__init__.py
index 5838b02..597683e 100644
--- a/googlesearch/__init__.py
+++ b/googlesearch/__init__.py
@@ -1,4 +1,6 @@
 """googlesearch is a Python library for searching Google, easily."""
+import asyncio
+import httpx
 from time import sleep
 from bs4 import BeautifulSoup
 from requests import get
@@ -6,6 +8,111 @@ from .user_agents import get_useragent
+
+async def _areq(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
+    async with httpx.AsyncClient(proxy=proxies) as client:
+        resp = await client.get(
+            url="https://www.google.com/search",
+            headers={
+                "User-Agent": get_useragent(),
+                "Accept": "*/*"
+            },
+            params={
+                "q": term,
+                "num": results + 2,  # Prevents multiple requests
+                "hl": lang,
+                "start": start,
+                "safe": safe,
+                "gl": region,
+            },
+            timeout=timeout,
+            cookies = {
+                'CONSENT': 'PENDING+987',  # Bypasses the consent page
+                'SOCS': 'CAESHAgBEhIaAB',
+            }
+        )
+        resp.raise_for_status()
+        return resp
+
+
+class SearchResult:
+    def __init__(self, url, title, description):
+        self.url = url
+        self.title = title
+        self.description = description
+
+    def __repr__(self):
+        return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
+
+
+async def asearch(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
+    """Search the Google search engine"""
+
+    # Proxy setup
+    proxies = proxy if proxy and (proxy.startswith("https") or proxy.startswith("http") or proxy.startswith("socks")) else None
+
+    start = start_num
+    fetched_results = 0  # Keep track of the total fetched results
+    fetched_links = set()  # to keep track of links that are already seen previously
+
+    while fetched_results < num_results:
+        # Send request
+        resp = await _areq(term, num_results - start,
+                           lang, start, proxies, timeout, safe, ssl_verify, region)
+
+        # put in file - comment for debugging purpose
+        # with open('google.html', 'w') as f:
+        #     f.write(resp.text)
+
+        # Parse
+        soup = BeautifulSoup(resp.text, "html.parser")
+        result_block = soup.find_all("div", class_="ezO2md")
+        new_results = 0  # Keep track of new results in this iteration
+
+        for result in result_block:
+            # Find the link tag within the result block
+            link_tag = result.find("a", href=True)
+            # Find the title tag within the link tag
+            title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None
+            # Find the description tag within the result block
+            description_tag = result.find("span", class_="FrIlee")
+
+            # Check if all necessary tags are found
+            if link_tag and title_tag and description_tag:
+                # Extract and decode the link URL
+                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", "")) if link_tag else ""
+                # Extract and decode the link URL
+                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", "")) if link_tag else ""
+                # Check if the link has already been fetched and if unique results are required
+                if link in fetched_links and unique:
+                    continue  # Skip this result if the link is not unique
+                # Add the link to the set of fetched links
+                fetched_links.add(link)
+                # Extract the title text
+                title = title_tag.text if title_tag else ""
+                # Extract the description text
+                description = description_tag.text if description_tag else ""
+                # Increment the count of fetched results
+                fetched_results += 1
+                # Increment the count of new results in this iteration
+                new_results += 1
+                # Yield the result based on the advanced flag
+                if advanced:
+                    yield SearchResult(link, title, description)  # Yield a SearchResult object
+                else:
+                    yield link  # Yield only the link
+
+            if fetched_results >= num_results:
+                break  # Stop if we have fetched the desired number of results
+
+        if new_results == 0:
+            # If you want a notice printed when the desired number of results cannot be fulfilled, uncomment the line below:
+            # print(f"Only {fetched_results} results found for query requiring {num_results} results. Moving on to the next query.")
+            break  # Break the loop if no new results were found in this iteration
+
+        start += 10  # Prepare for the next set of results
+        await asyncio.sleep(sleep_interval)
+
+
 def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
     resp = get(
         url="https://www.google.com/search",
@@ -33,16 +140,6 @@ def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region)
     return resp
 
 
-class SearchResult:
-    def __init__(self, url, title, description):
-        self.url = url
-        self.title = title
-        self.description = description
-
-    def __repr__(self):
-        return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
-
-
 def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
     """Search the Google search engine"""
 
@@ -109,4 +206,4 @@ def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_in
             break  # Break the loop if no new results were found in this iteration
 
         start += 10  # Prepare for the next set of results
-        sleep(sleep_interval)
+        sleep(sleep_interval)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index 56399db..3fd37ca 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 beautifulsoup4>=4.9
 requests>=2.20
+httpx
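
Beyond the `advanced=True` proxy example in the README hunk, a minimal usage sketch of the new `asearch` generator in its default mode (bare links, no proxy) might look like the following; the query string and result count are illustrative only, and the `num_results` and `unique` parameters are taken from the `asearch` signature added above:

```python
import asyncio
from googlesearch import asearch

async def main():
    # advanced defaults to False, so asearch yields plain URL strings
    # rather than SearchResult objects; unique=True skips repeated links.
    async for url in asearch("python asyncio tutorial", num_results=5, unique=True):
        print(url)

asyncio.run(main())
```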