Asyncio support #100

Open
wants to merge 5 commits into master
15 changes: 15 additions & 0 deletions README.md
@@ -74,3 +74,18 @@ j = search("proxy test", num_results=100, lang="en", proxy=proxy, ssl_verify=Fal
for i in j:
    print(i)
```

The asyncio implementation disables the `ssl_verify` key, which does not appear to be accepted by httpx.
A simple example:
```python
import asyncio
from googlesearch import asearch

async def main():
    proxy = 'http://API:@proxy.host.com:8080'
    r = asearch("hello world", advanced=True, proxy=proxy)
    async for i in r:
        print(i)

asyncio.run(main())
```
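If certificate verification still needs to be configurable, httpx exposes a `verify` argument on its clients (a bool, a CA bundle path, or an `ssl.SSLContext`). Below is a minimal sketch of how `ssl_verify` could be forwarded; it is not part of this patch, and the helper name `_areq_verified` is only illustrative:
```python
import httpx

async def _areq_verified(term, ssl_verify=None, proxy=None, timeout=5):
    # httpx verifies certificates by default; forward the caller's setting
    # through `verify` when one is given (bool, CA bundle path, or SSLContext).
    verify = True if ssl_verify is None else ssl_verify
    async with httpx.AsyncClient(proxy=proxy, verify=verify) as client:
        resp = await client.get(
            "https://www.google.com/search",
            params={"q": term},
            timeout=timeout,
        )
        resp.raise_for_status()
        return resp
```
With a change along these lines, passing `ssl_verify=False` would disable certificate checks in the async path as well.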
119 changes: 108 additions & 11 deletions googlesearch/__init__.py
@@ -1,11 +1,118 @@
"""googlesearch is a Python library for searching Google, easily."""
import asyncio
import httpx
from time import sleep
from bs4 import BeautifulSoup
from requests import get
from urllib.parse import unquote # to decode the url
from .user_agents import get_useragent


async def _areq(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
    async with httpx.AsyncClient(proxy=proxies) as client:
        resp = await client.get(
            url="https://www.google.com/search",
            headers={
                "User-Agent": get_useragent(),
                "Accept": "*/*"
            },
            params={
                "q": term,
                "num": results + 2,  # Prevents multiple requests
                "hl": lang,
                "start": start,
                "safe": safe,
                "gl": region,
            },
            timeout=timeout,
            cookies={
                'CONSENT': 'PENDING+987',  # Bypasses the consent page
                'SOCS': 'CAESHAgBEhIaAB',
            }
        )
        resp.raise_for_status()
        return resp


class SearchResult:
    def __init__(self, url, title, description):
        self.url = url
        self.title = title
        self.description = description

    def __repr__(self):
        return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"


async def asearch(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
    """Search the Google search engine asynchronously, yielding results as they are found."""

    # Proxy setup
    proxies = proxy if proxy and (proxy.startswith("https") or proxy.startswith("http") or proxy.startswith("socks")) else None

    start = start_num
    fetched_results = 0  # Keep track of the total fetched results
    fetched_links = set()  # Keep track of links that have already been seen

    while fetched_results < num_results:
        # Send request
        resp = await _areq(term, num_results - start,
                           lang, start, proxies, timeout, safe, ssl_verify, region)

        # Write the response to a file - uncomment for debugging purposes
        # with open('google.html', 'w') as f:
        #     f.write(resp.text)

        # Parse
        soup = BeautifulSoup(resp.text, "html.parser")
        result_block = soup.find_all("div", class_="ezO2md")
        new_results = 0  # Keep track of new results in this iteration

        for result in result_block:
            # Find the link tag within the result block
            link_tag = result.find("a", href=True)
            # Find the title tag within the link tag
            title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None
            # Find the description tag within the result block
            description_tag = result.find("span", class_="FrIlee")

            # Check if all necessary tags are found
            if link_tag and title_tag and description_tag:
                # Extract and decode the link URL
                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", "")) if link_tag else ""
                # Check if the link has already been fetched and if unique results are required
                if link in fetched_links and unique:
                    continue  # Skip this result if the link is not unique
                # Add the link to the set of fetched links
                fetched_links.add(link)
                # Extract the title text
                title = title_tag.text if title_tag else ""
                # Extract the description text
                description = description_tag.text if description_tag else ""
                # Increment the count of fetched results
                fetched_results += 1
                # Increment the count of new results in this iteration
                new_results += 1
                # Yield the result based on the advanced flag
                if advanced:
                    yield SearchResult(link, title, description)  # Yield a SearchResult object
                else:
                    yield link  # Yield only the link

                if fetched_results >= num_results:
                    break  # Stop if we have fetched the desired number of results

        if new_results == 0:
            # To print a notice when fewer results are found than requested, uncomment the line below:
            # print(f"Only {fetched_results} results found for query requiring {num_results} results. Moving on to the next query.")
            break  # Break the loop if no new results were found in this iteration

        start += 10  # Prepare for the next set of results
        await asyncio.sleep(sleep_interval)


def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
    resp = get(
        url="https://www.google.com/search",
@@ -33,16 +140,6 @@ def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region)
    return resp


class SearchResult:
    def __init__(self, url, title, description):
        self.url = url
        self.title = title
        self.description = description

    def __repr__(self):
        return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"


def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
    """Search the Google search engine"""

@@ -109,4 +206,4 @@ def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_in
            break  # Break the loop if no new results were found in this iteration

        start += 10  # Prepare for the next set of results
        sleep(sleep_interval)
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,2 +1,3 @@
beautifulsoup4>=4.9
requests>=2.20
httpx