From 1c4246725c0c49a19d004c4a419c950ab745cf1f Mon Sep 17 00:00:00 2001
From: edgeinfinity1
Date: Sun, 26 Jan 2025 21:00:37 +0800
Subject: [PATCH 1/5] modified: googlesearch/__init__.py

---
 googlesearch/__init__.py | 55 ++++++++++++++++++++--------------------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/googlesearch/__init__.py b/googlesearch/__init__.py
index 5838b02..1749861 100644
--- a/googlesearch/__init__.py
+++ b/googlesearch/__init__.py
@@ -1,4 +1,6 @@
 """googlesearch is a Python library for searching Google, easily."""
+import asyncio
+import httpx
 from time import sleep
 from bs4 import BeautifulSoup
 from requests import get
@@ -6,29 +8,28 @@ from .user_agents import get_useragent
 
 
-def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
-    resp = get(
-        url="https://www.google.com/search",
-        headers={
-            "User-Agent": get_useragent(),
-            "Accept": "*/*"
-        },
-        params={
-            "q": term,
-            "num": results + 2,  # Prevents multiple requests
-            "hl": lang,
-            "start": start,
-            "safe": safe,
-            "gl": region,
-        },
-        proxies=proxies,
-        timeout=timeout,
-        verify=ssl_verify,
-        cookies = {
-            'CONSENT': 'PENDING+987',  # Bypasses the consent page
-            'SOCS': 'CAESHAgBEhIaAB',
-        }
-    )
+async def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
+    async with httpx.AsyncClient(proxy=proxies) as client:
+        resp = await client.get(
+            url="https://www.google.com/search",
+            headers={
+                "User-Agent": get_useragent(),
+                "Accept": "*/*"
+            },
+            params={
+                "q": term,
+                "num": results + 2,  # Prevents multiple requests
+                "hl": lang,
+                "start": start,
+                "safe": safe,
+                "gl": region,
+            },
+            timeout=timeout,
+            cookies = {
+                'CONSENT': 'PENDING+987',  # Bypasses the consent page
+                'SOCS': 'CAESHAgBEhIaAB',
+            }
+        )
     resp.raise_for_status()
     return resp
 
 
@@ -43,11 +44,11 @@ def __repr__(self):
         return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
 
 
-def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
+async def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
     """Search the Google search engine"""
 
     # Proxy setup
-    proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None
+    proxies = proxy if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None
 
     start = start_num
     fetched_results = 0  # Keep track of the total fetched results
@@ -55,7 +56,7 @@ def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_in
 
     while fetched_results < num_results:
         # Send request
-        resp = _req(term, num_results - start,
+        resp = await _req(term, num_results - start,
                    lang, start, proxies, timeout, safe, ssl_verify, region)
 
         # put in file - comment for debugging purpose
@@ -109,4 +110,4 @@ def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_in
             break  # Break the loop if no new results were found in this iteration
 
         start += 10  # Prepare for the next set of results
-        sleep(sleep_interval)
+        await asyncio.sleep(sleep_interval)

From 683e77b202a07b72f7b3927440bb8250b59a6295 Mon Sep 17 00:00:00 2001
From: edgeinfinity1
Date: Sun, 26 Jan 2025 21:02:54 +0800
Subject: [PATCH 2/5] modified: requirements.txt

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 56399db..af8f10f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
 beautifulsoup4>=4.9
-requests>=2.20
+httpx

From 9c20eb5a0a8868cc84f2ba13f04e4f4c44ea4f88 Mon Sep 17 00:00:00 2001
From: edgeinfinity1
Date: Sun, 26 Jan 2025 21:06:19 +0800
Subject: [PATCH 3/5] readme

---
 README.md | 15 +++++++++++++++
 1 file changed, 15 insertions(+)

diff --git a/README.md b/README.md
index dd1ba4d..16f8639 100644
--- a/README.md
+++ b/README.md
@@ -74,3 +74,18 @@ j = search("proxy test", num_results=100, lang="en", proxy=proxy, ssl_verify=Fal
 for i in j:
     print(i)
 ```
+
+The asyncio implementation disables the `ssl_verify` key, which httpx does not appear to accept.
+A simple example:
+```python
+import asyncio
+from googlesearch import search
+
+async def main():
+    proxy='http://API:@proxy.host.com:8080'
+    r = search("hello world", advanced=True, proxy=proxy)
+    async for i in r:
+        print(i)
+
+r = asyncio.run(main())
+```
\ No newline at end of file

From 7a852b752f0dc71355a014445bc5a8dc76cbf83b Mon Sep 17 00:00:00 2001
From: edgeinfinity1
Date: Sun, 26 Jan 2025 21:16:50 +0800
Subject: [PATCH 4/5] socks

---
 googlesearch/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/googlesearch/__init__.py b/googlesearch/__init__.py
index 1749861..080fe9a 100644
--- a/googlesearch/__init__.py
+++ b/googlesearch/__init__.py
@@ -48,7 +48,7 @@ async def search(term, num_results=10, lang="en", proxy=None, advanced=False, sl
     """Search the Google search engine"""
 
     # Proxy setup
-    proxies = proxy if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None
+    proxies = proxy if proxy and (proxy.startswith("https") or proxy.startswith("http") or proxy.startswith("socks")) else None
 
     start = start_num
     fetched_results = 0  # Keep track of the total fetched results

From 8b101453b81d5de2d5b5d373254220040ada929e Mon Sep 17 00:00:00 2001
From: edgeinfinity1
Date: Tue, 28 Jan 2025 18:47:36 +0800
Subject: [PATCH 5/5] compatible

---
 README.md                |   4 +-
 googlesearch/__init__.py | 102 +++++++++++++++++++++++++++++++++++++--
 requirements.txt         |   1 +
 3 files changed, 102 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 16f8639..25815c8 100644
--- a/README.md
+++ b/README.md
@@ -79,11 +79,11 @@ The asyncio implementation disables the `ssl_verify` key, which httpx does not appear to accept.
 A simple example:
 ```python
 import asyncio
-from googlesearch import search
+from googlesearch import asearch
 
 async def main():
     proxy='http://API:@proxy.host.com:8080'
-    r = search("hello world", advanced=True, proxy=proxy)
+    r = asearch("hello world", advanced=True, proxy=proxy)
     async for i in r:
         print(i)
 
diff --git a/googlesearch/__init__.py b/googlesearch/__init__.py
index 080fe9a..597683e 100644
--- a/googlesearch/__init__.py
+++ b/googlesearch/__init__.py
@@ -8,7 +8,7 @@ from .user_agents import get_useragent
 
 
-async def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
+async def _areq(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
     async with httpx.AsyncClient(proxy=proxies) as client:
         resp = await client.get(
             url="https://www.google.com/search",
             headers={
                 "User-Agent": get_useragent(),
                 "Accept": "*/*"
             },
@@ -44,7 +44,7 @@ def __repr__(self):
         return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
 
 
-async def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
+async def asearch(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False): """Search the Google search engine""" # Proxy setup @@ -56,7 +56,7 @@ async def search(term, num_results=10, lang="en", proxy=None, advanced=False, sl while fetched_results < num_results: # Send request - resp = await _req(term, num_results - start, + resp = await _areq(term, num_results - start, lang, start, proxies, timeout, safe, ssl_verify, region) # put in file - comment for debugging purpose @@ -111,3 +111,99 @@ async def search(term, num_results=10, lang="en", proxy=None, advanced=False, sl start += 10 # Prepare for the next set of results await asyncio.sleep(sleep_interval) + + +def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region): + resp = get( + url="https://www.google.com/search", + headers={ + "User-Agent": get_useragent(), + "Accept": "*/*" + }, + params={ + "q": term, + "num": results + 2, # Prevents multiple requests + "hl": lang, + "start": start, + "safe": safe, + "gl": region, + }, + proxies=proxies, + timeout=timeout, + verify=ssl_verify, + cookies = { + 'CONSENT': 'PENDING+987', # Bypasses the consent page + 'SOCS': 'CAESHAgBEhIaAB', + } + ) + resp.raise_for_status() + return resp + + +def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False): + """Search the Google search engine""" + + # Proxy setup + proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None + + start = start_num + fetched_results = 0 # Keep track of the total fetched results + fetched_links = set() # to keep track of links that are already seen previously + + while fetched_results < num_results: + # Send request + resp = _req(term, num_results - start, + lang, start, proxies, timeout, safe, ssl_verify, region) + + # put in file - comment for debugging purpose + # with open('google.html', 'w') as f: + # f.write(resp.text) + + # Parse + soup = BeautifulSoup(resp.text, "html.parser") + result_block = soup.find_all("div", class_="ezO2md") + new_results = 0 # Keep track of new results in this iteration + + for result in result_block: + # Find the link tag within the result block + link_tag = result.find("a", href=True) + # Find the title tag within the link tag + title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None + # Find the description tag within the result block + description_tag = result.find("span", class_="FrIlee") + + # Check if all necessary tags are found + if link_tag and title_tag and description_tag: + # Extract and decode the link URL + link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", "")) if link_tag else "" + # Extract and decode the link URL + link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", "")) if link_tag else "" + # Check if the link has already been fetched and if unique results are required + if link in fetched_links and unique: + continue # Skip this result if the link is not unique + # Add the link to the set of fetched links + fetched_links.add(link) + # Extract the title text + title = title_tag.text if title_tag else "" + # Extract the description text + description = description_tag.text if description_tag else "" + # Increment the count of fetched results + fetched_results += 1 + # Increment the count of new results in this iteration + new_results += 1 + # Yield the result based on 
+                if advanced:
+                    yield SearchResult(link, title, description)  # Yield a SearchResult object
+                else:
+                    yield link  # Yield only the link
+
+        if fetched_results >= num_results:
+            break  # Stop if we have fetched the desired number of results
+
+        if new_results == 0:
+            #If you want to have printed to your screen that the desired amount of queries can not been fulfilled, uncomment the line below:
+            #print(f"Only {fetched_results} results found for query requiring {num_results} results. Moving on to the next query.")
+            break  # Break the loop if no new results were found in this iteration
+
+        start += 10  # Prepare for the next set of results
+        sleep(sleep_interval)
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index af8f10f..3fd37ca 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 beautifulsoup4>=4.9
+requests>=2.20
 httpx
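
For reference, below is a minimal usage sketch of the module as it stands after PATCH 5/5, based only on the diffs above and not part of the patch series itself: `asearch` is the new async generator and `search` is the restored synchronous generator. The query string and `num_results` value are arbitrary placeholders.

```python
import asyncio

from googlesearch import asearch, search

# Synchronous path: `search` is a plain generator, unchanged from the original API.
for url in search("hello world", num_results=5):
    print(url)

# Asynchronous path: `asearch` is an async generator, so it is consumed with
# `async for` inside a coroutine and driven by asyncio.run(). With advanced=True
# it yields SearchResult objects carrying url, title, and description.
async def main():
    async for result in asearch("hello world", num_results=5, advanced=True):
        print(result.url, result.title)

asyncio.run(main())
```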