From 1c4246725c0c49a19d004c4a419c950ab745cf1f Mon Sep 17 00:00:00 2001
From: edgeinfinity1
Date: Sun, 26 Jan 2025 21:00:37 +0800
Subject: [PATCH 1/9] modified: googlesearch/__init__.py

---
 googlesearch/__init__.py | 55 ++++++++++++++++++++--------------------
 1 file changed, 28 insertions(+), 27 deletions(-)

diff --git a/googlesearch/__init__.py b/googlesearch/__init__.py
index 5838b02..1749861 100644
--- a/googlesearch/__init__.py
+++ b/googlesearch/__init__.py
@@ -1,4 +1,6 @@
 """googlesearch is a Python library for searching Google, easily."""
+import asyncio
+import httpx
 from time import sleep
 from bs4 import BeautifulSoup
 from requests import get
@@ -6,29 +8,28 @@
 from .user_agents import get_useragent
 
 
-def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
-    resp = get(
-        url="https://www.google.com/search",
-        headers={
-            "User-Agent": get_useragent(),
-            "Accept": "*/*"
-        },
-        params={
-            "q": term,
-            "num": results + 2,  # Prevents multiple requests
-            "hl": lang,
-            "start": start,
-            "safe": safe,
-            "gl": region,
-        },
-        proxies=proxies,
-        timeout=timeout,
-        verify=ssl_verify,
-        cookies = {
-            'CONSENT': 'PENDING+987',  # Bypasses the consent page
-            'SOCS': 'CAESHAgBEhIaAB',
-        }
-    )
+async def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
+    async with httpx.AsyncClient(proxy=proxies) as client:
+        resp = await client.get(
+            url="https://www.google.com/search",
+            headers={
+                "User-Agent": get_useragent(),
+                "Accept": "*/*"
+            },
+            params={
+                "q": term,
+                "num": results + 2,  # Prevents multiple requests
+                "hl": lang,
+                "start": start,
+                "safe": safe,
+                "gl": region,
+            },
+            timeout=timeout,
+            cookies = {
+                'CONSENT': 'PENDING+987',  # Bypasses the consent page
+                'SOCS': 'CAESHAgBEhIaAB',
+            }
+        )
     resp.raise_for_status()
     return resp
 
@@ -43,11 +44,11 @@ def __repr__(self):
         return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
 
 
-def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
+async def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
     """Search the Google search engine"""
 
     # Proxy setup
-    proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None
+    proxies = proxy if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None
 
     start = start_num
     fetched_results = 0  # Keep track of the total fetched results
@@ -55,7 +56,7 @@ def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_in
 
     while fetched_results < num_results:
         # Send request
-        resp = _req(term, num_results - start,
+        resp = await _req(term, num_results - start,
                    lang, start, proxies, timeout, safe, ssl_verify, region)
 
         # put in file - comment for debugging purpose
@@ -109,4 +110,4 @@ def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_in
             break  # Break the loop if no new results were found in this iteration
 
         start += 10  # Prepare for the next set of results
-        sleep(sleep_interval)
+        await asyncio.sleep(sleep_interval)
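With this first patch `search()` becomes an async generator (it awaits `_req` and `asyncio.sleep`), so it can no longer be iterated synchronously. A minimal consumption sketch, assuming the package as it stands after patch 1; the query string and the `main` wrapper are illustrative only:

```python
import asyncio

from googlesearch import search  # an async generator after this patch


async def main():
    # async generators are driven with `async for` inside a running event loop
    async for url in search("python asyncio tutorial", num_results=5):
        print(url)

asyncio.run(main())
```

This is a breaking change for existing synchronous callers; patch 5 below restores a synchronous `search()` alongside the async variant.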
From 683e77b202a07b72f7b3927440bb8250b59a6295 Mon Sep 17 00:00:00 2001
From: edgeinfinity1
Date: Sun, 26 Jan 2025 21:02:54 +0800
Subject: [PATCH 2/9] modified: requirements.txt

---
 requirements.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/requirements.txt b/requirements.txt
index 56399db..af8f10f 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,2 @@
 beautifulsoup4>=4.9
-requests>=2.20
+httpx
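Patch 2 swaps the `requests` dependency for `httpx`, which provides the async client used in `_req`. A rough sketch of the underlying call pattern, with a placeholder URL and none of the Google-specific parameters (not the library's actual request):

```python
import asyncio
import httpx


async def fetch(url: str) -> str:
    # AsyncClient plays the role requests.get() used to fill, but awaitably
    async with httpx.AsyncClient() as client:
        resp = await client.get(url, timeout=5)
        resp.raise_for_status()
        return resp.text

print(asyncio.run(fetch("https://example.com"))[:80])
```

Note that `requests` is reintroduced in patch 5, when the synchronous code path returns.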
lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False): """Search the Google search engine""" # Proxy setup @@ -56,7 +56,7 @@ async def search(term, num_results=10, lang="en", proxy=None, advanced=False, sl while fetched_results < num_results: # Send request - resp = await _req(term, num_results - start, + resp = await _areq(term, num_results - start, lang, start, proxies, timeout, safe, ssl_verify, region) # put in file - comment for debugging purpose @@ -111,3 +111,99 @@ async def search(term, num_results=10, lang="en", proxy=None, advanced=False, sl start += 10 # Prepare for the next set of results await asyncio.sleep(sleep_interval) + + +def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region): + resp = get( + url="https://www.google.com/search", + headers={ + "User-Agent": get_useragent(), + "Accept": "*/*" + }, + params={ + "q": term, + "num": results + 2, # Prevents multiple requests + "hl": lang, + "start": start, + "safe": safe, + "gl": region, + }, + proxies=proxies, + timeout=timeout, + verify=ssl_verify, + cookies = { + 'CONSENT': 'PENDING+987', # Bypasses the consent page + 'SOCS': 'CAESHAgBEhIaAB', + } + ) + resp.raise_for_status() + return resp + + +def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False): + """Search the Google search engine""" + + # Proxy setup + proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None + + start = start_num + fetched_results = 0 # Keep track of the total fetched results + fetched_links = set() # to keep track of links that are already seen previously + + while fetched_results < num_results: + # Send request + resp = _req(term, num_results - start, + lang, start, proxies, timeout, safe, ssl_verify, region) + + # put in file - comment for debugging purpose + # with open('google.html', 'w') as f: + # f.write(resp.text) + + # Parse + soup = BeautifulSoup(resp.text, "html.parser") + result_block = soup.find_all("div", class_="ezO2md") + new_results = 0 # Keep track of new results in this iteration + + for result in result_block: + # Find the link tag within the result block + link_tag = result.find("a", href=True) + # Find the title tag within the link tag + title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None + # Find the description tag within the result block + description_tag = result.find("span", class_="FrIlee") + + # Check if all necessary tags are found + if link_tag and title_tag and description_tag: + # Extract and decode the link URL + link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", "")) if link_tag else "" + # Extract and decode the link URL + link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", "")) if link_tag else "" + # Check if the link has already been fetched and if unique results are required + if link in fetched_links and unique: + continue # Skip this result if the link is not unique + # Add the link to the set of fetched links + fetched_links.add(link) + # Extract the title text + title = title_tag.text if title_tag else "" + # Extract the description text + description = description_tag.text if description_tag else "" + # Increment the count of fetched results + fetched_results += 1 + # Increment the count of new results in this iteration + new_results += 1 + # Yield the result based on 
From 8b101453b81d5de2d5b5d373254220040ada929e Mon Sep 17 00:00:00 2001
From: edgeinfinity1
Date: Tue, 28 Jan 2025 18:47:36 +0800
Subject: [PATCH 5/9] compatible

---
 README.md                |   4 +-
 googlesearch/__init__.py | 102 +++++++++++++++++++++++++++++++++++++--
 requirements.txt         |   1 +
 3 files changed, 102 insertions(+), 5 deletions(-)

diff --git a/README.md b/README.md
index 16f8639..25815c8 100644
--- a/README.md
+++ b/README.md
@@ -79,11 +79,11 @@ The asyncio implementation disables the `ssl_verify` key, which is seemingly not
 A simple example:
 ```python
 import asyncio
-from googlesearch import search
+from googlesearch import asearch
 
 async def main():
     proxy='http://API:@proxy.host.com:8080'
-    r = search("hello world", advanced=True, proxy=proxy)
+    r = asearch("hello world", advanced=True, proxy=proxy)
     async for i in r:
         print(i)
 
diff --git a/googlesearch/__init__.py b/googlesearch/__init__.py
index 080fe9a..597683e 100644
--- a/googlesearch/__init__.py
+++ b/googlesearch/__init__.py
@@ -8,7 +8,7 @@
 from .user_agents import get_useragent
 
 
-async def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
+async def _areq(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
     async with httpx.AsyncClient(proxy=proxies) as client:
         resp = await client.get(
             url="https://www.google.com/search",
@@ -44,7 +44,7 @@ def __repr__(self):
         return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
 
 
-async def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
+async def asearch(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
     """Search the Google search engine"""
 
     # Proxy setup
@@ -56,7 +56,7 @@ async def search(term, num_results=10, lang="en", proxy=None, advanced=False, sl
 
     while fetched_results < num_results:
         # Send request
-        resp = await _req(term, num_results - start,
+        resp = await _areq(term, num_results - start,
                    lang, start, proxies, timeout, safe, ssl_verify, region)
 
         # put in file - comment for debugging purpose
@@ -111,3 +111,99 @@ async def search(term, num_results=10, lang="en", proxy=None, advanced=False, sl
 
         start += 10  # Prepare for the next set of results
         await asyncio.sleep(sleep_interval)
+
+
+def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
+    resp = get(
+        url="https://www.google.com/search",
+        headers={
+            "User-Agent": get_useragent(),
+            "Accept": "*/*"
+        },
+        params={
+            "q": term,
+            "num": results + 2,  # Prevents multiple requests
+            "hl": lang,
+            "start": start,
+            "safe": safe,
+            "gl": region,
+        },
+        proxies=proxies,
+        timeout=timeout,
+        verify=ssl_verify,
+        cookies = {
+            'CONSENT': 'PENDING+987',  # Bypasses the consent page
+            'SOCS': 'CAESHAgBEhIaAB',
+        }
+    )
+    resp.raise_for_status()
+    return resp
+
+
+def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
+    """Search the Google search engine"""
+
+    # Proxy setup
+    proxies = {"https": proxy, "http": proxy} if proxy and (proxy.startswith("https") or proxy.startswith("http")) else None
+
+    start = start_num
+    fetched_results = 0  # Keep track of the total fetched results
+    fetched_links = set() # to keep track of links that are already seen previously
+
+    while fetched_results < num_results:
+        # Send request
+        resp = _req(term, num_results - start,
+                   lang, start, proxies, timeout, safe, ssl_verify, region)
+
+        # put in file - comment for debugging purpose
+        # with open('google.html', 'w') as f:
+        #     f.write(resp.text)
+
+        # Parse
+        soup = BeautifulSoup(resp.text, "html.parser")
+        result_block = soup.find_all("div", class_="ezO2md")
+        new_results = 0  # Keep track of new results in this iteration
+
+        for result in result_block:
+            # Find the link tag within the result block
+            link_tag = result.find("a", href=True)
+            # Find the title tag within the link tag
+            title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None
+            # Find the description tag within the result block
+            description_tag = result.find("span", class_="FrIlee")
+
+            # Check if all necessary tags are found
+            if link_tag and title_tag and description_tag:
+                # Extract and decode the link URL
+                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", "")) if link_tag else ""
+                # Extract and decode the link URL
+                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", "")) if link_tag else ""
+                # Check if the link has already been fetched and if unique results are required
+                if link in fetched_links and unique:
+                    continue  # Skip this result if the link is not unique
+                # Add the link to the set of fetched links
+                fetched_links.add(link)
+                # Extract the title text
+                title = title_tag.text if title_tag else ""
+                # Extract the description text
+                description = description_tag.text if description_tag else ""
+                # Increment the count of fetched results
+                fetched_results += 1
+                # Increment the count of new results in this iteration
+                new_results += 1
+                # Yield the result based on the advanced flag
+                if advanced:
+                    yield SearchResult(link, title, description)  # Yield a SearchResult object
+                else:
+                    yield link  # Yield only the link
+
+            if fetched_results >= num_results:
+                break  # Stop if we have fetched the desired number of results
+
+        if new_results == 0:
+            #If you want to have printed to your screen that the desired amount of queries can not been fulfilled, uncomment the line below:
+            #print(f"Only {fetched_results} results found for query requiring {num_results} results. Moving on to the next query.")
+            break  # Break the loop if no new results were found in this iteration
+
+        start += 10  # Prepare for the next set of results
+        sleep(sleep_interval)
\ No newline at end of file

diff --git a/requirements.txt b/requirements.txt
index af8f10f..3fd37ca 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 beautifulsoup4>=4.9
+requests>=2.20
 httpx
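After patch 5 the module exposes both entry points: the requests-based `def search(...)` generator for synchronous callers and the httpx-based `async def asearch(...)` for asyncio code. A small sketch of using the two side by side (queries and result counts are arbitrary, and the fork is assumed to be installed):

```python
import asyncio

from googlesearch import search, asearch


def sync_demo():
    # plain generator; blocks between pages via time.sleep
    for url in search("open source search scraper", num_results=3):
        print("sync :", url)


async def async_demo():
    # async generator; yields control to the event loop while waiting
    async for result in asearch("open source search scraper", num_results=3, advanced=True):
        print("async:", result.title, result.url)


sync_demo()
asyncio.run(async_demo())
```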
From 390b13e666c61793e57e9117a57426a22c9918b9 Mon Sep 17 00:00:00 2001
From: edgeinfinity1
Date: Mon, 2 Jun 2025 14:30:19 +0800
Subject: [PATCH 6/9] optimize

---
 googlesearch/__init__.py | 110 +--------------------------------------
 googlesearch/asearch.py  | 101 +++++++++++++++++++++++++++++++++++
 googlesearch/includes.py |   8 +++
 3 files changed, 111 insertions(+), 108 deletions(-)
 create mode 100644 googlesearch/asearch.py
 create mode 100644 googlesearch/includes.py

diff --git a/googlesearch/__init__.py b/googlesearch/__init__.py
index 597683e..d52ab1d 100644
--- a/googlesearch/__init__.py
+++ b/googlesearch/__init__.py
@@ -1,117 +1,12 @@
 """googlesearch is a Python library for searching Google, easily."""
 import asyncio
-import httpx
 from time import sleep
 from bs4 import BeautifulSoup
 from requests import get
 from urllib.parse import unquote # to decode the url
 from .user_agents import get_useragent
-
-
-async def _areq(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
-    async with httpx.AsyncClient(proxy=proxies) as client:
-        resp = await client.get(
-            url="https://www.google.com/search",
-            headers={
-                "User-Agent": get_useragent(),
-                "Accept": "*/*"
-            },
-            params={
-                "q": term,
-                "num": results + 2,  # Prevents multiple requests
-                "hl": lang,
-                "start": start,
-                "safe": safe,
-                "gl": region,
-            },
-            timeout=timeout,
-            cookies = {
-                'CONSENT': 'PENDING+987',  # Bypasses the consent page
-                'SOCS': 'CAESHAgBEhIaAB',
-            }
-        )
-    resp.raise_for_status()
-    return resp
-
-
-class SearchResult:
-    def __init__(self, url, title, description):
-        self.url = url
-        self.title = title
-        self.description = description
-
-    def __repr__(self):
-        return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
-
-
-async def asearch(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
-    """Search the Google search engine"""
-
-    # Proxy setup
-    proxies = proxy if proxy and (proxy.startswith("https") or proxy.startswith("http") or proxy.startswith("socks")) else None
-
-    start = start_num
-    fetched_results = 0  # Keep track of the total fetched results
-    fetched_links = set() # to keep track of links that are already seen previously
-
-    while fetched_results < num_results:
-        # Send request
-        resp = await _areq(term, num_results - start,
-                   lang, start, proxies, timeout, safe, ssl_verify, region)
-
-        # put in file - comment for debugging purpose
-        # with open('google.html', 'w') as f:
-        #     f.write(resp.text)
-
-        # Parse
-        soup = BeautifulSoup(resp.text, "html.parser")
-        result_block = soup.find_all("div", class_="ezO2md")
-        new_results = 0  # Keep track of new results in this iteration
-
-        for result in result_block:
-            # Find the link tag within the result block
-            link_tag = result.find("a", href=True)
-            # Find the title tag within the link tag
-            title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None
-            # Find the description tag within the result block
-            description_tag = result.find("span", class_="FrIlee")
-
-            # Check if all necessary tags are found
-            if link_tag and title_tag and description_tag:
-                # Extract and decode the link URL
-                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", "")) if link_tag else ""
-                # Extract and decode the link URL
-                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", "")) if link_tag else ""
-                # Check if the link has already been fetched and if unique results are required
-                if link in fetched_links and unique:
-                    continue  # Skip this result if the link is not unique
-                # Add the link to the set of fetched links
-                fetched_links.add(link)
-                # Extract the title text
-                title = title_tag.text if title_tag else ""
-                # Extract the description text
-                description = description_tag.text if description_tag else ""
-                # Increment the count of fetched results
-                fetched_results += 1
-                # Increment the count of new results in this iteration
-                new_results += 1
-                # Yield the result based on the advanced flag
-                if advanced:
-                    yield SearchResult(link, title, description)  # Yield a SearchResult object
-                else:
-                    yield link  # Yield only the link
-
-            if fetched_results >= num_results:
-                break  # Stop if we have fetched the desired number of results
-
-        if new_results == 0:
-            #If you want to have printed to your screen that the desired amount of queries can not been fulfilled, uncomment the line below:
-            #print(f"Only {fetched_results} results found for query requiring {num_results} results. Moving on to the next query.")
-            break  # Break the loop if no new results were found in this iteration
-
-        start += 10  # Prepare for the next set of results
-        await asyncio.sleep(sleep_interval)
-
+from .includes import SearchResult
+from .asearch import asearch
 
 def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
     resp = get(
@@ -139,7 +34,6 @@ def _req(term, results, lang, start, proxies, timeout, safe, ssl_verify, region)
     resp.raise_for_status()
     return resp
 
-
 def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
     """Search the Google search engine"""
 
diff --git a/googlesearch/asearch.py b/googlesearch/asearch.py
new file mode 100644
index 0000000..92762f7
--- /dev/null
+++ b/googlesearch/asearch.py
@@ -0,0 +1,101 @@
+import asyncio
+import httpx
+from time import sleep
+from bs4 import BeautifulSoup
+from urllib.parse import unquote # to decode the url
+from .user_agents import get_useragent
+from .includes import SearchResult
+
+async def _areq(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
+    async with httpx.AsyncClient(proxy=proxies) as client:
+        resp = await client.get(
+            url="https://www.google.com/search",
+            headers={
+                "User-Agent": get_useragent(),
+                "Accept": "*/*"
+            },
+            params={
+                "q": term,
+                "num": results + 2,  # Prevents multiple requests
+                "hl": lang,
+                "start": start,
+                "safe": safe,
+                "gl": region,
+            },
+            timeout=timeout,
+            cookies = {
+                'CONSENT': 'PENDING+987',  # Bypasses the consent page
+                'SOCS': 'CAESHAgBEhIaAB',
+            }
+        )
+    resp.raise_for_status()
+    return resp
+
+async def asearch(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
+    """Search the Google search engine"""
+
+    # Proxy setup
+    proxies = proxy if proxy and (proxy.startswith("https") or proxy.startswith("http") or proxy.startswith("socks")) else None
+
+    start = start_num
+    fetched_results = 0  # Keep track of the total fetched results
+    fetched_links = set() # to keep track of links that are already seen previously
+
+    while fetched_results < num_results:
+        # Send request
+        resp = await _areq(term, num_results - start,
+                   lang, start, proxies, timeout, safe, ssl_verify, region)
+
+        # put in file - comment for debugging purpose
+        # with open('google.html', 'w') as f:
+        #     f.write(resp.text)
+
+        # Parse
+        soup = BeautifulSoup(resp.text, "html.parser")
+        result_block = soup.find_all("div", class_="ezO2md")
+        new_results = 0  # Keep track of new results in this iteration
+
+        for result in result_block:
+            # Find the link tag within the result block
+            link_tag = result.find("a", href=True)
+            # Find the title tag within the link tag
+            title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None
+            # Find the description tag within the result block
+            description_tag = result.find("span", class_="FrIlee")
+
+            # Check if all necessary tags are found
+            if link_tag and title_tag and description_tag:
+                # Extract and decode the link URL
+                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", "")) if link_tag else ""
+                # Extract and decode the link URL
+                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", "")) if link_tag else ""
+                # Check if the link has already been fetched and if unique results are required
+                if link in fetched_links and unique:
+                    continue  # Skip this result if the link is not unique
+                # Add the link to the set of fetched links
+                fetched_links.add(link)
+                # Extract the title text
+                title = title_tag.text if title_tag else ""
+                # Extract the description text
+                description = description_tag.text if description_tag else ""
+                # Increment the count of fetched results
+                fetched_results += 1
+                # Increment the count of new results in this iteration
+                new_results += 1
+                # Yield the result based on the advanced flag
+                if advanced:
+                    yield SearchResult(link, title, description)  # Yield a SearchResult object
+                else:
+                    yield link  # Yield only the link
+
+            if fetched_results >= num_results:
+                break  # Stop if we have fetched the desired number of results
+
+        if new_results == 0:
+            #If you want to have printed to your screen that the desired amount of queries can not been fulfilled, uncomment the line below:
+            #print(f"Only {fetched_results} results found for query requiring {num_results} results. Moving on to the next query.")
+            break  # Break the loop if no new results were found in this iteration
+
+        start += 10  # Prepare for the next set of results
+        await asyncio.sleep(sleep_interval)
+
diff --git a/googlesearch/includes.py b/googlesearch/includes.py
new file mode 100644
index 0000000..330d696
--- /dev/null
+++ b/googlesearch/includes.py
@@ -0,0 +1,8 @@
+class SearchResult:
+    def __init__(self, url, title, description):
+        self.url = url
+        self.title = title
+        self.description = description
+
+    def __repr__(self):
+        return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
\ No newline at end of file
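Patch 6 splits the package into three modules: `includes.py` holds `SearchResult`, `asearch.py` holds the httpx coroutine code, and `__init__.py` keeps the synchronous path while re-importing the other two, so the public imports are unchanged. A quick sketch of what that layout means for callers (purely illustrative):

```python
# Same top-level imports as before the split, thanks to the re-exports in __init__.py
from googlesearch import search, asearch

# The pieces can also be imported from their new homes directly
from googlesearch.includes import SearchResult
from googlesearch.asearch import asearch as asearch_direct

result = SearchResult("https://example.com", "Example", "A placeholder result")
print(result)  # repr comes from includes.SearchResult
```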
From 39ad7dbfb08efcf12c2e9ded30de389461425b99 Mon Sep 17 00:00:00 2001
From: edgeinfinity1
Date: Sat, 13 Sep 2025 21:44:03 +0800
Subject: [PATCH 7/9] ?

---
 googlesearch/asearch.py  | 202 +++++++++++++++++++--------------------
 googlesearch/includes.py |  14 +--
 2 files changed, 108 insertions(+), 108 deletions(-)
 mode change 100644 => 100755 googlesearch/asearch.py
 mode change 100644 => 100755 googlesearch/includes.py

diff --git a/googlesearch/asearch.py b/googlesearch/asearch.py
old mode 100644
new mode 100755
index 92762f7..088174a
--- a/googlesearch/asearch.py
+++ b/googlesearch/asearch.py
@@ -1,101 +1,101 @@
-import asyncio
-import httpx
-from time import sleep
-from bs4 import BeautifulSoup
-from urllib.parse import unquote # to decode the url
-from .user_agents import get_useragent
-from .includes import SearchResult
-
-async def _areq(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
-    async with httpx.AsyncClient(proxy=proxies) as client:
-        resp = await client.get(
-            url="https://www.google.com/search",
-            headers={
-                "User-Agent": get_useragent(),
-                "Accept": "*/*"
-            },
-            params={
-                "q": term,
-                "num": results + 2,  # Prevents multiple requests
-                "hl": lang,
-                "start": start,
-                "safe": safe,
-                "gl": region,
-            },
-            timeout=timeout,
-            cookies = {
-                'CONSENT': 'PENDING+987',  # Bypasses the consent page
-                'SOCS': 'CAESHAgBEhIaAB',
-            }
-        )
-    resp.raise_for_status()
-    return resp
-
-async def asearch(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
-    """Search the Google search engine"""
-
-    # Proxy setup
-    proxies = proxy if proxy and (proxy.startswith("https") or proxy.startswith("http") or proxy.startswith("socks")) else None
-
-    start = start_num
-    fetched_results = 0  # Keep track of the total fetched results
-    fetched_links = set() # to keep track of links that are already seen previously
-
-    while fetched_results < num_results:
-        # Send request
-        resp = await _areq(term, num_results - start,
-                   lang, start, proxies, timeout, safe, ssl_verify, region)
-
-        # put in file - comment for debugging purpose
-        # with open('google.html', 'w') as f:
-        #     f.write(resp.text)
-
-        # Parse
-        soup = BeautifulSoup(resp.text, "html.parser")
-        result_block = soup.find_all("div", class_="ezO2md")
-        new_results = 0  # Keep track of new results in this iteration
-
-        for result in result_block:
-            # Find the link tag within the result block
-            link_tag = result.find("a", href=True)
-            # Find the title tag within the link tag
-            title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None
-            # Find the description tag within the result block
-            description_tag = result.find("span", class_="FrIlee")
-
-            # Check if all necessary tags are found
-            if link_tag and title_tag and description_tag:
-                # Extract and decode the link URL
-                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", "")) if link_tag else ""
-                # Extract and decode the link URL
-                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", "")) if link_tag else ""
-                # Check if the link has already been fetched and if unique results are required
-                if link in fetched_links and unique:
-                    continue  # Skip this result if the link is not unique
-                # Add the link to the set of fetched links
-                fetched_links.add(link)
-                # Extract the title text
-                title = title_tag.text if title_tag else ""
-                # Extract the description text
-                description = description_tag.text if description_tag else ""
-                # Increment the count of fetched results
-                fetched_results += 1
-                # Increment the count of new results in this iteration
-                new_results += 1
-                # Yield the result based on the advanced flag
-                if advanced:
-                    yield SearchResult(link, title, description)  # Yield a SearchResult object
-                else:
-                    yield link  # Yield only the link
-
-            if fetched_results >= num_results:
-                break  # Stop if we have fetched the desired number of results
-
-        if new_results == 0:
-            #If you want to have printed to your screen that the desired amount of queries can not been fulfilled, uncomment the line below:
-            #print(f"Only {fetched_results} results found for query requiring {num_results} results. Moving on to the next query.")
-            break  # Break the loop if no new results were found in this iteration
-
-        start += 10  # Prepare for the next set of results
-        await asyncio.sleep(sleep_interval)
-
+import asyncio
+import httpx
+from time import sleep
+from bs4 import BeautifulSoup
+from urllib.parse import unquote # to decode the url
+from .user_agents import get_useragent
+from .includes import SearchResult
+
+async def _areq(term, results, lang, start, proxies, timeout, safe, ssl_verify, region):
+    async with httpx.AsyncClient(proxy=proxies) as client:
+        resp = await client.get(
+            url="https://www.google.com/search",
+            headers={
+                "User-Agent": get_useragent(),
+                "Accept": "*/*"
+            },
+            params={
+                "q": term,
+                "num": results + 2,  # Prevents multiple requests
+                "hl": lang,
+                "start": start,
+                "safe": safe,
+                "gl": region,
+            },
+            timeout=timeout,
+            cookies = {
+                'CONSENT': 'PENDING+987',  # Bypasses the consent page
+                'SOCS': 'CAESHAgBEhIaAB',
+            }
+        )
+    resp.raise_for_status()
+    return resp
+
+async def asearch(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, safe="active", ssl_verify=None, region=None, start_num=0, unique=False):
+    """Search the Google search engine"""
+
+    # Proxy setup
+    proxies = proxy if proxy and (proxy.startswith("https") or proxy.startswith("http") or proxy.startswith("socks")) else None
+
+    start = start_num
+    fetched_results = 0  # Keep track of the total fetched results
+    fetched_links = set() # to keep track of links that are already seen previously
+
+    while fetched_results < num_results:
+        # Send request
+        resp = await _areq(term, num_results - start,
+                   lang, start, proxies, timeout, safe, ssl_verify, region)
+
+        # put in file - comment for debugging purpose
+        # with open('google.html', 'w') as f:
+        #     f.write(resp.text)
+
+        # Parse
+        soup = BeautifulSoup(resp.text, "html.parser")
+        result_block = soup.find_all("div", class_="ezO2md")
+        new_results = 0  # Keep track of new results in this iteration
+
+        for result in result_block:
+            # Find the link tag within the result block
+            link_tag = result.find("a", href=True)
+            # Find the title tag within the link tag
+            title_tag = link_tag.find("span", class_="CVA68e") if link_tag else None
+            # Find the description tag within the result block
+            description_tag = result.find("span", class_="FrIlee")
+
+            # Check if all necessary tags are found
+            if link_tag and title_tag and description_tag:
+                # Extract and decode the link URL
+                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", "")) if link_tag else ""
+                # Extract and decode the link URL
+                link = unquote(link_tag["href"].split("&")[0].replace("/url?q=", "")) if link_tag else ""
+                # Check if the link has already been fetched and if unique results are required
+                if link in fetched_links and unique:
+                    continue  # Skip this result if the link is not unique
+                # Add the link to the set of fetched links
+                fetched_links.add(link)
+                # Extract the title text
+                title = title_tag.text if title_tag else ""
+                # Extract the description text
+                description = description_tag.text if description_tag else ""
+                # Increment the count of fetched results
+                fetched_results += 1
+                # Increment the count of new results in this iteration
+                new_results += 1
+                # Yield the result based on the advanced flag
+                if advanced:
+                    yield SearchResult(link, title, description)  # Yield a SearchResult object
+                else:
+                    yield link  # Yield only the link
+
+            if fetched_results >= num_results:
+                break  # Stop if we have fetched the desired number of results
+
+        if new_results == 0:
+            #If you want to have printed to your screen that the desired amount of queries can not been fulfilled, uncomment the line below:
+            #print(f"Only {fetched_results} results found for query requiring {num_results} results. Moving on to the next query.")
+            break  # Break the loop if no new results were found in this iteration
+
+        start += 10  # Prepare for the next set of results
+        await asyncio.sleep(sleep_interval)
+
diff --git a/googlesearch/includes.py b/googlesearch/includes.py
old mode 100644
new mode 100755
index 330d696..2d325e1
--- a/googlesearch/includes.py
+++ b/googlesearch/includes.py
@@ -1,8 +1,8 @@
-class SearchResult:
-    def __init__(self, url, title, description):
-        self.url = url
-        self.title = title
-        self.description = description
-
-    def __repr__(self):
+class SearchResult:
+    def __init__(self, url, title, description):
+        self.url = url
+        self.title = title
+        self.description = description
+
+    def __repr__(self):
         return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
\ No newline at end of file

From c15a01b461b1e0825810c789109628f2d8afbb89 Mon Sep 17 00:00:00 2001
From: edgeinfinity1
Date: Wed, 17 Sep 2025 13:06:06 +0800
Subject: [PATCH 8/9] pname

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index 6f4bae5..d08fd6f 100644
--- a/setup.py
+++ b/setup.py
@@ -7,7 +7,7 @@
     requirements = fh.read().split("\n")
 
 setup(
-    name="googlesearch-python",
+    name="mi-googlesearch-python",
     version="1.3.0",
     author="Nishant Vikramaditya",
     author_email="junk4Nv7@gmail.com",

From 18f4e6bb25fab624a866014c7dd5f8c5dd3bebe5 Mon Sep 17 00:00:00 2001
From: edgeinfinity1
Date: Fri, 19 Sep 2025 01:05:30 +0800
Subject: [PATCH 9/9] lynx dropped

---
 googlesearch/user_agents.py | 18 +++---------------
 setup.py                    |  2 +-
 2 files changed, 4 insertions(+), 16 deletions(-)

diff --git a/googlesearch/user_agents.py b/googlesearch/user_agents.py
index a8bbd4f..fd8304a 100644
--- a/googlesearch/user_agents.py
+++ b/googlesearch/user_agents.py
@@ -2,19 +2,7 @@
 
 def get_useragent():
     """
-    Generates a random user agent string mimicking the format of various software versions.
-
-    The user agent string is composed of:
-    - Lynx version: Lynx/x.y.z where x is 2-3, y is 8-9, and z is 0-2
-    - libwww version: libwww-FM/x.y where x is 2-3 and y is 13-15
-    - SSL-MM version: SSL-MM/x.y where x is 1-2 and y is 3-5
-    - OpenSSL version: OpenSSL/x.y.z where x is 1-3, y is 0-4, and z is 0-9
-
-    Returns:
-        str: A randomly generated user agent string.
+    Lynx was deprecated in 2025.09, so we use something else.
     """
-    lynx_version = f"Lynx/{random.randint(2, 3)}.{random.randint(8, 9)}.{random.randint(0, 2)}"
-    libwww_version = f"libwww-FM/{random.randint(2, 3)}.{random.randint(13, 15)}"
-    ssl_mm_version = f"SSL-MM/{random.randint(1, 2)}.{random.randint(3, 5)}"
-    openssl_version = f"OpenSSL/{random.randint(1, 3)}.{random.randint(0, 4)}.{random.randint(0, 9)}"
-    return f"{lynx_version} {libwww_version} {ssl_mm_version} {openssl_version}"
\ No newline at end of file
+
+    return f"AdsBot-Google (+http://www.google.com/adsbot.html)"
\ No newline at end of file

diff --git a/setup.py b/setup.py
index d08fd6f..337a279 100644
--- a/setup.py
+++ b/setup.py
@@ -8,7 +8,7 @@
 
 setup(
     name="mi-googlesearch-python",
-    version="1.3.0",
+    version="1.3.0.post1",
     author="Nishant Vikramaditya",
     author_email="junk4Nv7@gmail.com",
     description="A Python library for scraping the Google search engine.",
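The last three patches are housekeeping: patch 7 only flips file modes and rewrites line endings, patch 8 renames the distribution to `mi-googlesearch-python`, and patch 9 replaces the randomized Lynx-style user agent with a fixed AdsBot string and bumps the version to 1.3.0.post1. A tiny check of the new behaviour, assuming the fork is installed under the new name; the expected string simply mirrors the literal added in patch 9:

```python
from googlesearch.user_agents import get_useragent

# After patch 9 this is a constant rather than a randomized string
ua = get_useragent()
assert ua == "AdsBot-Google (+http://www.google.com/adsbot.html)"
print(ua)
```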