From 6a84d87ce05f2e6d37e20b7d2cde71d7db778160 Mon Sep 17 00:00:00 2001 From: lewis Date: Thu, 29 Jun 2023 01:13:31 +0100 Subject: [PATCH] added post method, as get was failing. Also did away with yeild, and added random header. --- README.md | 9 +- googlesearch/__init__.py | 126 ++++++++++++++++++++-- googlesearch/user_agents.py | 207 +++++++++++++++++++++++++++++++++++- requirements.txt | 6 ++ 4 files changed, 335 insertions(+), 13 deletions(-) diff --git a/README.md b/README.md index e43f1bc..3858346 100644 --- a/README.md +++ b/README.md @@ -7,13 +7,20 @@ To install, run the following command: python3 -m pip install googlesearch-python ``` -## Usage +## Usage via GET To get results for a search term, simply use the search function in googlesearch. For example, to get results for "Google" in Google, just run the following program: ```python from googlesearch import search search("Google") ``` +## Usage via POST +To get results for a search term you can also use the POST method, which I've found can give better results and is more reliable. For example, to get results for "Google" in Google, just run the following program: +Currently it offers only basic functionality; see the docstring for more. +```python +from googlesearch import search_post +search_post("Google") +``` ## Additional options googlesearch supports a few additional options. By default, googlesearch returns 10 results. This can be changed. To get a 100 results on Google for example, run the following program. 
```python diff --git a/googlesearch/__init__.py b/googlesearch/__init__.py index 74e6564..cfafb4c 100644 --- a/googlesearch/__init__.py +++ b/googlesearch/__init__.py @@ -1,16 +1,57 @@ """googlesearch is a Python library for searching Google, easily.""" +import gzip +import re +import zlib from time import sleep +from urllib.parse import quote_plus + +import brotli +import requests from bs4 import BeautifulSoup from requests import get -from .user_agents import get_useragent -import urllib +from .user_agents import _get_useragent, get_random_header + +def _req_post(term, results=10, lang="en", proxies=None, timeout=10): + + """ + Sends a request to Google Search and returns the response. + + Attributes: + term (str): The term to search for. + results (int): The number of results to return. + lang (str): The language to search in. + proxies (dict): A dictionary of proxies to use. + timeout (int): The timeout for the request. + + """ + # Get random header + header = get_random_header() + + data = { + 'bl': 'boq_identityfrontenduiserver_20230625.09_p0', + 'x': '8', + 'gl': 'GB', + 'm': '0', + 'app': '0', + 'pc': 'srp', + 'continue': f'https://www.google.com/search?q={term}&hl={lang}&num={results}&start=0&gbv=1&sei=qrCcZOfUH5DskdUPib21oA4', + 'hl': 'en', + 'uxe': 'none', + 'set_eom': 'false', + 'set_sc': 'true', + 'set_aps': 'true', + } + response = requests.post('https://consent.google.com/save', data=data, headers=header, proxies=proxies, timeout=timeout) + response.raise_for_status() + + return response def _req(term, results, lang, start, proxies, timeout): resp = get( url="https://www.google.com/search", headers={ - "User-Agent": get_useragent() + "User-Agent": _get_useragent() }, params={ "q": term, @@ -34,11 +75,80 @@ def __init__(self, url, title, description): def __repr__(self): return f"SearchResult(url={self.url}, title={self.title}, description={self.description})" +def decode_content(response): + + content = None + + try: + content = response.content + 
encoding = response.headers.get('content-encoding', '').lower() + charsets = response.headers.get('content-type', '').lower() + + # Apply decoding for multiple content codings + for coding in reversed(encoding.split(',')): + if coding.strip() == 'gzip': + content = gzip.decompress(content) + elif coding.strip() == 'deflate': + content = zlib.decompress(content) + elif coding.strip() == 'compress': + content = zlib.decompress(content, -zlib.MAX_WBITS) + elif coding.strip() == 'br': + content = brotli.decompress(content) + else: + pass # unknown coding, ignore it + + # Determine the charset + for charset in charsets.split(';'): + if charset.strip().startswith('charset='): + return content.decode(charset.split('=')[1]) + return content.decode('utf-8') # fallback to utf-8 + + except Exception as e: + return content.decode("utf-8") + +def search_post(term, num_results=10, lang="en", sleep_interval=0, proxies=None, timeout=10, attempts=5): + """ + Search the Google search engine, but bypass the JS issue by posting the request instead of get. + Returns a list of urls. + + Attributes: + term (str): The term to search for. + num_results (int): The number of results to return. + lang (str): The language to search for. + sleep_interval (int): The time to sleep between requests. + proxies (dict): A dictionary of proxies to use. -def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5): + attempts (int): The number of attempts to make before giving up. + + """ + + escaped_term = quote_plus(term) # make 'site:xxx.xxx.xxx ' works. 
+ + # Proxy + + # Fetch + tries = 0 + while tries < attempts: + + # Post and get response + resp = _req_post(escaped_term, num_results, lang, proxies, timeout) + # Decode content if needed + decoded_content = decode_content(resp) + # use regex to find all urls + results = re.findall(r"/url\?q=([^&]+)", decoded_content) + + if len(results) > 0: + return results + + sleep(sleep_interval) + tries += 1 + + return [] + +def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, post=False): """Search the Google search engine""" - escaped_term = urllib.parse.quote_plus(term) # make 'site:xxx.xxx.xxx ' works. + escaped_term = quote_plus(term) # make 'site:xxx.xxx.xxx ' works. # Proxy proxies = None @@ -52,8 +162,10 @@ def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_in start = 0 while start < num_results: # Send request - resp = _req(escaped_term, num_results - start, - lang, start, proxies, timeout) + if post: + resp = _req_post(escaped_term, num_results - start, lang, proxies, timeout) + else: + resp = _req(escaped_term, num_results - start, lang, start, proxies, timeout) # Parse soup = BeautifulSoup(resp.text, "html.parser") diff --git a/googlesearch/user_agents.py b/googlesearch/user_agents.py index 80206fe..d55a9fd 100644 --- a/googlesearch/user_agents.py +++ b/googlesearch/user_agents.py @@ -1,11 +1,15 @@ import random +import secrets +def _get_useragent(): + """ + Returns a random user agent -def get_useragent(): - return random.choice(_useragent_list) + Returns: + str: Random user agent - -_useragent_list = [ + """ + return random.choice([ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', @@ -13,4 +17,197 @@ def 
get_useragent(): 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62', - 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0' -] + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.54', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_4_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15', + 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0', + 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0', + 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:89.0) Gecko/20100101 Firefox/89.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 OPR/77.0.4054.275', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0 OPR/77.0.4054.277', + 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 
Safari/537.36', + 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0', + 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0', + 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:89.0) Gecko/20100101 Firefox/89.0', + 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:89.0) Gecko/20100101 Firefox/89.0', + 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko', + 'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.54', + 'Mozilla/5.0 (Windows NT 6.1; rv:89.0) Gecko/20100101 Firefox/89.0', + 'Mozilla/5.0 (Windows NT 6.1; rv:88.0) Gecko/20100101 Firefox/88.0', + 'Mozilla/5.0 (Windows NT 10.0; rv:89.0) Gecko/20100101 Firefox/89.0', + 'Mozilla/5.0 (Windows NT 10.0; rv:88.0) Gecko/20100101 Firefox/88.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.54', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_4_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15', + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15', + 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0', + 'Mozilla/5.0 (X11; Linux 
x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0', + 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:89.0) Gecko/20100101 Firefox/89.0', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 OPR/77.0.4054.275', + 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0 OPR/77.0.4054.277', + 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0', + 'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0', + 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:89.0) Gecko/20100101 Firefox/89.0', + 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36', + 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:89.0) Gecko/20100101 Firefox/89.0', + 'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko']) + +def _get_referer(): + """ + Returns a random referer + + Returns: + str: Random referer + + """ + return random.choice(["https://www.google.com", + "https://www.bing.com", + "https://www.yahoo.com", + "https://www.duckduckgo.com", + "https://www.facebook.com", + "https://www.twitter.com", + "https://www.instagram.com", + "https://www.linkedin.com", + "https://www.reddit.com", + "https://www.stackoverflow.com", + "https://www.amazon.com", + "https://www.ebay.com", + "https://www.netflix.com", + "https://www.youtube.com", + 
"https://www.twitch.tv", + "https://www.microsoft.com", + "https://www.apple.com", + "https://www.github.com", + "https://www.wikipedia.org", None, None, None, None, None, None, None, None, None, None, None, + None, None, None, None, None, None, None, None]) + + +def _get_cookies(): + """ + Returns a random cookie string + + Returns: + str: Random cookie string + + """ + no = random.randint(2, 5) + secrets.token_urlsafe(random.randint(8, 15)) + choices = random.choices([f"_ga=GA1.3.{random.randint(100000, 999999)}.{random.randint(100000, 999999)};", + f"_ga=GA1.2.{random.randint(100000, 999999)}.{random.randint(100000, 999999)};", + "_gat=1;", + f"__utma={random.randint(100000, 999999)}.{random.randint(100000, 999999)}.{random.randint(100000, 999999)}.1;", + f"__utmb={random.randint(100000, 999999)}.1.10.{random.randint(100000, 999999)};", + f"__utmc={random.randint(100000, 999999)};", + f"__utmz={random.randint(100000, 999999)}.{random.randint(100000, 999999)}.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none);", + f"__utmt={random.randint(1, 6)};", + f"session={secrets.token_urlsafe(random.randint(8, 15))};", + f"user_session={secrets.token_urlsafe(random.randint(8, 15))};", + f"remember_user_token={secrets.token_urlsafe(random.randint(2, 5))}.1234-5678;", + f"_csrf_token={secrets.token_urlsafe(random.randint(5, 10))};", + f"JSESSIONID={secrets.token_urlsafe(random.randint(6, 9))};", + f"login={secrets.token_urlsafe(random.randint(8, 15))};", + f"username={secrets.token_urlsafe(random.randint(4, 8))};", + f"__RequestVerificationToken={secrets.token_urlsafe(random.randint(8, 15))};", + f"ASP.NET_SessionId={secrets.token_urlsafe(random.randint(8, 15))};", + f".AspNet.ApplicationCookie={secrets.token_urlsafe(random.randint(8, 15))};", + f"AWSALB={secrets.token_urlsafe(random.randint(8, 15))};", + f"AWSALBCORS={secrets.token_urlsafe(random.randint(8, 15))};"], k=no) + return " ".join(choices).rstrip(";") + + +def _get_accept(): + + """ + Returns a random accept string + + Returns: + + """ + return 
random.choice(["text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8", + "text/html,application/xhtml+xml;q=0.9,image/webp,image/apng,*/*;q=0.8", + "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp;q=0.8,*/*;q=0.7", + "text/html,application/xhtml+xml;q=0.9,image/apng,image/*,*/*;q=0.8", + "text/html,application/xhtml+xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/xml;q=0.7", + "text/html,application/xhtml+xml;q=0.8,image/webp,image/apng,*/*;q=0.7,application/xml;q=0.6", + "text/html,application/xhtml+xml;q=0.8,image/apng,image/*,*/*;q=0.7,application/xml;q=0.6", + "text/html,application/xhtml+xml;q=0.7,image/webp,image/apng,*/*;q=0.8,application/xml;q=0.6", + "text/html,application/xhtml+xml;q=0.7,image/webp;q=0.8,image/apng,*/*;q=0.7,application/xml;q=0.6", + "text/html,application/xhtml+xml;q=0.6,image/webp,image/apng,*/*;q=0.8,application/xml;q=0.7"]) + + +def _get_language(): + + """ + Returns a random language string + + Returns: + str: Random language string + """ + return random.choice(["en-GB,en;q=0.9", + "en-GB,en;q=0.8,fr;q=0.7,de;q=0.6,es;q=0.5", + "en-GB,en-US;q=0.9,en;q=0.8", + "en-GB;q=0.9,en;q=0.8,es;q=0.7", + "en-GB,en;q=0.8,fr;q=0.7", + "en-GB;q=0.9,en;q=0.8,fr;q=0.7,de;q=0.6", + "en-GB;q=1.0,en;q=0.9,fr;q=0.8,de;q=0.7,es;q=0.6", + "en-GB;q=1.0,en;q=0.9,es;q=0.8", + "en-GB;q=1.0,en-US;q=0.9,en;q=0.8,fr;q=0.7,de;q=0.6", + "en-GB;q=1.0,en;q=0.9,es;q=0.8,fr;q=0.7"]) + + +def _get_enconding(): + """ + Returns a random encoding string + + Returns: + str: Random encoding string + """ + return random.choice(["gzip, deflate, br, *", + "br, gzip, *", + "gzip, deflate, *", + "*", + "br, *", + "compress, gzip, *", + "gzip, *", + "deflate, br, *", + "gzip, deflate, br, identity, *"]) + + +def get_random_header(): + """ + Returns a random header for a request + """ + # Get random user agent, referer, cookies, accept, language and encoding + user_agent = _get_useragent() + referer = _get_referer() + cookies = _get_cookies() + 
accept = _get_accept() + language = _get_language() + encoding = _get_enconding() + # Create header + header = {"User-Agent": user_agent, + "Accept": accept, + "Accept-Language": language, + "Accept-Encoding": encoding, + "Cookie": cookies} + # Add referer if it exists + if referer: + header["Referer"] = referer + + return header \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index 56399db..3e30806 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,8 @@ beautifulsoup4>=4.9 requests>=2.20 +setuptools~=67.8.0 +zlib~=1.2.13 +brotli~=1.0.9 +cbrotlipy==0.7.0 +bzip2~=1.0.8 +urllib3~=1.26.7 \ No newline at end of file