Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added post method, as get was failing on me. #57

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 8 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,13 +7,20 @@ To install, run the following command:
python3 -m pip install googlesearch-python
```

## Usage
## Usage via GET
To get results for a search term, simply use the search function in googlesearch. For example, to get results for "Google" in Google, just run the following program:
```python
from googlesearch import search
search("Google")
```
## Usage via POST
To get results for a search term, you can also use the POST method, which I've found can give better results and be more reliable. For example, to get results for "Google" in Google, just run the following program:
Currently only basic functionality. See docstring for more.
```python
from googlesearch import search_post
search_post("Google")

```
## Additional options
googlesearch supports a few additional options. By default, googlesearch returns 10 results. This can be changed. To get 100 results on Google, for example, run the following program.
```python
Expand Down
126 changes: 119 additions & 7 deletions googlesearch/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,57 @@
"""googlesearch is a Python library for searching Google, easily."""
import gzip
import re
import zlib
from time import sleep
from urllib.parse import quote_plus

import brotli
import requests
from bs4 import BeautifulSoup
from requests import get
from .user_agents import get_useragent
import urllib
from .user_agents import _get_useragent, get_random_header

def _req_post(term, results=10, lang="en", proxies=None, timeout=10):
    """POST Google's consent form with a search URL as its ``continue`` target.

    Args:
        term (str): The search term. It is interpolated into the ``continue``
            URL as-is, so callers are expected to pass it already URL-escaped.
        results (int): Value of the ``num`` parameter of the search URL.
        lang (str): Value of the ``hl`` parameter of the search URL.
        proxies (dict): Proxy mapping handed to ``requests.post``.
        timeout (int): Timeout handed to ``requests.post``.

    Returns:
        requests.Response: The response to the POST request.

    Raises:
        requests.HTTPError: If the response carries an error status
            (raised by ``raise_for_status``).
    """
    # Randomised browser-like headers for this request.
    request_headers = get_random_header()

    # The search itself travels inside the form as the "continue" target.
    search_url = (
        f'https://www.google.com/search?q={term}&hl={lang}&num={results}'
        '&start=0&gbv=1&sei=qrCcZOfUH5DskdUPib21oA4'
    )
    form_fields = {
        'bl': 'boq_identityfrontenduiserver_20230625.09_p0',
        'x': '8',
        'gl': 'GB',
        'm': '0',
        'app': '0',
        'pc': 'srp',
        'continue': search_url,
        'hl': 'en',
        'uxe': 'none',
        'set_eom': 'false',
        'set_sc': 'true',
        'set_aps': 'true',
    }

    resp = requests.post(
        'https://consent.google.com/save',
        data=form_fields,
        headers=request_headers,
        proxies=proxies,
        timeout=timeout,
    )
    resp.raise_for_status()
    return resp


def _req(term, results, lang, start, proxies, timeout):
resp = get(
url="https://www.google.com/search",
headers={
"User-Agent": get_useragent()
"User-Agent": _get_useragent()
},
params={
"q": term,
Expand All @@ -34,11 +75,80 @@ def __init__(self, url, title, description):
def __repr__(self):
return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"

def decode_content(response):
    """Return the body of *response* as text.

    Any content codings named in the ``Content-Encoding`` header are undone
    (last-applied first), then the bytes are decoded with the charset given in
    ``Content-Type``, falling back to UTF-8.

    Args:
        response: An object exposing ``content`` (bytes) and a ``headers``
            mapping with a ``get`` method, such as ``requests.Response``.

    Returns:
        str: The decoded body. An empty string when there is no body.
    """
    body = response.content or b""
    headers = response.headers

    codings = headers.get('content-encoding', '').lower().split(',')
    for coding in reversed(codings):
        coding = coding.strip()
        if not coding:
            continue
        try:
            body = _undo_content_coding(coding, body)
        except Exception:
            # Review question on the PR: "Does requests not already do this?"
            # It normally does - requests/urllib3 decode gzip/deflate (and br
            # when a brotli package is installed) before ``content`` is read,
            # so a failure here most likely means the body is already plain.
            # Each decoder raises its own error type (gzip.BadGzipFile,
            # zlib.error, brotli.error), hence the broad catch at this one
            # best-effort step. Keep the bytes we have and stop unwrapping.
            break

    charset = 'utf-8'
    for param in headers.get('content-type', '').lower().split(';'):
        key, _, value = param.strip().partition('=')
        if key.strip() == 'charset':
            # The value may legally be quoted: charset="utf-8".
            charset = value.strip().strip('"\'')
            break

    try:
        return body.decode(charset)
    except (LookupError, UnicodeDecodeError):
        # Unknown charset name or bytes that do not match it.
        return body.decode('utf-8', errors='replace')


def _undo_content_coding(coding, payload):
    """Return *payload* with the single content coding *coding* removed."""
    if coding == 'gzip':
        return gzip.decompress(payload)
    if coding == 'deflate':
        try:
            return zlib.decompress(payload)
        except zlib.error:
            # Some servers send a raw deflate stream without the zlib wrapper.
            return zlib.decompress(payload, -zlib.MAX_WBITS)
    if coding == 'compress':
        # NOTE(review): kept from the original; "compress" is nominally LZW,
        # so this raw-deflate attempt may never succeed - confirm.
        return zlib.decompress(payload, -zlib.MAX_WBITS)
    if coding == 'br':
        return brotli.decompress(payload)
    return payload  # unknown coding (or "identity"): leave untouched

def search_post(term, num_results=10, lang="en", sleep_interval=0, proxies=None, timeout=10, attempts=5):
    """Search Google via the consent-form POST flow and return result URLs.

    The request is retried until at least one result link is found or
    ``attempts`` is exhausted.

    Args:
        term (str): The term to search for.
        num_results (int): The maximum number of URLs to return.
        lang (str): The language to search in.
        sleep_interval (int): Seconds to sleep between attempts.
        proxies (dict): A dictionary of proxies to use.
        timeout (int): The timeout for each request.
        attempts (int): The number of attempts to make before giving up.

    Returns:
        list[str]: Result URLs, percent-decoded and de-duplicated in page
        order. Empty when no attempt produced a result.

    Raises:
        Whatever ``_req_post`` raises (it is not caught here).
    """
    # Local import: ``unquote`` is not among the module-level imports.
    from urllib.parse import unquote

    escaped_term = quote_plus(term)  # keeps queries such as 'site:example.com foo' intact

    for attempt in range(attempts):
        if attempt:
            # Pause only between attempts, not after the final one.
            sleep(sleep_interval)

        resp = _req_post(escaped_term, num_results, lang, proxies, timeout)
        html = decode_content(resp)

        # Result links look like /url?q=<percent-encoded target>&...
        encoded_urls = re.findall(r"/url\?q=([^&]+)", html)
        # dict.fromkeys drops repeats while keeping first-seen order.
        urls = list(dict.fromkeys(unquote(url) for url in encoded_urls))

        if urls:
            return urls[:num_results]

    return []

def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5, post=False):
"""Search the Google search engine"""

escaped_term = urllib.parse.quote_plus(term) # make 'site:xxx.xxx.xxx ' works.
escaped_term = quote_plus(term) # make 'site:xxx.xxx.xxx ' works.

# Proxy
proxies = None
Expand All @@ -52,8 +162,10 @@ def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_in
start = 0
while start < num_results:
# Send request
resp = _req(escaped_term, num_results - start,
lang, start, proxies, timeout)
if post:
resp = _req_post(escaped_term, num_results - start, lang, start, proxies, timeout)
else:
resp = _req(escaped_term, num_results - start, lang, start, proxies, timeout)

# Parse
soup = BeautifulSoup(resp.text, "html.parser")
Expand Down
207 changes: 202 additions & 5 deletions googlesearch/user_agents.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,213 @@
import random
import secrets

# Candidate User-Agent strings; every entry is listed exactly once so each is
# equally likely to be picked.
_USER_AGENTS = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:66.0) Gecko/20100101 Firefox/66.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.62',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 Edg/91.0.864.54',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 11_4_0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36 OPR/77.0.4054.275',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0 OPR/77.0.4054.277',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Windows NT 6.1; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 10.0; WOW64; Trident/7.0; rv:11.0) like Gecko',
    'Mozilla/5.0 (Windows NT 6.1; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Windows NT 6.1; rv:88.0) Gecko/20100101 Firefox/88.0',
    'Mozilla/5.0 (Windows NT 10.0; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Windows NT 10.0; rv:88.0) Gecko/20100101 Firefox/88.0',
)


def _get_useragent():
    """
    Returns a random user agent

    Returns:
        str: Random user agent, drawn uniformly from ``_USER_AGENTS``

    """
    return random.choice(_USER_AGENTS)


# Former public name of this helper, kept so existing imports keep working.
get_useragent = _get_useragent

def _get_referer():
    """
    Picks a random referer, or None to signal that no referer should be sent

    The pool holds every site once plus an equal number of ``None`` entries,
    so about half of the calls return ``None``.

    Returns:
        str | None: Random referer URL, or None

    """
    sites = (
        "https://www.google.com",
        "https://www.bing.com",
        "https://www.yahoo.com",
        "https://www.duckduckgo.com",
        "https://www.facebook.com",
        "https://www.twitter.com",
        "https://www.instagram.com",
        "https://www.linkedin.com",
        "https://www.reddit.com",
        "https://www.stackoverflow.com",
        "https://www.amazon.com",
        "https://www.ebay.com",
        "https://www.netflix.com",
        "https://www.youtube.com",
        "https://www.twitch.tv",
        "https://www.microsoft.com",
        "https://www.apple.com",
        "https://www.github.com",
        "https://www.wikipedia.org",
    )
    # Sites first, then one None per site - same order and size as before.
    pool = sites + (None,) * len(sites)
    return random.choice(pool)


def _get_cookies():
    """
    Returns a random cookie string

    Between two and five distinct ``name=value`` pairs are drawn from a pool
    of common analytics and session cookie names, filled with random values
    and joined with ``"; "`` as in a ``Cookie`` request header.

    Returns:
        str: Random cookie string, e.g. ``"_gat=1; session=Zx3..."``

    """
    def six_digits():
        return random.randint(100000, 999999)

    def token(low, high):
        # Random URL-safe text built from between ``low`` and ``high`` bytes.
        return secrets.token_urlsafe(random.randint(low, high))

    candidates = [
        f"_ga=GA1.{random.choice((2, 3))}.{six_digits()}.{six_digits()}",
        "_gat=1",
        f"__utma={six_digits()}.{six_digits()}.{six_digits()}.1",
        f"__utmb={six_digits()}.1.10.{six_digits()}",
        f"__utmc={six_digits()}",
        f"__utmz={six_digits()}.{six_digits()}.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none)",
        f"__utmt={random.randint(1, 6)}",
        f"session={token(8, 15)}",
        f"user_session={token(8, 15)}",
        f"remember_user_token={token(2, 5)}.1234-5678",
        f"_csrf_token={token(5, 10)}",
        f"JSESSIONID={token(6, 9)}",
        f"login={token(8, 15)}",
        # The original referenced an undefined ``randomname`` module here.
        f"username=user{random.randint(1000, 99999)}",
        f"__RequestVerificationToken={token(8, 15)}",
        f"ASP.NET_SessionId={token(8, 15)}",
        f".AspNet.ApplicationCookie={token(8, 15)}",
        f"AWSALB={token(8, 15)}",
        f"AWSALBCORS={token(8, 15)}",
    ]

    # sample() picks without replacement, so no cookie name is repeated.
    how_many = random.randint(2, 5)
    return "; ".join(random.sample(candidates, how_many))


def _get_accept():
    """
    Picks one of several browser-like ``Accept`` header values at random

    Returns:
        str: Random accept string

    """
    accept_values = (
        "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "text/html,application/xhtml+xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
        "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp;q=0.8,*/*;q=0.7",
        "text/html,application/xhtml+xml;q=0.9,image/apng,image/*,*/*;q=0.8",
        "text/html,application/xhtml+xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/xml;q=0.7",
        "text/html,application/xhtml+xml;q=0.8,image/webp,image/apng,*/*;q=0.7,application/xml;q=0.6",
        "text/html,application/xhtml+xml;q=0.8,image/apng,image/*,*/*;q=0.7,application/xml;q=0.6",
        "text/html,application/xhtml+xml;q=0.7,image/webp,image/apng,*/*;q=0.8,application/xml;q=0.6",
        "text/html,application/xhtml+xml;q=0.7,image/webp;q=0.8,image/apng,*/*;q=0.7,application/xml;q=0.6",
        "text/html,application/xhtml+xml;q=0.6,image/webp,image/apng,*/*;q=0.8,application/xml;q=0.7",
    )
    return random.choice(accept_values)


def _get_language():
    """
    Picks one of several ``Accept-Language`` header values at random

    Every candidate lists ``en-GB`` first.

    Returns:
        str: Random language string
    """
    language_values = (
        "en-GB,en;q=0.9",
        "en-GB,en;q=0.8,fr;q=0.7,de;q=0.6,es;q=0.5",
        "en-GB,en-US;q=0.9,en;q=0.8",
        "en-GB;q=0.9,en;q=0.8,es;q=0.7",
        "en-GB,en;q=0.8,fr;q=0.7",
        "en-GB;q=0.9,en;q=0.8,fr;q=0.7,de;q=0.6",
        "en-GB;q=1.0,en;q=0.9,fr;q=0.8,de;q=0.7,es;q=0.6",
        "en-GB;q=1.0,en;q=0.9,es;q=0.8",
        "en-GB;q=1.0,en-US;q=0.9,en;q=0.8,fr;q=0.7,de;q=0.6",
        "en-GB;q=1.0,en;q=0.9,es;q=0.8,fr;q=0.7",
    )
    return random.choice(language_values)


def _get_enconding():
    """
    Returns a random encoding string

    The value is meant for the ``Accept-Encoding`` request header. Each
    candidate is a comma-separated list of content codings.

    Returns:
        str: Random encoding string
    """
    # Every entry needs its own trailing comma: adjacent string literals are
    # silently concatenated, which previously fused "*" and "br, *" into the
    # invalid value "*br, *".
    encodings = (
        "gzip, deflate, br, *",
        "br, gzip, *",
        "gzip, deflate, *",
        "*",
        "br, *",
        "compress, gzip, *",
        "gzip, *",
        "deflate, br, *",
        "gzip, deflate, br, identity, *",
    )
    return random.choice(encodings)


def get_random_header():
    """
    Returns a random header for a request

    Returns:
        dict: Header names mapped to values. Always contains ``User-Agent``,
        ``Accept``, ``Accept-Language``, ``Accept-Encoding`` and ``Cookie``;
        ``Referer`` is present only when a referer was drawn.
    """
    # Get random user agent, referer, cookies, accept, language and encoding
    user_agent = _get_useragent()
    referer = _get_referer()
    cookies = _get_cookies()
    accept = _get_accept()
    language = _get_language()
    encoding = _get_enconding()

    # Create header. The request header that carries cookies is named
    # "Cookie" (singular); "Cookies" is not a header servers read.
    header = {
        "User-Agent": user_agent,
        "Accept": accept,
        "Accept-Language": language,
        "Accept-Encoding": encoding,
        "Cookie": cookies,
    }

    # Add referer if it exists
    if referer:
        header["Referer"] = referer

    return header
Loading