From aebd137729a6975c50592157ec6be95f757e0510 Mon Sep 17 00:00:00 2001 From: Engjell Avdiu Date: Mon, 30 Oct 2023 20:28:22 +0100 Subject: [PATCH 1/4] Add support for fetching sponsored Google search results This commit introduces the ability to fetch sponsored search results from Google. - Extended the `SearchResult` class to include a `is_sponsored` boolean field. - Modified the `search` function to accept a new parameter `sponsored` that toggles whether or not to include sponsored results. - Added additional parsing logic to identify and include sponsored results in the output. Note: Ensure compliance with Google's terms of service when using this feature. --- googlesearch/__init__.py | 54 +++++++++++++++++++++++++++++----------- 1 file changed, 39 insertions(+), 15 deletions(-) diff --git a/googlesearch/__init__.py b/googlesearch/__init__.py index 74e6564..4ea8eaa 100644 --- a/googlesearch/__init__.py +++ b/googlesearch/__init__.py @@ -14,30 +14,31 @@ def _req(term, results, lang, start, proxies, timeout): }, params={ "q": term, - "num": results + 2, # Prevents multiple requests + "num": results, "hl": lang, "start": start, }, proxies=proxies, timeout=timeout, ) + resp.raise_for_status() return resp - class SearchResult: - def __init__(self, url, title, description): + def __init__(self, url, title, description, is_sponsored): self.url = url self.title = title self.description = description + self.is_sponsored = is_sponsored def __repr__(self): - return f"SearchResult(url={self.url}, title={self.title}, description={self.description})" - - -def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5): - """Search the Google search engine""" - + if self.is_sponsored: + return f"SearchResult(url={self.url}, title={self.title}, description={self.description}, is_sponsored={self.is_sponsored})" + else: + return f"SearchResult(url={self.url}, title={self.title}, description={self.description})" + +def search(term, 
sponsored=False, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5): escaped_term = urllib.parse.quote_plus(term) # make 'site:xxx.xxx.xxx ' works. # Proxy @@ -52,29 +53,52 @@ def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_in start = 0 while start < num_results: # Send request - resp = _req(escaped_term, num_results - start, + resp = _req(escaped_term,num_results - start, lang, start, proxies, timeout) # Parse soup = BeautifulSoup(resp.text, "html.parser") + + # Check for sponsored results + if sponsored: + sponsored_block = soup.find_all("div", attrs={"class": "vdQmEd"}) + if len(sponsored_block) == 0: + start += 1 + for sponsored_result in sponsored_block: + link = sponsored_result.find("a", href=True,attrs={"class":"sVXRqc"}).get("href") + title = sponsored_result.find("span", attrs={"class":"OSrXXb"}) + description_box = sponsored_result.find(lambda tag: tag.name == 'span' and not tag.has_attr('class')) + + if description_box: + description = description_box.text + if link and title and description: + start += 1 + if advanced: + yield SearchResult(link, title.text, description, True) + else: + yield link + + # Check for not sponsored results result_block = soup.find_all("div", attrs={"class": "g"}) - if len(result_block) ==0: + if len(result_block) == 0: start += 1 for result in result_block: # Find link, title, description - link = result.find("a", href=True) + link = result.find("a", href=True).get("href") title = result.find("h3") description_box = result.find( "div", {"style": "-webkit-line-clamp:2"}) + if description_box: description = description_box.text if link and title and description: start += 1 if advanced: - yield SearchResult(link["href"], title.text, description) + yield SearchResult(link, title.text, description, False) else: - yield link["href"] + yield link + sleep(sleep_interval) if start == 0: - return [] + return [] \ No newline at end of file From 
fce19265a963a7f0695ee89cb9a4e66baa857a3d Mon Sep 17 00:00:00 2001 From: Engjell Avdiu Date: Mon, 30 Oct 2023 20:30:17 +0100 Subject: [PATCH 2/4] Add to_df function to convert search results to DataFrame This commit adds a utility function `to_df` that converts the search results to a Pandas DataFrame for easier manipulation and analysis. - Created `to_df` function that takes an iterable of `SearchResult` objects. - The function enumerates through the search results, extracting `url`, `title`, and `description`, and optionally `is_sponsored` if present. - Returns a Pandas DataFrame containing these details. This feature enhances data manipulation capabilities, making it easier to process and analyze search results. --- googlesearch/__init__.py | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/googlesearch/__init__.py b/googlesearch/__init__.py index 4ea8eaa..500112d 100644 --- a/googlesearch/__init__.py +++ b/googlesearch/__init__.py @@ -4,7 +4,7 @@ from requests import get from .user_agents import get_useragent import urllib - +import pandas as pd def _req(term, results, lang, start, proxies, timeout): resp = get( @@ -25,6 +25,23 @@ def _req(term, results, lang, start, proxies, timeout): resp.raise_for_status() return resp +def to_df(search_results): + data = [] + for index, result in enumerate(search_results): + result_data = { + 'index': index + 1, + 'url': result.url, + 'title': result.title, + 'description': result.description + } + if hasattr(result, 'is_sponsored'): + result_data['is_sponsored'] = result.is_sponsored + data.append(result_data) + + df = pd.DataFrame(data) + + return df + class SearchResult: def __init__(self, url, title, description, is_sponsored): self.url = url From 90076a88caf0a65d4ae304e0e125e3a6f9edd605 Mon Sep 17 00:00:00 2001 From: Engjell Avdiu Date: Mon, 30 Oct 2023 20:35:53 +0100 Subject: [PATCH 3/4] Update README to include new features and usage This commit updates the README to reflect the 
recent changes and new features added to the library: - Added section on fetching sponsored results using the `sponsored` parameter. - Included information about the `to_df` function for converting search results to a Pandas DataFrame. The update aims to provide users with a comprehensive guide to using the latest version of the library. --- README.md | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index e43f1bc..2333671 100644 --- a/README.md +++ b/README.md @@ -1,13 +1,17 @@ # googlesearch -googlesearch is a Python library for searching Google, easily. googlesearch uses requests and BeautifulSoup4 to scrape Google. + +googlesearch is a Python library for searching Google, easily. googlesearch uses requests and BeautifulSoup4 to scrape Google. ## Installation + To install, run the following command: + ```bash python3 -m pip install googlesearch-python ``` ## Usage + To get results for a search term, simply use the search function in googlesearch. For example, to get results for "Google" in Google, just run the following program: ```python from googlesearch import search @@ -15,17 +19,23 @@ search("Google") ``` ## Additional options + googlesearch supports a few additional options. By default, googlesearch returns 10 results. This can be changed. To get a 100 results on Google for example, run the following program. + ```python from googlesearch import search search("Google", num_results=100) ``` + In addition, you can change the language google searches in. 
For example, to get results in French run the following program: + ```python from googlesearch import search search("Google", lang="fr") ``` + To extract more information, such as the description or the result URL, use an advanced search: + ```python from googlesearch import search search("Google", advanced=True) @@ -35,8 +45,30 @@ search("Google", advanced=True) # - url # - description ``` + +To also fetch sponsored results along with general search results, use the sponsored parameter: + +```python +from googlesearch import search +search("Google", sponsored=True, advanced=True) +# Returns a list of SearchResult +# Properties: +# - title +# - url +# - description +# - is_sponsored +``` + +To convert the search results to a Pandas DataFrame, use the to_df function: + +```python +from googlesearch import to_df +df = to_df(advanced_results) +``` + If requesting more than 100 results, googlesearch will send multiple requests to go through the pages. To increase the time between these requests, use `sleep_interval`: + ```python from googlesearch import search search("Google", sleep_interval=5, num_results=200) -``` \ No newline at end of file +``` From 6b20c0191d0ec280f6385d90dfbbbbbf2b72f6ed Mon Sep 17 00:00:00 2001 From: Engjell Avdiu Date: Mon, 9 Nov 2023 19:51:56 +0100 Subject: [PATCH 4/4] add pandas to requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 56399db..db44518 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,2 +1,3 @@ beautifulsoup4>=4.9 requests>=2.20 +pandas