Add sponsored search results and Pandas DataFrame conversion #69

Open · wants to merge 4 commits into base: master

Changes from all commits
36 changes: 34 additions & 2 deletions README.md
@@ -1,31 +1,41 @@
# googlesearch
googlesearch is a Python library for searching Google easily. It uses requests and BeautifulSoup4 to scrape Google.

## Installation

To install, run the following command:

```bash
python3 -m pip install googlesearch-python
```

## Usage

To get results for a search term, use the search function in googlesearch. For example, to get results for "Google", run the following program:

```python
from googlesearch import search

for result in search("Google"):
    print(result)
```
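
Note that search returns a generator, so results are fetched lazily as you iterate. To materialize them all at once (a minimal sketch):

```python
from googlesearch import search

results = list(search("Google"))
```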

## Additional options

googlesearch supports a few additional options. By default, googlesearch returns 10 results; this can be changed. For example, to get 100 results on Google, run the following program:

```python
from googlesearch import search
search("Google", num_results=100)
```

In addition, you can change the language Google searches in. For example, to get results in French, run the following program:

```python
from googlesearch import search
search("Google", lang="fr")
```

To extract more information, such as the description or the result URL, use an advanced search:

```python
from googlesearch import search
search("Google", advanced=True)
# Returns a list of SearchResult
# Properties:
# - title
# - url
# - description
```
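
For example, you can read these properties directly off each result (a minimal sketch using only the fields listed above):

```python
from googlesearch import search

for result in search("Google", advanced=True):
    print(result.title, "->", result.url)
```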

To also fetch sponsored results along with general search results, use the sponsored parameter (combine it with advanced=True to get SearchResult objects):

```python
from googlesearch import search
search("Google", sponsored=True)
# Returns a list of SearchResult
# Properties:
# - title
# - url
# - description
# - sponsored
```
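
Since each result carries an is_sponsored flag, you can separate ads from organic results (a sketch, assuming sponsored and advanced are enabled together):

```python
from googlesearch import search

results = list(search("Google", sponsored=True, advanced=True))
ads = [r for r in results if r.is_sponsored]
organic = [r for r in results if not r.is_sponsored]
print(f"{len(ads)} sponsored, {len(organic)} organic")
```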

To convert the search results to a Pandas DataFrame, use the to_df function:

```python
from googlesearch import search, to_df

results = search("Google", advanced=True)
df = to_df(results)
```
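
The DataFrame has one row per result, so the usual Pandas operations apply. For example (assuming the columns to_df produces: index, url, title, description, and, when present, is_sponsored):

```python
sponsored_only = df[df["is_sponsored"]]  # filter to ads
df.to_csv("results.csv", index=False)    # export everything
```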

If requesting more than 100 results, googlesearch will send multiple requests to go through the pages. To increase the time between these requests, use `sleep_interval`:

```python
from googlesearch import search
search("Google", sleep_interval=5, num_results=200)
```
71 changes: 56 additions & 15 deletions googlesearch/__init__.py
@@ -4,7 +4,7 @@
from requests import get
from .user_agents import get_useragent
import urllib

import pandas as pd
Owner: Did you add this to requirements.txt?

Author: Just did :)


def _req(term, results, lang, start, proxies, timeout):
    resp = get(
@@ -14,30 +14,48 @@ def _req(term, results, lang, start, proxies, timeout):
        },
        params={
            "q": term,
-            "num": results + 2,  # Prevents multiple requests
+            "num": results,
            "hl": lang,
            "start": start,
        },
        proxies=proxies,
        timeout=timeout,
    )

    resp.raise_for_status()
    return resp

def to_df(search_results):
    """Collect an iterable of SearchResult objects into a Pandas DataFrame."""
    data = []
    for index, result in enumerate(search_results):
        result_data = {
            'index': index + 1,
            'url': result.url,
            'title': result.title,
            'description': result.description
        }
        # Only include the sponsored flag when the result carries one.
        if hasattr(result, 'is_sponsored'):
            result_data['is_sponsored'] = result.is_sponsored
        data.append(result_data)

    df = pd.DataFrame(data)

    return df

class SearchResult:
-    def __init__(self, url, title, description):
+    def __init__(self, url, title, description, is_sponsored):
        self.url = url
        self.title = title
        self.description = description
+        self.is_sponsored = is_sponsored

    def __repr__(self):
-        return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
-
-
-def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5):
-    """Search the Google search engine"""
-
+        if self.is_sponsored:
+            return f"SearchResult(url={self.url}, title={self.title}, description={self.description}, is_sponsored={self.is_sponsored})"
+        else:
+            return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
+
+def search(term, sponsored=False, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5):
    escaped_term = urllib.parse.quote_plus(term)  # make 'site:xxx.xxx.xxx ' work

    # Proxy
@@ -52,29 +70,52 @@ def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_in
    start = 0
    while start < num_results:
        # Send request
-        resp = _req(escaped_term, num_results - start,
+        resp = _req(escaped_term,num_results - start,
                    lang, start, proxies, timeout)

Owner: Why did you delete the space?

Author: Could be from formatter.

        # Parse
        soup = BeautifulSoup(resp.text, "html.parser")

        # Check for sponsored results
        if sponsored:
            sponsored_block = soup.find_all("div", attrs={"class": "vdQmEd"})
            if len(sponsored_block) == 0:
                start += 1
            for sponsored_result in sponsored_block:
                link = sponsored_result.find("a", href=True, attrs={"class": "sVXRqc"}).get("href")
                title = sponsored_result.find("span", attrs={"class": "OSrXXb"})
                description_box = sponsored_result.find(lambda tag: tag.name == 'span' and not tag.has_attr('class'))

                if description_box:
                    description = description_box.text
                    if link and title and description:
                        start += 1
                        if advanced:
                            yield SearchResult(link, title.text, description, True)
                        else:
                            yield link

        # Check for non-sponsored results
        result_block = soup.find_all("div", attrs={"class": "g"})
-        if len(result_block) ==0:
+        if len(result_block) == 0:
            start += 1
        for result in result_block:
            # Find link, title, description
-            link = result.find("a", href=True)
+            link = result.find("a", href=True).get("href")
            title = result.find("h3")
            description_box = result.find(
                "div", {"style": "-webkit-line-clamp:2"})

            if description_box:
                description = description_box.text
                if link and title and description:
                    start += 1
                    if advanced:
-                        yield SearchResult(link["href"], title.text, description)
+                        yield SearchResult(link, title.text, description, False)
                    else:
-                        yield link["href"]
+                        yield link

        sleep(sleep_interval)

-    if start == 0:
-        return []
+    return []
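
Taken together, the two additions compose end to end; a quick smoke test of the new flow might look like this (a sketch, not part of the PR):

```python
from googlesearch import search, to_df

# Fetch organic and sponsored results with full metadata,
# then flatten them into a DataFrame.
results = search("Google", advanced=True, sponsored=True, num_results=20)
df = to_df(results)
print(df.head())
```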
1 change: 1 addition & 0 deletions requirements.txt
@@ -1,2 +1,3 @@
beautifulsoup4>=4.9
requests>=2.20
pandas
Owner: Probably add a version, just to make sure it keeps working in the future and we have fewer version issues.
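
For example, a floor pin would guard against future breaking releases (the exact version is only an illustration):

```
pandas>=1.0
```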