Add sponsored search results and Pandas DataFrame conversion #69
base: master
Changes from all commits
aebd137
fce1926
90076a8
6b20c01
@@ -4,7 +4,7 @@
 from requests import get
 from .user_agents import get_useragent
 import urllib
-
+import pandas as pd
 
 def _req(term, results, lang, start, proxies, timeout):
     resp = get(
@@ -14,30 +14,48 @@ def _req(term, results, lang, start, proxies, timeout):
         },
         params={
             "q": term,
-            "num": results + 2, # Prevents multiple requests
+            "num": results,
             "hl": lang,
             "start": start,
         },
         proxies=proxies,
         timeout=timeout,
     )
 
     resp.raise_for_status()
     return resp
 
+def to_df(search_results):
+    data = []
+    for index, result in enumerate(search_results):
+        result_data = {
+            'index': index + 1,
+            'url': result.url,
+            'title': result.title,
+            'description': result.description
+        }
+        if hasattr(result, 'is_sponsored'):
+            result_data['is_sponsored'] = result.is_sponsored
+        data.append(result_data)
+
+    df = pd.DataFrame(data)
+
+    return df
+
 class SearchResult:
-    def __init__(self, url, title, description):
+    def __init__(self, url, title, description, is_sponsored):
         self.url = url
         self.title = title
         self.description = description
+        self.is_sponsored = is_sponsored
 
     def __repr__(self):
-        return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
-
-
-def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5):
-    """Search the Google search engine"""
-
+        if self.is_sponsored:
+            return f"SearchResult(url={self.url}, title={self.title}, description={self.description}, is_sponsored={self.is_sponsored})"
+        else:
+            return f"SearchResult(url={self.url}, title={self.title}, description={self.description})"
+
+def search(term, sponsored=False, num_results=10, lang="en", proxy=None, advanced=False, sleep_interval=0, timeout=5):
     escaped_term = urllib.parse.quote_plus(term) # make 'site:xxx.xxx.xxx ' works.
 
     # Proxy
@@ -52,29 +70,52 @@ def search(term, num_results=10, lang="en", proxy=None, advanced=False, sleep_in
     start = 0
     while start < num_results:
         # Send request
-        resp = _req(escaped_term, num_results - start,
+        resp = _req(escaped_term,num_results - start,
Review comment on the changed line: Why did you delete the space?
Reply: Could be from the formatter.
                     lang, start, proxies, timeout)
 
         # Parse
         soup = BeautifulSoup(resp.text, "html.parser")
+
+        # Check for sponsored results
+        if sponsored:
+            sponsored_block = soup.find_all("div", attrs={"class": "vdQmEd"})
+            if len(sponsored_block) == 0:
+                start += 1
+            for sponsored_result in sponsored_block:
+                link = sponsored_result.find("a", href=True,attrs={"class":"sVXRqc"}).get("href")
+                title = sponsored_result.find("span", attrs={"class":"OSrXXb"})
+                description_box = sponsored_result.find(lambda tag: tag.name == 'span' and not tag.has_attr('class'))
+
+                if description_box:
+                    description = description_box.text
+                    if link and title and description:
+                        start += 1
+                        if advanced:
+                            yield SearchResult(link, title.text, description, True)
+                        else:
+                            yield link
+
+        # Check for not sponsored results
         result_block = soup.find_all("div", attrs={"class": "g"})
-        if len(result_block) ==0:
+        if len(result_block) == 0:
             start += 1
         for result in result_block:
             # Find link, title, description
-            link = result.find("a", href=True)
+            link = result.find("a", href=True).get("href")
             title = result.find("h3")
             description_box = result.find(
                 "div", {"style": "-webkit-line-clamp:2"})
 
             if description_box:
                 description = description_box.text
                 if link and title and description:
                     start += 1
                     if advanced:
-                        yield SearchResult(link["href"], title.text, description)
+                        yield SearchResult(link, title.text, description, False)
                     else:
-                        yield link["href"]
+                        yield link
         sleep(sleep_interval)
 
     if start == 0:
         return []
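For anyone trying this branch out, here is a rough usage sketch of the new sponsored flag and the to_df helper. It assumes the edited module is the package's top-level googlesearch module (as in the current layout); the query string and argument values are arbitrary examples, and the sponsored-result CSS classes scraped above are Google's current markup, so results may vary.

    from googlesearch import search, to_df

    # advanced=True yields SearchResult objects, which is what to_df expects;
    # sponsored=True additionally scrapes Google's ad blocks.
    results = list(search("web scraping tutorial", num_results=10,
                          sponsored=True, advanced=True, sleep_interval=1))

    # One row per result: index, url, title, description, is_sponsored.
    df = to_df(results)
    print(df.head())

Note that with advanced=False the generator yields plain URL strings, so to_df (which reads result.url, result.title and result.description) only makes sense together with advanced=True.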
requirements.txt
@@ -1,2 +1,3 @@
 beautifulsoup4>=4.9
 requests>=2.20
+pandas
Review comment: Probably add a version, just to make sure it works in the future and we have fewer version issues.
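A pinned entry would just follow the style of the existing lines, for example (the exact floor is only a suggestion; to_df only needs DataFrame construction from a list of dicts, which any recent pandas release provides):

    pandas>=1.3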
Did you add this to requirements.txt?
Just did :)