Add partial support for spys.one #22

Open · wants to merge 4 commits into master
proxyscrape/scrapers.py: 115 additions & 4 deletions
@@ -27,6 +27,8 @@
from bs4 import BeautifulSoup
from threading import Lock
import time
from collections import OrderedDict
from urllib.parse import urljoin

from .errors import (
    InvalidHTMLError,
@@ -258,6 +260,109 @@ def get_uk_proxies():
        raise InvalidHTMLError()


def get_spys_one(verbose=False):
    def parse_dict(script):
        # The page hides ports behind a block of inline-JS assignments
        # (e.g. "a1b2=3;c4d5=5^a1b2;..."); resolve each variable, XOR-ing
        # against an earlier one when the value references it.
        rows = script.split(";")
        new_rows = {}
        for row in rows:
            if '=' not in row:
                continue
            cells = row.split("=")
            if "^" in cells[1]:
                subcells = cells[1].split("^")
                subcells[1] = str(new_rows[subcells[1]])
                new_rows[cells[0]] = eval(subcells[0] + "^" + subcells[1], {"__builtins__": None}, {})
            else:
                new_rows[cells[0]] = cells[1]
        # Longest keys first, so the substring replacement in parse() never
        # clobbers a longer variable name with a shorter one.
        return OrderedDict(sorted(new_rows.items(), key=lambda x: len(x[0]), reverse=True))

    def parse(soup, source):
        proxies = set()
        anonymities = {'ANM', 'HIA'}
        ports_dict = parse_dict(soup.select("script")[3].text)
        trs = soup.select("tr.spy1xx, tr.spy1x")
        for row in trs:
            cells = row.select("td")
            if '.' not in cells[0].text:
                continue
            protocol = cells[1].text.split(" ")[0].lower()
            host = cells[0].text.split("<", 1)[0].split("document")[0]
            # The port is emitted via document.write(...+(a^b)+(c^d)...);
            # substitute the resolved variables, then evaluate the XOR
            # expression with builtins disabled. The leading "(" added below
            # rebalances the trailing ")" left over from document.write.
            port = cells[0].find("script").text.split("+", 1)[1]
            for key in ports_dict:
                port = port.replace(key, str(ports_dict[key]))
            port = eval("(" + port.replace("(", "str("), {"__builtins__": None}, {'str': str})
            code = None
            country = cells[3].find("a").text
            anonymous = cells[2].text in anonymities
            proxies.add(Proxy(host, port, code, country, anonymous, protocol, source))
        return proxies

    proxies = set()
    base_url = 'http://spys.one/en/free-proxy-list/'
    # http://spys.one/en/https-ssl-proxy/ is not supported yet
    urls = {'http://spys.one/en/free-proxy-list/', 'http://spys.one/en/anonymous-proxy-list/',
            'http://spys.one/en/non-anonymous-proxy-list/', 'http://spys.one/en/socks-proxy-list/',
            'http://spys.one/en/http-proxy-list/'}
    for url in urls:
        status = True
        while status:
            response = request_proxy_list(url)
            try:
                soup = BeautifulSoup(response.text, 'html.parser')
                proxies = proxies.union(parse(soup, 'spys-one'))
                # Follow the 'Next' pagination link until the last page.
                font = soup.select("font.spy14")[0]
                status = 'Next' in font.text
                if status:
                    url = font.find_parent("a")['href']
                    url = urljoin(base_url, url)
            except (AttributeError, KeyError, IndexError):
                raise InvalidHTMLError()
    return proxies
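
For reference, a minimal standalone sketch of the XOR deobfuscation that parse_dict and the port eval above perform; the variable names and values here are invented for illustration, not real spys.one output:

    # Invented sample: the real page defines dozens of these assignments.
    sample_script = "s3c5=3;x7k2=5^s3c5;y1p9=0^s3c5"
    table = {}
    for assignment in sample_script.split(";"):
        name, expr = assignment.split("=")
        if "^" in expr:
            left, right = expr.split("^")
            table[name] = int(left) ^ table[right]  # resolve against an earlier variable
        else:
            table[name] = int(expr)

    # Each XOR term contributes one digit of the port; str()-wrapping makes
    # "+" concatenate digit strings instead of adding numbers.
    port_expr = "(x7k2^s3c5)+(y1p9^s3c5)"
    for name, value in sorted(table.items(), key=lambda kv: len(kv[0]), reverse=True):
        port_expr = port_expr.replace(name, str(value))
    port = eval(port_expr.replace("(", "str("), {"__builtins__": None}, {"str": str})
    print(port)  # "50"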


def get_proxynova():
    def parse(soup):
        table = soup.find('table', {'id': 'tbl_proxy_list'})
        proxies = set()
        protocol = "http"
        for row in table.find('tbody').find_all('tr'):
            cells = row.find_all('td')
            if not cells[0].find("abbr"):
                continue
            # The host is obfuscated as document.write('<junk><octets>'.substr(8) + '<rest>');
            # split on the quotes, drop the 8 junk characters, and rejoin.
            parts = cells[0].find("abbr").text.split("'")
            host = parts[1][8:] + parts[3]
            port = cells[1].text.strip()
            code = cells[5].find("img")['alt'].lower()
            country = cells[5].find("a").text.split("\t")[0].lower()
            anonymous = cells[6].find("span").text.lower() in ('anonymous', 'elite')
            proxies.add(Proxy(host, port, code, country, anonymous, protocol, 'proxynova'))
        return proxies

    url = 'https://www.proxynova.com/proxy-server-list/'
    proxies = set()
    urls = set()
    response = request_proxy_list(url)
    soup = BeautifulSoup(response.content, 'html.parser')
    # Collect per-country listing pages that report at least one proxy.
    for a in soup.select("a"):
        try:
            if 'country-' in a['href'] and int(a.find("span").text[1:-1]) > 0:
                urls.add(urljoin(url, a['href']))
        except (KeyError, AttributeError, ValueError):
            pass

    status = True
    while status:
        try:
            proxies = proxies.union(parse(soup))
            status = len(urls)
            if status:
                url = urls.pop()
                response = request_proxy_list(url)
                soup = BeautifulSoup(response.content, 'html.parser')
        except (AttributeError, KeyError):
            raise InvalidHTMLError()
    return proxies
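
The two-part host split above undoes proxynova's document.write obfuscation, which hides the first octets behind a .substr(8) call; a toy example with made-up values:

    # Made-up abbr contents; the live markup differs in its junk prefix.
    abbr_text = "document.write('abcdefgh104.28.'.substr(8) + '12.34');"
    parts = abbr_text.split("'")
    host = parts[1][8:] + parts[3]  # drop 8 junk chars, append the second literal
    print(host)  # 104.28.12.34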


def get_us_proxies():
    url = 'https://www.us-proxy.org'
    response = request_proxy_list(url)
@@ -391,7 +496,9 @@ def get_resources():
    'socks-proxy': get_socks_proxies,
    'ssl-proxy': get_ssl_proxies,
    'uk-proxy': get_uk_proxies,
    'us-proxy': get_us_proxies,
    'spys-one': get_spys_one,
    'proxynova': get_proxynova
}

RESOURCE_TYPE_MAP = {
@@ -400,21 +507,25 @@
        'uk-proxy',
        'free-proxy-list',
        'proxy-daily-http',
        'anonymous-proxy',
        'spys-one',
        'proxynova'
    },
    'https': {
        'us-proxy',
        'uk-proxy',
        'free-proxy-list',
        'ssl-proxy',
        'anonymous-proxy',
        'spys-one'
    },
    'socks4': {
        'socks-proxy',
        'proxy-daily-socks4'
    },
    'socks5': {
        'socks-proxy',
        'proxy-daily-socks5',
        'spys-one'
    }
}
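
With both scrapers registered in RESOURCE_MAP and RESOURCE_TYPE_MAP, they should be reachable through the library's normal collector interface; a rough sketch, assuming the collector API from the project README and that filter keys mirror the Proxy fields:

    import proxyscrape

    # 'spys-one'/'proxynova' are the resource names added above; the
    # source filter here is an assumption, not something this PR tests.
    collector = proxyscrape.create_collector('demo', 'http')
    proxy = collector.get_proxy({'source': 'spys-one'})
    if proxy is not None:
        print(proxy.host, proxy.port, proxy.country)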
proxyscrape/shared.py: 2 additions & 2 deletions
@@ -36,9 +36,9 @@
Proxy = namedtuple('Proxy', ['host', 'port', 'code', 'country', 'anonymous', 'type', 'source'])


def request_proxy_list(url, **kwargs):
    try:
        response = requests.get(url, **kwargs)
    except requests.RequestException:
        raise RequestFailedError()

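The shared.py change forwards arbitrary keyword arguments to requests.get, so individual scrapers can now pass options such as timeouts or headers; for example (values illustrative):

    # Hypothetical call site; the timeout/header values are illustrative only.
    response = request_proxy_list('http://spys.one/en/free-proxy-list/',
                                  timeout=10,
                                  headers={'User-Agent': 'proxyscrape'})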