Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added an extension to the filter method #31

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
91 changes: 91 additions & 0 deletions proxyscrape/proxyfilter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#!/usr/bin/python3.8

"""Creates a proxy filter that filters proxies based on
different proxy properties."""

from .proxyscrape import (
create_collector,
get_collector
)
from .shared import Proxy
from .proxyscrape import Collector

import requests
import concurrent.futures

class Proxy_filter():
    """Filter a list of ``Proxy`` objects by different criteria.

    The wrapped list is validated on construction and on every call to
    ``set_proxies``: it must be a ``list`` whose items are all ``Proxy``
    instances.

    NOTE: for backward compatibility, ``Proxy_filter(bad_value)`` evaluates
    to ``None`` instead of raising, so callers must check the result before
    using it.
    """

    @staticmethod
    def _is_proxy_list(candidate):
        """Return True when *candidate* is a list containing only Proxy objects."""
        return isinstance(candidate, list) and all(
            isinstance(item, Proxy) for item in candidate
        )

    def __new__(cls, proxies):
        """Create an instance only when *proxies* passes validation.

        Returns:
            A new instance, or ``None`` when *proxies* is not a list of
            ``Proxy`` objects (in which case ``__init__`` is not run).
        """
        if not cls._is_proxy_list(proxies):
            return None
        # Allocate with ``cls`` (not the hard-coded base class) so that
        # subclasses receive an instance of their own type and get their
        # ``__init__`` invoked.
        return super().__new__(cls)

    def __init__(self, proxies):
        """Store the already-validated proxy list on the instance."""
        self.__proxies = proxies

    def uniqueCountry(self):
        """Return one proxy per country, keeping the first one seen.

        Proxies whose ``country`` is ``None`` are skipped. The relative
        order of the stored list is preserved.
        """
        unique = []
        seen_countries = set()  # O(1) membership instead of scanning a list
        for proxy in self.__proxies:
            country = proxy.country
            if country is None or country in seen_countries:
                continue
            seen_countries.add(country)
            unique.append(proxy)
        return unique

    def workingProxies(self, timeout=3, test_url='https://httpbin.org/ip'):
        """Concurrently check which proxies can reach *test_url*.

        A proxy counts as working when a GET request routed through it
        completes without raising; the HTTP status code is not inspected.

        Args:
            timeout: seconds to wait for each request (default 3).
            test_url: URL fetched through every proxy
                (default 'https://httpbin.org/ip').

        Returns:
            The working proxies, in the same order as the stored list.
        """
        def is_reachable(proxy):
            """Return True when *test_url* is fetched through *proxy* in time."""
            # f-string formatting also copes with a non-string port.
            address = f"{proxy.host}:{proxy.port}"
            try:
                requests.get(
                    test_url,
                    proxies={'http': address, 'https': address},
                    timeout=timeout,
                )
            except Exception:
                # Deliberate best-effort: any failure (timeout, refused
                # connection, malformed address, ...) means "not working".
                return False
            return True

        candidates = list(self.__proxies)
        with concurrent.futures.ThreadPoolExecutor() as executor:
            # ``map`` yields results in input order, so the returned list is
            # deterministic regardless of which thread finishes first.
            outcomes = list(executor.map(is_reachable, candidates))
        return [proxy for proxy, ok in zip(candidates, outcomes) if ok]

    def get_proxies(self):
        """Return the currently stored proxy list."""
        return self.__proxies

    def set_proxies(self, new_proxy_list):
        """Replace the stored proxy list.

        The call is silently ignored (the previous list is kept) when
        *new_proxy_list* is not a list of ``Proxy`` objects.
        """
        if not self._is_proxy_list(new_proxy_list):
            return
        self.__proxies = new_proxy_list

    # Attribute-style access to the same getter/setter pair.
    proxies = property(get_proxies, set_proxies)


if __name__ == '__main__':
    # Demo: collect proxies, keep the reachable ones, then keep a single
    # proxy per country.
    collector = create_collector('Proxy_collector', ['https', 'http'])
    proxies = collector.get_proxies()

    # Proxy_filter(...) evaluates to None when its argument is not a list of
    # Proxy objects, so stop with a clear message rather than failing later
    # with an AttributeError on None.
    filterer = Proxy_filter(proxies)
    if filterer is None:
        raise SystemExit('No valid proxy list was collected; nothing to filter.')

    # filter the working proxies only
    working_proxies = filterer.workingProxies()

    # Filter one ip from each country
    filterer.set_proxies(working_proxies)
    uniqs = filterer.uniqueCountry()

    print(uniqs, len(uniqs))