-
Notifications
You must be signed in to change notification settings - Fork 4
Expand file tree
/
Copy pathScraper.py
More file actions
39 lines (35 loc) · 989 Bytes
/
Scraper.py
File metadata and controls
39 lines (35 loc) · 989 Bytes
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
import threading
import Queue
from time import sleep
import urllib
class ScrapeThread(threading.Thread):
    """Worker thread that pulls URLs from an input queue, fetches their HTML
    (optionally through an HTTP proxy) and pushes the results downstream.

    Queues and collaborators:
        queue     -- input queue of URL objects (each must support .geturl())
        out_queue -- receives the fetched HTML of each page
        bucket    -- receives each URL object after it has been fetched
        worker    -- producer thread; this consumer keeps polling an empty
                     queue while the producer is still alive, and exits once
                     the queue is drained and the producer has died
        proxy / proxy_port -- HTTP proxy host and port; a falsy `proxy`
                     means fetch directly
    """

    def __init__(self, queue, out_queue, bucket, proxy, proxy_port, worker):
        threading.Thread.__init__(self)
        self.proxy = proxy            # proxy host, or falsy for direct fetch
        self.proxy_port = proxy_port
        self.queue = queue
        self.bucket = bucket
        self.out_queue = out_queue
        self.worker = worker          # producer thread watched for liveness

    def run(self):
        """Consume URLs until the queue is empty and the producer is dead."""
        while True:
            if self.queue.empty():
                sleep(0.2)
                if self.worker.isAlive():
                    continue          # producer may still enqueue more work
                else:
                    return True       # drained and producer finished: exit
            else:
                try:
                    url = self.queue.get_nowait()
                except Queue.Empty:
                    # Another consumer raced us to the last item; back off.
                    sleep(0.2)
                    continue
                self.read_url(url)
                self.queue.task_done()

    def read_url(self, url, proxy=None):
        """Fetch `url` and enqueue its HTML on out_queue, then the URL on bucket.

        `url` is a parsed-URL object exposing .geturl(). `proxy` is an
        optional mapping passed to urllib as the proxies table.

        Bug fix: the original signature used a mutable default (`proxy={}`),
        so one shared dict was mutated by every call on every instance;
        a fresh dict is now built per call.
        """
        if proxy is None:
            proxy = {}
        if self.proxy:
            proxy['http'] = 'http://' + str(self.proxy) + ":" + str(self.proxy_port)
        html = urllib.urlopen(url.geturl(), proxies=proxy).read()
        self.out_queue.put(html)
        self.bucket.put(url)