@@ -1,27 +1,23 @@
# For the proxy error and the cook_soup type specification
import requests
# For nap time if we're nice to the process and the site
from time import sleep
# For site request limit management
import logging
import threading
import time
# To parse the HTML documents
from time import sleep

import requests
from bs4 import BeautifulSoup
# For balancing client requests to the site
from stackoversight.scraping.site_balancer import SiteBalancer


class Site(object):
balancer = None
last_pause_time = None
class AbstractSite(object):

def __init__(self, sessions: list, timeout_sec: int, limit: int):
self.limit = limit
self.timeout_sec = timeout_sec

self.back_off = 0
self.pause_lock = threading.Lock()
self.last_pause_time = None

if not self.balancer:
balancer = SiteBalancer(sessions, timeout_sec, limit)
self.balancer = SiteBalancer(sessions, timeout_sec, limit)

def pause(self, pause_time):
if not pause_time and self.limit:
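As this hunk reads, the class is renamed from `Site` to `AbstractSite`, the `SiteBalancer` becomes a per-instance attribute built in `__init__` (the removed assignment under `if not self.balancer:` only bound a local name, so the class-level attribute never stuck), and a `pause_lock` is added for the threaded back-off handling further down. A minimal sketch of a concrete subclass, assuming only the constructor signature `(sessions, timeout_sec, limit)` shown here; the subclass name, its method bodies, and the idea that the balancer yields `requests.Session` objects are all hypothetical:

```python
import requests


class StackOverflowSite(AbstractSite):  # hypothetical subclass; AbstractSite is the class defined in this diff
    def handle_request(self, url, session):
        # Assumes the balancer hands out requests.Session-like objects;
        # in this codebase it may instead yield API keys.
        return session.get(url)

    def get_min_pause(self):
        # e.g. never exceed roughly 30 requests per second against the site
        return 1 / 30

    def clear_back_off(self):
        # no API-driven back-off in this sketch
        return 0

    def create_parent_link(self, *args):
        raise NotImplementedError

    def get_child_links(self, *args):
        raise NotImplementedError


# site = StackOverflowSite([requests.Session()], timeout_sec=86400, limit=10000)
```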
@@ -33,13 +29,6 @@ def pause(self, pause_time):
if pause_time < min_pause:
pause_time = min_pause

# can not wait less than the back off field if it is set
if pause_time < self.back_off:
pause_time = self.back_off

# returns the field to zero as it should be set only each time it is returned by the api
self.back_off = 0

# only wait the diff between the time already elapsed from the last request and the pause_time
if self.last_pause_time:
time_elapsed = time.time() - self.last_pause_time
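The timing logic here (continuing into the next hunk) sleeps only for whatever is left of the pause window: the time already elapsed since `last_pause_time` is subtracted, and the first call, with no timestamp yet, waits nothing. The lines doing the subtraction are collapsed between these hunks, so the sketch below is an assumption about that arithmetic rather than a copy of it:

```python
import time
from typing import Optional


def remaining_pause(pause_time: float, last_pause_time: Optional[float]) -> float:
    """Sketch of pause()'s elapsed-time handling."""
    if not last_pause_time:
        # mirrors the visible `else: pause_time = 0` branch:
        # nothing to wait out before the first timestamped request
        return 0.0
    time_elapsed = time.time() - last_pause_time
    # assumed clamp: never sleep a negative amount
    return max(0.0, pause_time - time_elapsed)


# e.g. with a 2.0 s window and ~1.5 s already elapsed since the last
# pause, roughly 0.5 s of sleep remains.
```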
@@ -50,30 +39,26 @@ def pause(self, pause_time):
else:
pause_time = 0

# initialize the last_pause_time field and sleep
sleep(pause_time)
self.last_pause_time = time.time()

return self.last_pause_time
with self.pause_lock:
# can not wait less than the back off field if it is set
# returns the field to zero as it should be set only each time it is returned by the api
# TODO: eventually this back_off field could be tied to the method called, and only threads using that
# method must wait the extra
back_off = self.clear_back_off()
if pause_time < back_off:
pause_time = back_off

def create_parent_link(self, *args):
raise NotImplementedError
logging.info(f'back_off in {threading.current_thread().getName()} is being handled')

def get_child_links(self, *args):
raise NotImplementedError

def handle_request(self, url, session):
raise NotImplementedError
# initialize the last_pause_time field and sleep
sleep(pause_time)
self.last_pause_time = time.time()

def get_min_pause(self):
raise NotImplementedError
return self.last_pause_time

def process_request(self, url: str, pause=False, pause_time=None):
# TODO: Set this up to wait on a signal from a timer thread so that it isn't a busy wait
# get the next id to use or wait until one is ready
while not self.balancer.is_ready():
sleep(1)
print("Waiting...")
self.balancer.ready.wait()

key = next(self.balancer)
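Two changes land in this hunk: the back-off handling now runs under `pause_lock` and goes through the new `clear_back_off()` hook, and `process_request` blocks on `self.balancer.ready.wait()` instead of polling `is_ready()` once a second. The sketch below shows the general `threading.Event` pattern this relies on; the attribute name `ready` comes from the diff, but how `SiteBalancer` actually sets and clears it is an assumption:

```python
import threading


class ReadySignal:
    """Minimal sketch of an event-based 'ready' gate like the one
    process_request() now waits on, replacing the sleep(1) polling loop."""

    def __init__(self, cooldown_sec: float):
        self.ready = threading.Event()
        self.ready.set()  # start out ready
        self._cooldown = cooldown_sec

    def capture(self):
        # a request was just made: close the gate, reopen it after the cooldown
        self.ready.clear()
        threading.Timer(self._cooldown, self.ready.set).start()


# consumer side, mirroring the new process_request():
#     signal.ready.wait()   # blocks until set(), no busy loop
```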

@@ -85,15 +70,30 @@ def process_request(self, url: str, pause=False, pause_time=None):
try:
response = self.handle_request(url, key)
except:
print("Make sure Archituethis is running or comment out setting the proxy environment variables!\n"
"Could also be an issue with your token?")
logging.critical(f'In {threading.current_thread().getName()} error while requesting {url}, '
f'raising exception.')
raise requests.exceptions.ProxyError

# mark the request as being made
request_count = self.balancer.capture()

return response, key, request_count

def create_parent_link(self, *args):
raise NotImplementedError

def get_child_links(self, *args):
raise NotImplementedError

def handle_request(self, url, session):
raise NotImplementedError

def get_min_pause(self):
raise NotImplementedError

def clear_back_off(self):
raise NotImplementedError

@staticmethod
def cook_soup(response: requests.Response):
return BeautifulSoup(response.text, 'html.parser')
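For reference, `cook_soup` only wraps `BeautifulSoup(response.text, 'html.parser')`, so any `requests.Response` can be handed to it. A usage sketch equivalent to calling the static method (the URL is a placeholder and the link extraction is illustrative, not part of this PR):

```python
import requests
from bs4 import BeautifulSoup

# Equivalent of AbstractSite.cook_soup(response) on an arbitrary page.
response = requests.get("https://example.com")  # placeholder URL
soup = BeautifulSoup(response.text, "html.parser")

# Illustrative: collect outgoing links from the parsed document.
links = [a.get("href") for a in soup.find_all("a") if a.get("href")]
print(links)
```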
stackoversight/scraping/gui.py (72 changes: 36 additions & 36 deletions)
@@ -1,36 +1,36 @@
# For gui
from tkinter import *

# TODO: setup as queue of parent links that this thread will continuously process
root = Tk()

# TODO: show list of processed and to be processed links in this frame
processing_frame = Frame(root)

to_be_processed_label = Label(processing_frame, text="Links to be processed")
has_been_processed_label = Label(processing_frame, text="Links that have been processed")

to_be_processed_label.grid(row=0, column=0)
has_been_processed_label.grid(row=0, column=1)

processing_frame.pack(side=BOTTOM)

# TODO: have all the configurable fields able to be set here
link_generation_frame = Frame(root)

# parent_link_entry = Entry(root)

fields = []
for field in site.fields:
var = StringVar()
label = Label(link_generation_frame, text=field.capitalize())
entry = Entry(link_generation_frame, textvariable=var)

label.grid(row=len(fields), column=0)
entry.grid(row=len(fields), column=1)

fields.append((field, var, label, entry))

link_generation_frame.pack(side=TOP)

root.mainloop()
# # For gui
# from tkinter import *
#
# # TODO: setup as queue of parent links that this thread will continuously process
# root = Tk()
#
# # TODO: show list of processed and to be processed links in this frame
# processing_frame = Frame(root)
#
# to_be_processed_label = Label(processing_frame, text="Links to be processed")
# has_been_processed_label = Label(processing_frame, text="Links that have been processed")
#
# to_be_processed_label.grid(row=0, column=0)
# has_been_processed_label.grid(row=0, column=1)
#
# processing_frame.pack(side=BOTTOM)
#
# # TODO: have all the configurable fields able to be set here
# link_generation_frame = Frame(root)
#
# # parent_link_entry = Entry(root)
#
# fields = []
# for field in site.fields:
# var = StringVar()
# label = Label(link_generation_frame, text=field.capitalize())
# entry = Entry(link_generation_frame, textvariable=var)
#
# label.grid(row=len(fields), column=0)
# entry.grid(row=len(fields), column=1)
#
# fields.append((field, var, label, entry))
#
# link_generation_frame.pack(side=TOP)
#
# root.mainloop()
stackoversight/scraping/queue_monitor.py (11 changes: 0 additions & 11 deletions)

This file was deleted.
