crawler2_entry.py
#!/usr/bin/python
# Entry point for the crawler REST API: accepts crawl jobs over HTTP,
# stores job state in MongoDB, and hands the crawl off to crawler2.py.
import sys, urllib, time, urlparse, mechanize, json, bottle, os, logging, subprocess
import uuid, OpenSSL
from getopt import getopt
from bottle import route, run, request, abort
from pymongo import Connection
from threading import Thread

# ENVIRONMENT / CONFIGURATION
env = {'MONGODB_HOST': 'localhost', 'MONGODB_PORT': 24077,
       'dbuser': 'root', 'dbpass': '123qwe',
       'python_path': '/usr/bin/env python',
       'post_process_path': 'crawler2.py'}
# CORE OBJECTS
connection = Connection(env['MONGODB_HOST'], int(env['MONGODB_PORT']))
db_handle = connection.admin
db_handle.authenticate(env['dbuser'], env['dbpass'])
db_handle = db_handle.urls
# LOGGING CONFIG
logging.basicConfig(level=logging.DEBUG)
# WORKER THREAD
# Launches the post-processing crawler script as a subprocess,
# passing the job ID followed by the URLs to crawl.
class PostThread(Thread):
    def __init__(self, job_id, urls):
        self.job_id = job_id
        self.urls = urls
        Thread.__init__(self)

    def run(self):
        # Build the command line for the post-processing crawler script.
        sp_call = env['python_path'] + ' ' + env['post_process_path']
        if self.job_id:
            sp_call = sp_call + ' ' + str(self.job_id)
        else:
            logging.error('Error: no job ID')
            sys.exit()
        if self.urls:
            for url in self.urls:
                sp_call = sp_call + ' ' + str(url)
        else:
            logging.error('Error: no URLs')
            sys.exit()
        print "Post thread starting with options %s" % sp_call
        subprocess.call(sp_call, shell=True)
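# A sketch of the command the thread builds, assuming the env values above
# (the job ID and URL shown here are hypothetical):
#   /usr/bin/env python crawler2.py 1a2b3c4 http://example.com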
# REST API - GET RESULTS OF JOB
# Returns a JSON document containing the image URLs found so far
# for a given job_id.
@route('/result/<job_id>', method='GET')
def getJobResults(job_id):
    logging.info('New GET results request received for job %s' % job_id)
    response = {}
    response['img_urls'] = []
    urls_crawled = db_handle.find({'job_id': job_id, 'keep_track_queue': {'$exists': 0}, 'keep_track_crawl': {'$exists': 0}})
    for url in urls_crawled:
        for image_url in url['img_links']:
            response['img_urls'].append(image_url)
    return json.dumps(response)
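# Example exchange (hypothetical job ID and data; the response shape
# follows the code above):
#   GET /result/1a2b3c4  ->  {"img_urls": ["http://example.com/a.png"]}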
# REST API - GET STATUS OF JOB
# Returns a JSON document containing the number of URLs crawled
# and the number waiting to be crawled for a given job_id. Note
# that the number of URLs waiting to be crawled will fluctuate
# as new child URLs are added and then processed.
@route('/status/<job_id>', method='GET')
def getJobStatus(job_id):
    logging.info('New GET status request received for job %s' % job_id)
    response = {}
    crawl_count = db_handle.find_one({'job_id': job_id, 'keep_track_crawl': 1, 'crawl_count': {'$exists': 1}})
    response['crawl_count'] = crawl_count['crawl_count']
    queue_count = db_handle.find_one({'job_id': job_id, 'keep_track_queue': 1, 'queue_count': {'$exists': 1}})
    response['queue_count'] = queue_count['queue_count']
    return json.dumps(response)
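# Example exchange (hypothetical job ID and counts):
#   GET /status/1a2b3c4  ->  {"crawl_count": 12, "queue_count": 3}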
# REST API - POST NEW JOB
# Requires a JSON document to be posted containing a list of
# URLs to be crawled. Assigns a unique job ID, starts a new
# job thread, then returns the job ID.
@route('/', method='POST')
def postNewJob():
    received_data = request.body.readline()
    logging.debug('New POST request received.')
    logging.debug('Running crawler2')
    # ERROR CHECKING ON RECEIVED REQUEST
    if not received_data:
        abort(400, 'No data received')
    try:
        processed_data = json.loads(received_data)
        urls = processed_data['urls']
    except:
        abort(400, "Request body requires a 'urls' key and a value list of URLs")
    try:
        for url in processed_data['urls']:
            print url
            u = urlparse.urlparse(url)
            if u.scheme != 'http':
                abort(400, "Only HTTP protocol supported.")
            if not u.netloc:
                abort(400, "Bad URL value list: missing netloc")
    except:
        abort(400, "Bad URL value list: can't parse")
    urls = processed_data['urls']
    # Assign a new unique job ID
    job_id = str(uuid.UUID(bytes=OpenSSL.rand.bytes(16)))[:7]
    while job_id in db_handle.find({}, {'job_id': 1, '_id': 0}):
        job_id = str(uuid.UUID(bytes=OpenSSL.rand.bytes(16)))[:7]
    logging.info('New job ID created: %s' % job_id)
    pt = PostThread(job_id, urls)
    pt.start()
    # Close the HTTP session and return the job ID
    time.sleep(1)
    return 'Your job ID is %s' % job_id
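# Example request (hypothetical URL; host and port match the run() call below):
#   curl -X POST http://33.33.33.10:8080/ -d '{"urls": ["http://example.com"]}'
# The response body looks like: Your job ID is 1a2b3c4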
if __name__ == "__main__":
    logging.info('Attempting to start webserver listening on 8080...')
    run(host='33.33.33.10', port=8080)