 def getAWSJsonOutput(cmd):
     process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)
     out, err = process.communicate()
-    requestInfo = json.loads(out)
+    try:
+        requestInfo = json.loads(out)
+    except ValueError as e:
+        if str(e) == "No JSON object could be decoded":
+            requestInfo = out
+        else:
+            # stderr is not piped above, so err is None; guard the concatenation
+            requestInfo = out + (err or '')
     return requestInfo

 def loadConfig(configFile):
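A quick usage sketch of this helper, hedged: the cluster name below is made up, and the key lookups assume the usual `aws ecs describe-clusters` response shape that this file also relies on. Because the helper falls back to raw stdout when the CLI emits non-JSON, callers should check the type of the return value.

# Hypothetical usage; 'MyCluster' is illustrative, not from this commit.
result = getAWSJsonOutput('aws ecs describe-clusters --clusters MyCluster')
if isinstance(result, dict):
    running = result['clusters'][0]['runningTasksCount']
else:
    # The helper fell back to raw CLI output instead of parsed JSON
    raise RuntimeError('CLI did not return JSON: ' + result)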
@@ -114,6 +120,17 @@ def removeClusterIfUnused(clusterName):
     if sum([result['clusters'][0]['pendingTasksCount'], result['clusters'][0]['runningTasksCount'], result['clusters'][0]['activeServicesCount']]) == 0:
         cmd = 'aws ecs delete-cluster --cluster ' + clusterName
         result = getAWSJsonOutput(cmd)
+
+def downscaleSpotFleet(queue, spotFleetID):
+    visible, nonvisible = queue.returnLoad()
+    if visible > 0:
+        return
+    else:
+        cmd = 'aws ec2 describe-spot-fleet-instances --spot-fleet-request-id ' + spotFleetID
+        status = getAWSJsonOutput(cmd)
+        if nonvisible < len(status['ActiveInstances']):
+            cmd = "aws ec2 modify-spot-fleet-request --excess-capacity-termination-policy NoTermination --target-capacity " + str(nonvisible) + " --spot-fleet-request-id " + spotFleetID
+            result = getAWSJsonOutput(cmd)

 #################################
 # CLASS TO HANDLE SQS QUEUE
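For readers using the SDK rather than the CLI, a minimal boto3 sketch of the same downscale logic, assuming configured AWS credentials (this commit shells out to the AWS CLI instead; the function name is mine):

import boto3

def downscale_sketch(spot_fleet_id, in_flight):
    ec2 = boto3.client('ec2')
    status = ec2.describe_spot_fleet_instances(SpotFleetRequestId=spot_fleet_id)
    if in_flight < len(status['ActiveInstances']):
        # noTermination lowers the target without force-killing instances;
        # excess machines simply are not replaced once they exit.
        ec2.modify_spot_fleet_request(
            SpotFleetRequestId=spot_fleet_id,
            TargetCapacity=in_flight,
            ExcessCapacityTerminationPolicy='noTermination',
        )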
@@ -149,6 +166,12 @@ def pendingLoad(self):
         else:
             return False

+    def returnLoad(self):
+        self.queue.load()
+        visible = int(self.queue.attributes['ApproximateNumberOfMessages'])
+        nonVis = int(self.queue.attributes['ApproximateNumberOfMessagesNotVisible'])
+        return visible, nonVis
+

 #################################
 # SERVICE 1: SUBMIT JOB
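returnLoad reads the two SQS counters that drive downscaling: ApproximateNumberOfMessages (visible, i.e. queued but not yet picked up) and ApproximateNumberOfMessagesNotVisible (received but not yet deleted, i.e. in flight). The commit uses the older boto Queue object, whose .load() refreshes .attributes; a boto3 equivalent, as an assumption-labeled sketch:

import boto3

def return_load_sketch(queue_url):
    # queue_url is hypothetical; the commit resolves the queue by name via boto
    sqs = boto3.client('sqs')
    attrs = sqs.get_queue_attributes(
        QueueUrl=queue_url,
        AttributeNames=['ApproximateNumberOfMessages',
                        'ApproximateNumberOfMessagesNotVisible'],
    )['Attributes']
    return (int(attrs['ApproximateNumberOfMessages']),
            int(attrs['ApproximateNumberOfMessagesNotVisible']))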
@@ -278,11 +301,17 @@ def monitor():
     # Step 1: Create job and count messages periodically
     queue = JobQueue(name=queueId)
     while queue.pendingLoad():
-        #Once a day check for terminated machines and delete their alarms.
-        #These records are only kept for 48 hours, which is why we don't just do it at the end
+        #Once an hour check for terminated machines and delete their alarms.
+        #This is slooooooow, which is why we don't just do it at the end
         curtime = datetime.datetime.now().strftime('%H%M')
-        if curtime == '1200':
+        if curtime[-2:] == '00':
             killdeadAlarms(fleetId, monitorapp)
+        #Once every 10 minutes, check if all jobs are in process, and if so scale the spot fleet size to match
+        #the number of jobs still in process WITHOUT force terminating them.
+        #This can help keep costs down if, for example, you start up 100+ machines to run a large job, and
+        #1-10 jobs with errors are keeping it rattling around for hours.
+        if curtime[-1:] == '9':
+            downscaleSpotFleet(queue, fleetId)
         time.sleep(MONITOR_TIME)

     # Step 2: When no messages are pending, stop service
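The scheduling above gates on the wall clock rather than a counter: a '%H%M' timestamp ending in '00' fires the hourly alarm sweep, and one ending in '9' fires the ten-minute downscale check. A small standalone sketch of that gating, with one caveat worth noting:

import datetime

curtime = datetime.datetime.now().strftime('%H%M')  # e.g. '1439'
hourly = (curtime[-2:] == '00')       # minute 00 of each hour
every_ten = (curtime[-1:] == '9')     # minutes 09, 19, ..., 59
# Caveat: if MONITOR_TIME is under 60 seconds, the loop can wake more than
# once within a matching minute and repeat the check.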
@@ -309,6 +338,7 @@ def monitor():
     for eachinstance in result['ActiveInstances']:
         delalarm = 'aws cloudwatch delete-alarms --alarm-name ' + monitorapp + '_' + eachinstance["InstanceId"]
         subprocess.Popen(delalarm.split())
+    time.sleep(3)
     killdeadAlarms(fleetId, monitorapp)

     # Step 4: Read spot fleet id and terminate all EC2 instances
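The added time.sleep(3) presumably gives the fire-and-forget Popen calls a moment to finish before the final killdeadAlarms sweep. A hedged alternative sketch that waits on the delete processes deterministically instead of sleeping (same variables as the loop above):

procs = []
for eachinstance in result['ActiveInstances']:
    delalarm = ('aws cloudwatch delete-alarms --alarm-name '
                + monitorapp + '_' + eachinstance['InstanceId'])
    procs.append(subprocess.Popen(delalarm.split()))
for p in procs:
    p.wait()  # block until every delete-alarms call has exited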