
Commit

bug fixed
saeid93 committed Dec 13, 2022
1 parent 75f7a94 commit 11709ce
Showing 13 changed files with 56 additions and 34 deletions.
8 changes: 4 additions & 4 deletions data/configs/generation-configs/cluster.json
@@ -2,11 +2,11 @@
"notes":"small cluster to fix the namings",
"fixed_size_cluster": true,
"nums": {
-"nodes": 2,
-"services": 60,
+"nodes": 4,
+"services": 180,
"resources": 2,
"services_types": 3,
-"services_types_map": [20, 20, 20]
+"services_types_map": [60, 60, 60]
},
"metrics": {
"ram":"mb",
@@ -31,7 +31,7 @@
"services_request_rng": {
"0":
{
-"num": 60,
+"num": 180,
"ram": {
"min": 1250,
"max": 2500,
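The cluster generation config is scaled up from 2 nodes / 60 services to 4 nodes / 180 services, with services_types_map and the request-rng "num" bumped to match. A quick consistency check of those fields, as a hedged Python sketch: the field meanings are inferred from this diff alone, and services_request_rng is assumed to sit at the top level of the JSON.

import json

def check_cluster_config(path: str) -> None:
    # Sketch only: asserts the relationships implied by the values in this diff.
    with open(path) as f:
        cfg = json.load(f)
    nums = cfg["nums"]
    # services_types_map appears to partition the services across service types
    assert sum(nums["services_types_map"]) == nums["services"]          # 60+60+60 == 180
    assert len(nums["services_types_map"]) == nums["services_types"]    # 3 entries
    # the request generator's "num" seems to track the total service count
    assert cfg["services_request_rng"]["0"]["num"] == nums["services"]  # 180 == 180
    print("cluster config looks internally consistent")

# hypothetical usage:
# check_cluster_config("data/configs/generation-configs/cluster.json")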
2 changes: 1 addition & 1 deletion data/configs/generation-configs/workload_arabesque.json
@@ -1,7 +1,7 @@
{
"workload_type": "arabesque",
"notes": "arabesque dataset usage",
-"cluster_id": 5,
+"cluster_id": 6,
"num_services":60,
"min_timesteps": 100,
"plot_smoothing":301,
2 changes: 1 addition & 1 deletion data/configs/generation-configs/workload_random.json
@@ -1,7 +1,7 @@
{
"workload_type": "random",
"notes": "resoure usage",
-"cluster_id": 5,
+"cluster_id": 7,
"timesteps": 100,
"services_types": 3,
"workloads_var" : {
8 changes: 4 additions & 4 deletions data/configs/train/DQN.json
@@ -33,7 +33,7 @@
"discrete_actions": false,
"backlog_size": 2,
"seed": 1204,
-"target_utilization": {"grid_search": [[0, 0], [0.7, 0.7], [1, 1]]},
+"target_utilization": [1, 1],
"job_arrival":{
"mode": "fixed",
"interval": 5
@@ -48,16 +48,16 @@
},
"run_or_experiment": "DQN",
"learn_config": {
"framework": "torch",
"num_gpus": 1,
"train_batch_size": 200,
"model": {
-"fcnet_hiddens": [64, 64],
+"fcnet_hiddens": [128, 128, 128, 128],
"fcnet_activation": "linear",
"vf_share_layers": true
},
"gamma": 0.99,
"lr": 0.0003,
-"num_workers": 6,
+"num_workers": 20,
"observation_filter": "MeanStdFilter",
"seed": 203
},
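For DQN the target_utilization sweep is replaced by a fixed [1, 1], the policy network grows to four 128-unit layers, and num_workers goes from 6 to 20 with framework/num_gpus set for GPU training. A hedged sketch of how a learn_config block like this is typically handed to Ray Tune / RLlib; the key names come from the JSON above, but the surrounding plumbing is assumed and is not the repo's actual train.py.

import json
import ray
from ray import tune

# Sketch under assumptions: an older Ray/RLlib API where tune.run takes the
# algorithm name as a string, and DQN.json sits at the path shown in this diff.
with open("data/configs/train/DQN.json") as f:
    cfg = json.load(f)

ray.init(num_gpus=1)
tune.run(
    cfg["run_or_experiment"],      # "DQN"
    config={
        "env": "CartPole-v0",      # stand-in env; the repo trains on its scheduling env
        **cfg["learn_config"],     # framework, num_gpus, model, lr, num_workers, ...
    },
    stop={"training_iteration": 1},
)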
4 changes: 2 additions & 2 deletions data/configs/train/PG.json
@@ -33,7 +33,7 @@
"discrete_actions": false,
"backlog_size": 2,
"seed": 1204,
-"target_utilization": {"grid_search": [[0, 0], [0.7, 0.7], [1, 1]]},
+"target_utilization": [1, 1],
"job_arrival":{
"mode": "fixed",
"interval": 5
@@ -48,7 +48,7 @@
},
"run_or_experiment": "PG",
"learn_config": {
-"train_batch_size": 1000,
+"train_batch_size": 100,
"num_gpus": 1,
"model": {
"fcnet_hiddens": [64, 64],
6 changes: 3 additions & 3 deletions data/configs/train/PPO.json
@@ -28,12 +28,12 @@
"reward_var_p_2": 1.05,
"reward_option": "proposed",
"no_action_on_overloaded": true,
-"episode_length": 10,
+"episode_length": {"grid_search": [10, 20, 50]},
"max_services_nodes": 10,
"discrete_actions": false,
-"backlog_size": 2,
+"backlog_size": {"grid_search": [2, 4, 8]},
"seed": 1204,
-"target_utilization": {"grid_search": [[0, 0], [0.7, 0.7], [1, 1]]},
+"target_utilization": [1, 1],
"job_arrival":{
"mode": "fixed",
"interval": 5
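For PPO the sweep moves onto episode_length and backlog_size, each expressed as a {"grid_search": [...]} entry, while target_utilization is pinned to [1, 1]. Ray Tune expands every combination into its own trial, so these two axes alone give 3 x 3 = 9 trials. A minimal, self-contained illustration of that expansion with a toy trainable (older function-trainable Tune API, not the repo's environment):

from ray import tune

def trainable(config):
    # Toy objective just to show the expansion; the real run trains RLlib PPO.
    tune.report(score=config["episode_length"] * config["backlog_size"])

analysis = tune.run(
    trainable,
    config={
        "episode_length": tune.grid_search([10, 20, 50]),  # values from PPO.json
        "backlog_size": tune.grid_search([2, 4, 8]),       # values from PPO.json
    },
)
print(len(analysis.trials))   # 9 -- one trial per combination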
10 changes: 3 additions & 7 deletions experiments/analysis/check_env.py
@@ -51,7 +51,7 @@ def check_env(*, config: Dict[str, Any], type_env: str,
reward_total = []
while i < total_timesteps:
action = env.action_space.sample()
-action = 1
+# action = 1
_, reward, done, info = env.step(action)
if info['scheduling_timestep']:
print('scheudling timestep')
@@ -62,7 +62,7 @@ def check_env(*, config: Dict[str, Any], type_env: str,
# consolidation_rewards.append(consolidation_reward)
reward_total.append(reward)
env.render()
-if env.time == 362:
+if env.time % 964 == 0:
TEMP = 1
# episode_total_consolidation_reward += consolidation_reward
print("time: {}".format(
@@ -92,11 +92,7 @@ def check_env(*, config: Dict[str, Any], type_env: str,
'kube-scheduler', 'kube-binpacking',
'CartPole-v0', 'Pendulum-v0']),
default='sim-scheduler')
-<<<<<<< HEAD
-@click.option('--cluster-id', required=True, type=int, default=5)
-=======
-@click.option('--cluster-id', required=True, type=int, default=0)
->>>>>>> 1920b661fa3d4ba8714a148b71742bb3e8839f40
+@click.option('--cluster-id', required=True, type=int, default=6)
@click.option('--workload-id', required=True, type=int, default=0)
def main(type_env: str, cluster_id: int, workload_id: int):
"""[summary]
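In check_env.py the leftover merge-conflict markers around --cluster-id are resolved to default=6, the hard-coded action = 1 override is commented out so the rollout really samples random actions again, and the debugging breakpoint fires on env.time % 964 == 0 instead of a single timestep. A reduced sketch of that random-rollout loop on a stock Gym env; CartPole stands in for the repo's simulated scheduler, and the old 4-tuple step API (gym < 0.26) is assumed.

import gym

env = gym.make("CartPole-v0")
env.reset()
total_timesteps, i, reward_total = 50, 0, []
while i < total_timesteps:
    action = env.action_space.sample()        # random action, as in the fixed script
    _, reward, done, info = env.step(action)  # 4-tuple return (gym < 0.26)
    reward_total.append(reward)
    if done:
        env.reset()
    i += 1
print("total reward:", sum(reward_total))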
5 changes: 4 additions & 1 deletion experiments/cluster_generator/generate_workload.py
@@ -76,6 +76,7 @@ def generate_workload(notes: str, cluster_id: int,
start_workloads=start_workloads,
plot_smoothing=plot_smoothing,
seed=seed)
+num_services = len(cluster['services_types'])
workloads, figs = workload_generator.make_workloads()
# information of the generated workload
elif workload_type == 'arabesque':
@@ -87,6 +88,7 @@ def generate_workload(notes: str, cluster_id: int,
num_services=kwargs['num_services'],
plot_smoothing=plot_smoothing,
seed=seed)
+num_services = kwargs['num_services']
workloads, figs = workload_generator.make_workloads()
elif workload_type == 'alibaba':
b = 1
@@ -95,6 +97,7 @@ def generate_workload(notes: str, cluster_id: int,
'dataest_id': cluster_id,
'plot_smoothing': plot_smoothing,
'workload_type': workload_type,
+'num_services': num_services,
'seed': seed
}
workloads_save = {
@@ -118,7 +121,7 @@ def generate_workload(notes: str, cluster_id: int,
@click.command()
@click.option('--workload-type',
type=click.Choice(
-['random', 'arabesque', 'alibaba']), default='arabesque')
+['random', 'arabesque', 'alibaba']), default='random')
def main(workload_type: str):
# read the config file
config_file_path = os.path.join(
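generate_workload.py now computes num_services in both the random branch (from the cluster's services_types) and the arabesque branch (from the CLI kwargs) and records it in the saved workload info dict; the CLI default also flips back to the random workload type. A stripped-down sketch of that branching; the names follow the diff, everything around them is assumed.

from typing import Any, Dict

def build_workload_info(workload_type: str, cluster: Dict[str, Any],
                        **kwargs) -> Dict[str, Any]:
    # Mirror of the fix: num_services is bound on every branch that reaches
    # the info dict, instead of only existing in one code path.
    if workload_type == 'random':
        num_services = len(cluster['services_types'])    # as added in the diff
    elif workload_type == 'arabesque':
        num_services = kwargs['num_services']            # as added in the diff
    else:
        raise ValueError(f"unsupported workload_type: {workload_type}")
    return {
        'workload_type': workload_type,
        'num_services': num_services,   # new key saved alongside the workload
    }

# hypothetical usage:
# info = build_workload_info('random', {'services_types': [0, 1, 2] * 60})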
6 changes: 3 additions & 3 deletions experiments/training/train.py
@@ -116,7 +116,7 @@ def learner(*, local_mode: bool,
type_env not in ['CartPole-v0', 'Pendulum-v0']:
ray_config.update({'callbacks': CloudCallback})

-ray.init(local_mode=local_mode)
+ray.init(local_mode=local_mode, num_gpus=1)
# run the ML after fixing the folders structres
_ = tune.run(local_dir=this_experiment_folder,
run_or_experiment=run_or_experiment,
@@ -144,12 +144,12 @@ def learner(*, local_mode: bool,
@click.command()
@click.option('--local-mode', type=bool, default=False)
@click.option('--config-file', type=str, default='PPO-debug')
-@click.option('--series', required=True, type=int, default=2)
+@click.option('--series', required=True, type=int, default=3)
@click.option('--type-env', required=True,
type=click.Choice(['sim-scheduler', 'sim-binpacking',
'CartPole-v0', 'Pendulum-v0']),
default='sim-scheduler')
-@click.option('--cluster-id', required=True, type=int, default=0)
+@click.option('--cluster-id', required=True, type=int, default=5)
@click.option('--workload-id', required=True, type=int, default=0)
@click.option('--use-callback', required=True, type=bool, default=True)
@click.option('--checkpoint-freq', required=False, type=int, default=100)
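train.py now passes num_gpus=1 to ray.init and bumps the default series and cluster-id. Worth noting the split of responsibilities: ray.init(num_gpus=...) tells the Ray scheduler how many GPUs exist, while the num_gpus entry inside the trainer config (see the learn_config blocks above) is what actually reserves the GPU for the trainer. A hedged minimal pairing, assuming one visible GPU and a stand-in env:

import ray
from ray import tune

ray.init(local_mode=False, num_gpus=1)   # declare one GPU to the Ray scheduler

tune.run(
    "PPO",                     # or "DQN"/"PG", matching the configs in this commit
    config={
        "env": "CartPole-v0",  # stand-in; the repo uses its scheduling env
        "num_gpus": 1,         # this setting places the trainer on the GPU
        "num_workers": 2,
    },
    stop={"training_iteration": 1},
)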
15 changes: 14 additions & 1 deletion logs.txt
@@ -1,5 +1,18 @@
GOALS:
1. get to a working experiment on larger clusters
- find out why it stops on a certain point
2. run on gpu
- try the other server


Series 1:
started with cluster 5 with 2 servers and train and test on both datasets to setup everything

Series 2:
checking the effect of different target utilizations

series 4:
Tried PG on a larger dataset with smaller batch size training

series 5:
Tried DQN to just get to a working state
17 changes: 10 additions & 7 deletions smart_scheduler/src/smart_scheduler/cluster/node.py
@@ -1,4 +1,5 @@
import numpy as np
+from copy import deepcopy
from .service import Service
from typing import List

@@ -21,9 +22,10 @@ def clock_tick(self):
"""
self.time += 1
list(map(lambda a: a.clock_tick(), self.services))
-for service_index, service in enumerate(self.services):
+services_copy = deepcopy(self.services)
+for service in services_copy:
if service.done:
-self.deschedule(service_index)
+self.deschedule(service)

def reset_node(self):
self.time = 0
@@ -42,12 +44,13 @@ def add_service(self, service: Service) -> bool:
self.services.append(service)
return True

-def deschedule(self, service_index):
-# TODO debug
+def deschedule(self, service: Service):
# schedule the service on the node
-self.served_services.append(service_index)
-# remove the service from the pending services
-self.services.pop(service_index)
+self.served_services.append(service.service_id)
+# remove the service from the node services
+for svc_index, svc in enumerate(self.services):
+if svc.service_id == service.service_id:
+self.services.pop(svc_index)

@property
def requests(self):
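This node.py change is the heart of the "bug fixed" commit. The old clock_tick descheduled by index while iterating over self.services, so every pop shifted the remaining indices and done services could be skipped, and served_services ended up holding indices rather than IDs. The new code walks a deepcopy and removes by service_id. A self-contained sketch of the failure mode and the fix, using a stand-in dataclass rather than the repo's Service:

from copy import deepcopy
from dataclasses import dataclass

@dataclass
class FakeService:           # stand-in for smart_scheduler's Service
    service_id: int
    done: bool

# Buggy pattern: popping by index while enumerating the same list.
buggy = [FakeService(0, True), FakeService(1, True), FakeService(2, False)]
for i, svc in enumerate(buggy):
    if svc.done:
        buggy.pop(i)         # shifts the rest; the next done service gets skipped
print([s.service_id for s in buggy])   # [1, 2] -- done service 1 survived

# Fixed pattern, as in the commit: iterate a copy, deschedule by identity.
fixed = [FakeService(0, True), FakeService(1, True), FakeService(2, False)]
for svc in deepcopy(fixed):
    if svc.done:
        for i, live in enumerate(fixed):
            if live.service_id == svc.service_id:
                fixed.pop(i)
                break        # the commit has no break; unique service_ids are assumed
print([s.service_id for s in fixed])   # [2] -- both done services removed

Iterating over a copy (or collecting the finished services first and removing them afterwards) is the standard way to avoid mutating a list mid-iteration, and recording service_id instead of a list index keeps served_services meaningful after removals.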
5 changes: 5 additions & 0 deletions smart_scheduler/src/smart_scheduler/cluster/service.py
@@ -25,6 +25,10 @@ def clock_tick(self):
self.time += 1

def start_time_update(self, start_time):
+"""
+start_time: cluster time that this service has started
+time: internal service timestep
+"""
self.start_time = start_time
self.time = start_time

@@ -48,6 +52,7 @@ def slack(self):
@property
def done(self):
if self.start_time + self.duration == self.time:
+# if self.duration == self.time:
return True
else:
return False
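The new docstring pins down the timing convention: start_time_update sets both start_time and the service's internal clock to the cluster time at which it was scheduled, so the done check start_time + duration == time fires exactly duration ticks after scheduling. The commented-out alternative (duration == time) would only hold if the internal clock started at zero. A tiny worked example of that lifecycle with a stand-in class (not the repo's Service):

class TinyService:
    """Stand-in that mirrors the timing convention described in service.py."""

    def __init__(self, duration: int):
        self.duration = duration
        self.start_time = 0
        self.time = 0

    def start_time_update(self, start_time: int):
        # cluster time at which the service started; the internal clock begins there
        self.start_time = start_time
        self.time = start_time

    def clock_tick(self):
        self.time += 1

    @property
    def done(self) -> bool:
        return self.start_time + self.duration == self.time

svc = TinyService(duration=3)
svc.start_time_update(start_time=10)   # scheduled at cluster time 10
for _ in range(3):
    svc.clock_tick()
print(svc.done)                        # True: 10 + 3 == 13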
@@ -221,6 +221,8 @@ def __init__(self, config: Dict[str, Any]):
limits=limits,
workload=service_workload,
serving_time=serving_time))
+# import random
+# random.shuffle(self.pending_services)

if sim_type == 'arabesque':
self.services_resources_request = np.array(
