
Commit

bug fixed
saeid93 committed Dec 13, 2022
1 parent 75f7a94 commit 11709ce
Showing 13 changed files with 56 additions and 34 deletions.
8 changes: 4 additions & 4 deletions data/configs/generation-configs/cluster.json
@@ -2,11 +2,11 @@
"notes":"small cluster to fix the namings",
"fixed_size_cluster": true,
"nums": {
-"nodes": 2,
-"services": 60,
+"nodes": 4,
+"services": 180,
"resources": 2,
"services_types": 3,
-"services_types_map": [20, 20, 20]
+"services_types_map": [60, 60, 60]
},
"metrics": {
"ram":"mb",
@@ -31,7 +31,7 @@
"services_request_rng": {
"0":
{
-"num": 60,
+"num": 180,
"ram": {
"min": 1250,
"max": 2500,
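The cluster generation config is scaled up from 2 nodes / 60 services to 4 nodes / 180 services, with services_types_map and the request-rng "num" bumped to match. A quick consistency check of those fields, as a hedged Python sketch: the field meanings are inferred from this diff alone, and services_request_rng is assumed to sit at the top level of the JSON.

import json

def check_cluster_config(path: str) -> None:
    # Sketch only: asserts the relationships implied by the values in this diff.
    with open(path) as f:
        cfg = json.load(f)
    nums = cfg["nums"]
    # services_types_map appears to partition the services across service types
    assert sum(nums["services_types_map"]) == nums["services"]          # 60+60+60 == 180
    assert len(nums["services_types_map"]) == nums["services_types"]    # 3 entries
    # the request generator's "num" seems to track the total service count
    assert cfg["services_request_rng"]["0"]["num"] == nums["services"]  # 180 == 180
    print("cluster config looks internally consistent")

# hypothetical usage:
# check_cluster_config("data/configs/generation-configs/cluster.json")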
2 changes: 1 addition & 1 deletion data/configs/generation-configs/workload_arabesque.json
@@ -1,7 +1,7 @@
{
"workload_type": "arabesque",
"notes": "arabesque dataset usage",
-"cluster_id": 5,
+"cluster_id": 6,
"num_services":60,
"min_timesteps": 100,
"plot_smoothing":301,
2 changes: 1 addition & 1 deletion data/configs/generation-configs/workload_random.json
@@ -1,7 +1,7 @@
{
"workload_type": "random",
"notes": "resoure usage",
-"cluster_id": 5,
+"cluster_id": 7,
"timesteps": 100,
"services_types": 3,
"workloads_var" : {
8 changes: 4 additions & 4 deletions data/configs/train/DQN.json
@@ -33,7 +33,7 @@
"discrete_actions": false,
"backlog_size": 2,
"seed": 1204,
-"target_utilization": {"grid_search": [[0, 0], [0.7, 0.7], [1, 1]]},
+"target_utilization": [1, 1],
"job_arrival":{
"mode": "fixed",
"interval": 5
@@ -48,16 +48,16 @@
},
"run_or_experiment": "DQN",
"learn_config": {
"framework": "torch",
"num_gpus": 1,
"train_batch_size": 200,
"model": {
-"fcnet_hiddens": [64, 64],
+"fcnet_hiddens": [128, 128, 128, 128],
"fcnet_activation": "linear",
"vf_share_layers": true
},
"gamma": 0.99,
"lr": 0.0003,
-"num_workers": 6,
+"num_workers": 20,
"observation_filter": "MeanStdFilter",
"seed": 203
},
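For DQN the target_utilization sweep is replaced by a fixed [1, 1], the policy network grows to four 128-unit layers, and num_workers goes from 6 to 20 with framework/num_gpus set for GPU training. A hedged sketch of how a learn_config block like this is typically handed to Ray Tune / RLlib; the key names come from the JSON above, but the surrounding plumbing is assumed and is not the repo's actual train.py.

import json
import ray
from ray import tune

# Sketch under assumptions: an older Ray/RLlib API where tune.run takes the
# algorithm name as a string, and DQN.json sits at the path shown in this diff.
with open("data/configs/train/DQN.json") as f:
    cfg = json.load(f)

ray.init(num_gpus=1)
tune.run(
    cfg["run_or_experiment"],      # "DQN"
    config={
        "env": "CartPole-v0",      # stand-in env; the repo trains on its scheduling env
        **cfg["learn_config"],     # framework, num_gpus, model, lr, num_workers, ...
    },
    stop={"training_iteration": 1},
)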
4 changes: 2 additions & 2 deletions data/configs/train/PG.json
@@ -33,7 +33,7 @@
"discrete_actions": false,
"backlog_size": 2,
"seed": 1204,
-"target_utilization": {"grid_search": [[0, 0], [0.7, 0.7], [1, 1]]},
+"target_utilization": [1, 1],
"job_arrival":{
"mode": "fixed",
"interval": 5
@@ -48,7 +48,7 @@
},
"run_or_experiment": "PG",
"learn_config": {
-"train_batch_size": 1000,
+"train_batch_size": 100,
"num_gpus": 1,
"model": {
"fcnet_hiddens": [64, 64],
6 changes: 3 additions & 3 deletions data/configs/train/PPO.json
@@ -28,12 +28,12 @@
"reward_var_p_2": 1.05,
"reward_option": "proposed",
"no_action_on_overloaded": true,
-"episode_length": 10,
+"episode_length": {"grid_search": [10, 20, 50]},
"max_services_nodes": 10,
"discrete_actions": false,
-"backlog_size": 2,
+"backlog_size": {"grid_search": [2, 4, 8]},
"seed": 1204,
-"target_utilization": {"grid_search": [[0, 0], [0.7, 0.7], [1, 1]]},
+"target_utilization": [1, 1],
"job_arrival":{
"mode": "fixed",
"interval": 5
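For PPO the sweep moves onto episode_length and backlog_size, each expressed as a {"grid_search": [...]} entry, while target_utilization is pinned to [1, 1]. Ray Tune expands every combination into its own trial, so these two axes alone give 3 x 3 = 9 trials. A minimal, self-contained illustration of that expansion with a toy trainable (older function-trainable Tune API, not the repo's environment):

from ray import tune

def trainable(config):
    # Toy objective just to show the expansion; the real run trains RLlib PPO.
    tune.report(score=config["episode_length"] * config["backlog_size"])

analysis = tune.run(
    trainable,
    config={
        "episode_length": tune.grid_search([10, 20, 50]),  # values from PPO.json
        "backlog_size": tune.grid_search([2, 4, 8]),       # values from PPO.json
    },
)
print(len(analysis.trials))   # 9 -- one trial per combination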
10 changes: 3 additions & 7 deletions experiments/analysis/check_env.py
@@ -51,7 +51,7 @@ def check_env(*, config: Dict[str, Any], type_env: str,
reward_total = []
while i < total_timesteps:
action = env.action_space.sample()
-action = 1
+# action = 1
_, reward, done, info = env.step(action)
if info['scheduling_timestep']:
print('scheudling timestep')
@@ -62,7 +62,7 @@ def check_env(*, config: Dict[str, Any], type_env: str,
# consolidation_rewards.append(consolidation_reward)
reward_total.append(reward)
env.render()
-if env.time == 362:
+if env.time % 964 == 0:
TEMP = 1
# episode_total_consolidation_reward += consolidation_reward
print("time: {}".format(
@@ -92,11 +92,7 @@ def check_env(*, config: Dict[str, Any], type_env: str,
'kube-scheduler', 'kube-binpacking',
'CartPole-v0', 'Pendulum-v0']),
default='sim-scheduler')
-<<<<<<< HEAD
-@click.option('--cluster-id', required=True, type=int, default=5)
-=======
-@click.option('--cluster-id', required=True, type=int, default=0)
->>>>>>> 1920b661fa3d4ba8714a148b71742bb3e8839f40
+@click.option('--cluster-id', required=True, type=int, default=6)
@click.option('--workload-id', required=True, type=int, default=0)
def main(type_env: str, cluster_id: int, workload_id: int):
"""[summary]
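In check_env.py the leftover merge-conflict markers around --cluster-id are resolved to default=6, the hard-coded action = 1 override is commented out so the rollout really samples random actions again, and the debugging breakpoint fires on env.time % 964 == 0 instead of a single timestep. A reduced sketch of that random-rollout loop on a stock Gym env; CartPole stands in for the repo's simulated scheduler, and the old 4-tuple step API (gym < 0.26) is assumed.

import gym

env = gym.make("CartPole-v0")
env.reset()
total_timesteps, i, reward_total = 50, 0, []
while i < total_timesteps:
    action = env.action_space.sample()        # random action, as in the fixed script
    _, reward, done, info = env.step(action)  # 4-tuple return (gym < 0.26)
    reward_total.append(reward)
    if done:
        env.reset()
    i += 1
print("total reward:", sum(reward_total))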
5 changes: 4 additions & 1 deletion experiments/cluster_generator/generate_workload.py
@@ -76,6 +76,7 @@ def generate_workload(notes: str, cluster_id: int,
start_workloads=start_workloads,
plot_smoothing=plot_smoothing,
seed=seed)
+num_services = len(cluster['services_types'])
workloads, figs = workload_generator.make_workloads()
# information of the generated workload
elif workload_type == 'arabesque':
@@ -87,6 +88,7 @@ def generate_workload(notes: str, cluster_id: int,
num_services=kwargs['num_services'],
plot_smoothing=plot_smoothing,
seed=seed)
+num_services = kwargs['num_services']
workloads, figs = workload_generator.make_workloads()
elif workload_type == 'alibaba':
b = 1
@@ -95,6 +97,7 @@ def generate_workload(notes: str, cluster_id: int,
'dataest_id': cluster_id,
'plot_smoothing': plot_smoothing,
'workload_type': workload_type,
+'num_services': num_services,
'seed': seed
}
workloads_save = {
@@ -118,7 +121,7 @@ def generate_workload(notes: str, cluster_id: int,
@click.command()
@click.option('--workload-type',
type=click.Choice(
-['random', 'arabesque', 'alibaba']), default='arabesque')
+['random', 'arabesque', 'alibaba']), default='random')
def main(workload_type: str):
# read the config file
config_file_path = os.path.join(
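generate_workload.py now computes num_services in both the random branch (from the cluster's services_types) and the arabesque branch (from the CLI kwargs) and records it in the saved workload info dict; the CLI default also flips back to the random workload type. A stripped-down sketch of that branching; the names follow the diff, everything around them is assumed.

from typing import Any, Dict

def build_workload_info(workload_type: str, cluster: Dict[str, Any],
                        **kwargs) -> Dict[str, Any]:
    # Mirror of the fix: num_services is bound on every branch that reaches
    # the info dict, instead of only existing in one code path.
    if workload_type == 'random':
        num_services = len(cluster['services_types'])    # as added in the diff
    elif workload_type == 'arabesque':
        num_services = kwargs['num_services']            # as added in the diff
    else:
        raise ValueError(f"unsupported workload_type: {workload_type}")
    return {
        'workload_type': workload_type,
        'num_services': num_services,   # new key saved alongside the workload
    }

# hypothetical usage:
# info = build_workload_info('random', {'services_types': [0, 1, 2] * 60})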
6 changes: 3 additions & 3 deletions experiments/training/train.py
@@ -116,7 +116,7 @@ def learner(*, local_mode: bool,
type_env not in ['CartPole-v0', 'Pendulum-v0']:
ray_config.update({'callbacks': CloudCallback})

-ray.init(local_mode=local_mode)
+ray.init(local_mode=local_mode, num_gpus=1)
# run the ML after fixing the folders structres
_ = tune.run(local_dir=this_experiment_folder,
run_or_experiment=run_or_experiment,
@@ -144,12 +144,12 @@ def learner(*, local_mode: bool,
@click.command()
@click.option('--local-mode', type=bool, default=False)
@click.option('--config-file', type=str, default='PPO-debug')
-@click.option('--series', required=True, type=int, default=2)
+@click.option('--series', required=True, type=int, default=3)
@click.option('--type-env', required=True,
type=click.Choice(['sim-scheduler', 'sim-binpacking',
'CartPole-v0', 'Pendulum-v0']),
default='sim-scheduler')
-@click.option('--cluster-id', required=True, type=int, default=0)
+@click.option('--cluster-id', required=True, type=int, default=5)
@click.option('--workload-id', required=True, type=int, default=0)
@click.option('--use-callback', required=True, type=bool, default=True)
@click.option('--checkpoint-freq', required=False, type=int, default=100)
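train.py now passes num_gpus=1 to ray.init and bumps the default series and cluster-id. Worth noting the split of responsibilities: ray.init(num_gpus=...) tells the Ray scheduler how many GPUs exist, while the num_gpus entry inside the trainer config (see the learn_config blocks above) is what actually reserves the GPU for the trainer. A hedged minimal pairing, assuming one visible GPU and a stand-in env:

import ray
from ray import tune

ray.init(local_mode=False, num_gpus=1)   # declare one GPU to the Ray scheduler

tune.run(
    "PPO",                     # or "DQN"/"PG", matching the configs in this commit
    config={
        "env": "CartPole-v0",  # stand-in; the repo uses its scheduling env
        "num_gpus": 1,         # this setting places the trainer on the GPU
        "num_workers": 2,
    },
    stop={"training_iteration": 1},
)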
15 changes: 14 additions & 1 deletion logs.txt
@@ -1,5 +1,18 @@
GOALS:
1. get to a working experiment on larger clusters
- find out why it stops on a certain point
2. run on gpu
- try the other server


Series 1:
started with cluster 5 with 2 servers and train and test on both datasets to setup everything

Series 2:
checking the effect of different target utilizations

series 4:
Tried PG on a larger dataset with smaller batch size training

series 5:
Tried DQN to just get to a working state
17 changes: 10 additions & 7 deletions smart_scheduler/src/smart_scheduler/cluster/node.py
@@ -1,4 +1,5 @@
import numpy as np
+from copy import deepcopy
from .service import Service
from typing import List

@@ -21,9 +22,10 @@ def clock_tick(self):
"""
self.time += 1
list(map(lambda a: a.clock_tick(), self.services))
-for service_index, service in enumerate(self.services):
+services_copy = deepcopy(self.services)
+for service in services_copy:
if service.done:
-self.deschedule(service_index)
+self.deschedule(service)

def reset_node(self):
self.time = 0
@@ -42,12 +44,13 @@ def add_service(self, service: Service) -> bool:
self.services.append(service)
return True

-def deschedule(self, service_index):
-# TODO debug
+def deschedule(self, service: Service):
# schedule the service on the node
-self.served_services.append(service_index)
-# remove the service from the pending services
-self.services.pop(service_index)
+self.served_services.append(service.service_id)
+# remove the service from the node services
+for svc_index, svc in enumerate(self.services):
+if svc.service_id == service.service_id:
+self.services.pop(svc_index)

@property
def requests(self):
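This node.py change is the heart of the "bug fixed" commit. The old clock_tick descheduled by index while iterating over self.services, so every pop shifted the remaining indices and done services could be skipped, and served_services ended up holding indices rather than IDs. The new code walks a deepcopy and removes by service_id. A self-contained sketch of the failure mode and the fix, using a stand-in dataclass rather than the repo's Service:

from copy import deepcopy
from dataclasses import dataclass

@dataclass
class FakeService:           # stand-in for smart_scheduler's Service
    service_id: int
    done: bool

# Buggy pattern: popping by index while enumerating the same list.
buggy = [FakeService(0, True), FakeService(1, True), FakeService(2, False)]
for i, svc in enumerate(buggy):
    if svc.done:
        buggy.pop(i)         # shifts the rest; the next done service gets skipped
print([s.service_id for s in buggy])   # [1, 2] -- done service 1 survived

# Fixed pattern, as in the commit: iterate a copy, deschedule by identity.
fixed = [FakeService(0, True), FakeService(1, True), FakeService(2, False)]
for svc in deepcopy(fixed):
    if svc.done:
        for i, live in enumerate(fixed):
            if live.service_id == svc.service_id:
                fixed.pop(i)
                break        # the commit has no break; unique service_ids are assumed
print([s.service_id for s in fixed])   # [2] -- both done services removed

Iterating over a copy (or collecting the finished services first and removing them afterwards) is the standard way to avoid mutating a list mid-iteration, and recording service_id instead of a list index keeps served_services meaningful after removals.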
5 changes: 5 additions & 0 deletions smart_scheduler/src/smart_scheduler/cluster/service.py
@@ -25,6 +25,10 @@ def clock_tick(self):
self.time += 1

def start_time_update(self, start_time):
+"""
+start_time: cluster time that this service has started
+time: internal service timestep
+"""
self.start_time = start_time
self.time = start_time

@@ -48,6 +52,7 @@ def slack(self):
@property
def done(self):
if self.start_time + self.duration == self.time:
+# if self.duration == self.time:
return True
else:
return False
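The new docstring pins down the timing convention: start_time_update sets both start_time and the service's internal clock to the cluster time at which it was scheduled, so the done check start_time + duration == time fires exactly duration ticks after scheduling. The commented-out alternative (duration == time) would only hold if the internal clock started at zero. A tiny worked example of that lifecycle with a stand-in class (not the repo's Service):

class TinyService:
    """Stand-in that mirrors the timing convention described in service.py."""

    def __init__(self, duration: int):
        self.duration = duration
        self.start_time = 0
        self.time = 0

    def start_time_update(self, start_time: int):
        # cluster time at which the service started; the internal clock begins there
        self.start_time = start_time
        self.time = start_time

    def clock_tick(self):
        self.time += 1

    @property
    def done(self) -> bool:
        return self.start_time + self.duration == self.time

svc = TinyService(duration=3)
svc.start_time_update(start_time=10)   # scheduled at cluster time 10
for _ in range(3):
    svc.clock_tick()
print(svc.done)                        # True: 10 + 3 == 13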
@@ -221,6 +221,8 @@ def __init__(self, config: Dict[str, Any]):
limits=limits,
workload=service_workload,
serving_time=serving_time))
+# import random
+# random.shuffle(self.pending_services)

if sim_type == 'arabesque':
self.services_resources_request = np.array(
