Skip to content

Commit

Permalink
mod dir structure
Browse files Browse the repository at this point in the history
  • Loading branch information
matthewygf committed Apr 4, 2019
1 parent be510d0 commit 90f6660
Show file tree
Hide file tree
Showing 45 changed files with 16,315 additions and 16,630 deletions.
64 changes: 12 additions & 52 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,52 +1,12 @@
# Prerequisites
*.d

# Object files
*.o
*.ko
*.obj
*.elf

# Linker output
*.ilk
*.map
*.exp

# Precompiled Headers
*.gch
*.pch

# Libraries
*.lib
*.a
*.la
*.lo

# Shared objects (inc. Windows DLLs)
*.dll
*.so
*.so.*
*.dylib

# Executables
*.exe
*.out
*.app
*.i*86
*.x86_64
*.hex

# Debug files
*.dSYM/
*.su
*.idb
*.pdb

# Kernel Module Compile Results
*.mod*
*.cmd
.tmp_versions/
modules.order
Module.symvers
Mkfile.old
dkms.conf
*.pyc
result*
__pycache__
test*
plot*
**/.idea
.idea
log/**/*.csv
**/*.log
**/*.json
*.json
*.vscode
122 changes: 61 additions & 61 deletions simulator/60_job.csv → 60_job.csv
Original file line number Diff line number Diff line change
@@ -1,61 +1,61 @@
job_id,num_gpu,submit_time,iterations,model_name,duration,interval
0,1,0,606,vgg19,164,30
1,8,30,133,vgg11,147,23
2,2,53,157,inception4,127,26
3,1,79,513,vgg16,121,20
4,4,99,138,vgg16,121,33
5,8,132,288,inception3,138,44
6,1,176,326,resnet152,121,28
7,2,204,192,resnet152,124,23
8,2,227,235,vgg16,155,25
9,1,252,822,resnet50,126,30
10,1,282,1501,resnet50,230,29
11,1,311,1360,vgg19,368,26
12,2,337,198,vgg19,131,26
13,8,363,581,resnet50,196,38
14,1,401,789,resnet50,121,33
15,1,434,3304,alexnet,180,37
16,4,471,391,alexnet,143,33
17,2,504,193,vgg11,122,27
18,1,531,3469,alexnet,189,23
19,1,554,892,googlenet,125,29
20,1,583,2275,vgg16,536,28
21,2,611,468,alexnet,126,34
22,1,645,473,resnet101,123,35
23,8,680,168,resnet152,121,27
24,4,707,864,googlenet,121,26
25,1,733,253,inception4,121,28
26,1,761,775,vgg11,121,32
27,8,793,109,vgg19,121,34
28,1,827,1085,googlenet,152,34
29,1,861,513,vgg16,121,31
30,1,892,903,vgg11,141,26
31,1,918,864,googlenet,121,26
32,8,944,237,resnet101,125,29
33,1,973,775,vgg11,121,35
34,8,1008,2178,googlenet,305,34
35,4,1042,143,vgg11,121,25
36,4,1067,175,resnet152,123,33
37,1,1100,326,resnet152,121,39
38,2,1139,274,inception3,121,26
39,1,1165,447,vgg19,121,29
40,1,1194,2239,alexnet,122,29
41,2,1223,410,resnet50,123,25
42,1,1248,1011,resnet101,263,26
43,1,1274,438,inception4,209,41
44,1,1315,607,resnet101,158,32
45,2,1347,258,resnet101,121,34
46,1,1381,521,inception3,121,33
47,8,1414,272,alexnet,121,33
48,8,1447,147,inception4,126,24
49,1,1471,4861,resnet152,1800,28
50,4,1499,242,resnet101,126,38
51,1,1537,272,inception4,130,43
52,4,1580,141,vgg19,121,32
53,1,1612,521,inception3,121,32
54,4,1644,362,inception3,171,32
55,8,1676,126,vgg16,135,25
56,2,1701,949,googlenet,133,28
57,4,1729,386,resnet50,128,21
58,4,1750,144,inception4,122,29
59,1,1779,525,inception3,122,33
job_id,num_gpu,submit_time,iterations,model_name,duration,interval
0,1,0,606,vgg19,164,30
1,8,30,133,vgg11,147,23
2,2,53,157,inception4,127,26
3,1,79,513,vgg16,121,20
4,4,99,138,vgg16,121,33
5,8,132,288,inception3,138,44
6,1,176,326,resnet152,121,28
7,2,204,192,resnet152,124,23
8,2,227,235,vgg16,155,25
9,1,252,822,resnet50,126,30
10,1,282,1501,resnet50,230,29
11,1,311,1360,vgg19,368,26
12,2,337,198,vgg19,131,26
13,8,363,581,resnet50,196,38
14,1,401,789,resnet50,121,33
15,1,434,3304,alexnet,180,37
16,4,471,391,alexnet,143,33
17,2,504,193,vgg11,122,27
18,1,531,3469,alexnet,189,23
19,1,554,892,googlenet,125,29
20,1,583,2275,vgg16,536,28
21,2,611,468,alexnet,126,34
22,1,645,473,resnet101,123,35
23,8,680,168,resnet152,121,27
24,4,707,864,googlenet,121,26
25,1,733,253,inception4,121,28
26,1,761,775,vgg11,121,32
27,8,793,109,vgg19,121,34
28,1,827,1085,googlenet,152,34
29,1,861,513,vgg16,121,31
30,1,892,903,vgg11,141,26
31,1,918,864,googlenet,121,26
32,8,944,237,resnet101,125,29
33,1,973,775,vgg11,121,35
34,8,1008,2178,googlenet,305,34
35,4,1042,143,vgg11,121,25
36,4,1067,175,resnet152,123,33
37,1,1100,326,resnet152,121,39
38,2,1139,274,inception3,121,26
39,1,1165,447,vgg19,121,29
40,1,1194,2239,alexnet,122,29
41,2,1223,410,resnet50,123,25
42,1,1248,1011,resnet101,263,26
43,1,1274,438,inception4,209,41
44,1,1315,607,resnet101,158,32
45,2,1347,258,resnet101,121,34
46,1,1381,521,inception3,121,33
47,8,1414,272,alexnet,121,33
48,8,1447,147,inception4,126,24
49,1,1471,4861,resnet152,1800,28
50,4,1499,242,resnet101,126,38
51,1,1537,272,inception4,130,43
52,4,1580,141,vgg19,121,32
53,1,1612,521,inception3,121,32
54,4,1644,362,inception3,171,32
55,8,1676,126,vgg16,135,25
56,2,1701,949,googlenet,133,28
57,4,1729,386,resnet50,128,21
58,4,1750,144,inception4,122,29
59,1,1779,525,inception3,122,33
77 changes: 73 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,77 @@
A GPU Cluster Simulator for Distributed Deep Learning Training using Deep Reinforcement Learning
====
GPU cluster simulator for distributed deep learning training
===
**NOTE**: Currently there are a couple of assumptions:
1. Homogeneous cluster setup
2. model gradients transfer is the same as the model size saved in ckpts (model_factory)
3. Parameter Server / Worker frameworks (All-reduce not yet implemented)

**Execution**
1. Before the execution, what's needed?
1. Infrastructure details
Define the hierarchy and resource capacity of the infrastructure in ``cluster_spec.csv``. For example, we have a cluster with 4 racks (switches). Under each rack (switch), there are 32 nodes. And each node has 128 CPU cores, 256 GB memory, and 8 GPUs. Then ``cluster_spec.csv`` will look like this:
```csv
num_switch,num_node_p_switch,num_gpu_p_node,num_cpu_p_node,mem_p_node
4,32,8,128,256
```
2. Job trace
The job trace to simulate. For each job, the simulator needs the following information:
* ``job_id``: for tracking
* ``num_gpu``: gpu requirement
    * ``submit_time``: when the job is submitted. The simulator is event-based and discrete-time; time values therefore start from ``0`` and are measured in seconds.
    * ``iterations``: the number of iterations to train. Used for network-cost calculation in data-parallel jobs.
* ``model_name``: what's the model in that job. This is used to estimate GPU memory usage, and network costs.
* ``duration``: how long this job will run. This information is used to generate job completion event by the simulator.
* ``interval``: job submission interval from this job to the next job


3. How to run the simulator?
A simple example of the execution command is:
```
python execute.py
```
Inside the execute file, the following options are necessary:
* ``--cluster_spec``: infrastructure spec file
* ``--trace_file``: job trace
* ``--scheme``: **placement scheme**
* ``--schedule``: **scheduler**
What's in this repository?
-----------
Optional inputs:
* ``--print``: print debug information
* ``--log_path``: the output path of the log (cluster, job). The default will be ``time-stamp`` folder under current path
4. What are the placement and scheduling algorithms provided?
*Placement*:
* ``yarn``: get GPUs from the same server nodes under the same switch
*Scheduling*
* ``fifo``
* ``sjf``: Smallest-job-first, in terms of GPU requirement
* **TODO BELOW**
* ``lpjf``: longest pending job first
* ``shorest``: shortest remaining time job first
* ``shorest-gpu``: shortest-remaining-gputime job first
* ``dlas``: discretized LAS (just time-based)
In ``jobs.py``, you need to specify ``num_queue`` and ``queue_limit`` for ``MLFQ`` (also for ``dlas-gpu``, and ``gittins``)
```python
# Example1: there are two queues, and the threshold for Q1 is 3600 seconds
self.queue_limit = [3600]
# Example2: there are four queues, and the threshold for queues is 3600, 7200, 18000 seconds
self.queue_limit = [3600, 7200, 18000]
```
* ``dlas-gpu``: discretized LAS (gpu-time-based)
* ``gittins``: discretized Gittins Index (gpu-time-based)
5. What's the output?
Based on the ``--log_path``, all the output files are in that folder (e.g., ``result-20190210-12-20-37``), including:
1. ``cluster.csv``: cluster-level resource utilization info at each event point
2. ``jobs.csv``: the job execution information
3. ``cpu.csv``, ``gpu.csv``, ``memory.csv``, ``network.csv``: those are the utilization details of each resource unit at event points. However, those logs are not accurate under some combinations of placement and scheduler. When ``count`` is chosen, those files are not generated.
The output logs are defined in ``log.py``; You can modify that file to adjust the output information.
Others
--------------
[email protected]
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
37 changes: 16 additions & 21 deletions simulator/core/job.py → core/jobs/job.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,10 @@
'''
JOB status:
ADDED: add job into JOBS
EVENT: init job events into event list
PENDING:
RUNNING: running job
END: completed
ERROR
'''
import numpy
import math
from core import util
from core import models
from model import model_factory
import csv
import time
import sys
import os

class Task(object):
"""NOTE:
Expand Down Expand Up @@ -51,16 +40,23 @@ def execute(self):

class Job(object):
"""
NOTE:
NOTE:
Assumption:
1. each GPU is a worker, in reality, this could be different.
2.if number of gpu required by a job is less than 1,
assume only 1 gpu, no worker , no ps.
3. if number of gpu required by a job is greater than 1,
assumed ps is the mod of num_gpu_p_node,
if less than 4, then it is between model replica, no need ps.
4. assume each task (ps, workers) have same amount of cpu.
5. assume each task (ps, workers) have same amount of mem.
1. each GPU is a worker, in reality, this could be different.
2. all job is a parameter server approach.
3. if number of gpu required by a job is less than 1,
assume only 1 gpu, no worker , no ps.
4. if number of gpu required by a job is greater than 1,
assumed ps is the mod of num_gpu_p_node,
if less than 4, then it is between model replica, no need ps.
5. assume each task (ps, workers) have same amount of cpu.
6. assume each task (ps, workers) have same amount of mem.
TODO:
#http://arxiv.org/abs/1807.11205
1. All reduce jobs
#http://arxiv.org/abs/1712.01887
2. Maybe Deep Gradient Compression (DGC)
"""
def __init__(self,
job_id,
Expand All @@ -80,7 +76,6 @@ def __init__(self,
self.submit_time = int(submit_time)
self.pending_time = 0.0
self.model = model
# TODO: Network problem
self.model_size = model_factory.model_sizes[model]
self.migration_count = 0
self.ps_count = gpu // 4 if gpu > 1 else 0
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import os
import csv
from core import util
from core import job
from core.jobs import job

class JobQueueManager(object):
"""A job queue object
that host all the jobs instead of wacky list, or dictionaries"""
"""
A job queue object
that host all the jobs instead of wacky lists, or dictionaries"""
def __init__(self, flags, file_path=None, num_queue=1):
self.flags = flags
if file_path is None:
Expand Down Expand Up @@ -80,19 +81,19 @@ def total_jobs(self, delta_time=-1):
# num += len(q)
return num

def _add(self, queue_idx, job):
def _add(self, queue_idx, new_job):
if self._can_add(queue_idx):
self.queues[queue_idx].append(job)
self.queues[queue_idx].append(new_job)
else:
raise ArithmeticError()

def _add_to_job_queue(self, job, queue_idx=None):
def _add_to_job_queue(self, new_job, queue_idx=None):
"""Args:
queue_idx: if specified, added to specific queue"""
if queue_idx is not None:
self._add(queue_idx, job)
self._add(queue_idx, new_job)
else:
self._add(0, job)
self._add(0, new_job)

def _setup(self):
self.parse_job_file()
Expand Down
2 changes: 2 additions & 0 deletions simulator/core/jobs_manager.py → core/jobs/jobs_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
class JobsManager(object):
"""
This acts like the Application/Framework master
NOTE: All jobs right now follows parameter server frameworks.
TODO: Allreduce NCCL jobs
"""
def __init__(self, job_queue_manager):
self.job_queue_manager = job_queue_manager
Expand Down
File renamed without changes.
File renamed without changes.
Empty file added core/network/__init__.py
Empty file.
Loading

0 comments on commit 90f6660

Please sign in to comment.