Skip to content

Commit

Permalink
mod dir structure
Browse files Browse the repository at this point in the history
  • Loading branch information
matthewygf committed Apr 4, 2019
1 parent be510d0 commit 90f6660
Show file tree
Hide file tree
Showing 45 changed files with 16,315 additions and 16,630 deletions.
64 changes: 12 additions & 52 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,52 +1,12 @@
# Prerequisites
*.d

# Object files
*.o
*.ko
*.obj
*.elf

# Linker output
*.ilk
*.map
*.exp

# Precompiled Headers
*.gch
*.pch

# Libraries
*.lib
*.a
*.la
*.lo

# Shared objects (inc. Windows DLLs)
*.dll
*.so
*.so.*
*.dylib

# Executables
*.exe
*.out
*.app
*.i*86
*.x86_64
*.hex

# Debug files
*.dSYM/
*.su
*.idb
*.pdb

# Kernel Module Compile Results
*.mod*
*.cmd
.tmp_versions/
modules.order
Module.symvers
Mkfile.old
dkms.conf
*.pyc
result*
__pycache__
test*
plot*
**/.idea
.idea
log/**/*.csv
**/*.log
**/*.json
*.json
*.vscode
122 changes: 61 additions & 61 deletions simulator/60_job.csv → 60_job.csv
Original file line number Diff line number Diff line change
@@ -1,61 +1,61 @@
job_id,num_gpu,submit_time,iterations,model_name,duration,interval
0,1,0,606,vgg19,164,30
1,8,30,133,vgg11,147,23
2,2,53,157,inception4,127,26
3,1,79,513,vgg16,121,20
4,4,99,138,vgg16,121,33
5,8,132,288,inception3,138,44
6,1,176,326,resnet152,121,28
7,2,204,192,resnet152,124,23
8,2,227,235,vgg16,155,25
9,1,252,822,resnet50,126,30
10,1,282,1501,resnet50,230,29
11,1,311,1360,vgg19,368,26
12,2,337,198,vgg19,131,26
13,8,363,581,resnet50,196,38
14,1,401,789,resnet50,121,33
15,1,434,3304,alexnet,180,37
16,4,471,391,alexnet,143,33
17,2,504,193,vgg11,122,27
18,1,531,3469,alexnet,189,23
19,1,554,892,googlenet,125,29
20,1,583,2275,vgg16,536,28
21,2,611,468,alexnet,126,34
22,1,645,473,resnet101,123,35
23,8,680,168,resnet152,121,27
24,4,707,864,googlenet,121,26
25,1,733,253,inception4,121,28
26,1,761,775,vgg11,121,32
27,8,793,109,vgg19,121,34
28,1,827,1085,googlenet,152,34
29,1,861,513,vgg16,121,31
30,1,892,903,vgg11,141,26
31,1,918,864,googlenet,121,26
32,8,944,237,resnet101,125,29
33,1,973,775,vgg11,121,35
34,8,1008,2178,googlenet,305,34
35,4,1042,143,vgg11,121,25
36,4,1067,175,resnet152,123,33
37,1,1100,326,resnet152,121,39
38,2,1139,274,inception3,121,26
39,1,1165,447,vgg19,121,29
40,1,1194,2239,alexnet,122,29
41,2,1223,410,resnet50,123,25
42,1,1248,1011,resnet101,263,26
43,1,1274,438,inception4,209,41
44,1,1315,607,resnet101,158,32
45,2,1347,258,resnet101,121,34
46,1,1381,521,inception3,121,33
47,8,1414,272,alexnet,121,33
48,8,1447,147,inception4,126,24
49,1,1471,4861,resnet152,1800,28
50,4,1499,242,resnet101,126,38
51,1,1537,272,inception4,130,43
52,4,1580,141,vgg19,121,32
53,1,1612,521,inception3,121,32
54,4,1644,362,inception3,171,32
55,8,1676,126,vgg16,135,25
56,2,1701,949,googlenet,133,28
57,4,1729,386,resnet50,128,21
58,4,1750,144,inception4,122,29
59,1,1779,525,inception3,122,33
job_id,num_gpu,submit_time,iterations,model_name,duration,interval
0,1,0,606,vgg19,164,30
1,8,30,133,vgg11,147,23
2,2,53,157,inception4,127,26
3,1,79,513,vgg16,121,20
4,4,99,138,vgg16,121,33
5,8,132,288,inception3,138,44
6,1,176,326,resnet152,121,28
7,2,204,192,resnet152,124,23
8,2,227,235,vgg16,155,25
9,1,252,822,resnet50,126,30
10,1,282,1501,resnet50,230,29
11,1,311,1360,vgg19,368,26
12,2,337,198,vgg19,131,26
13,8,363,581,resnet50,196,38
14,1,401,789,resnet50,121,33
15,1,434,3304,alexnet,180,37
16,4,471,391,alexnet,143,33
17,2,504,193,vgg11,122,27
18,1,531,3469,alexnet,189,23
19,1,554,892,googlenet,125,29
20,1,583,2275,vgg16,536,28
21,2,611,468,alexnet,126,34
22,1,645,473,resnet101,123,35
23,8,680,168,resnet152,121,27
24,4,707,864,googlenet,121,26
25,1,733,253,inception4,121,28
26,1,761,775,vgg11,121,32
27,8,793,109,vgg19,121,34
28,1,827,1085,googlenet,152,34
29,1,861,513,vgg16,121,31
30,1,892,903,vgg11,141,26
31,1,918,864,googlenet,121,26
32,8,944,237,resnet101,125,29
33,1,973,775,vgg11,121,35
34,8,1008,2178,googlenet,305,34
35,4,1042,143,vgg11,121,25
36,4,1067,175,resnet152,123,33
37,1,1100,326,resnet152,121,39
38,2,1139,274,inception3,121,26
39,1,1165,447,vgg19,121,29
40,1,1194,2239,alexnet,122,29
41,2,1223,410,resnet50,123,25
42,1,1248,1011,resnet101,263,26
43,1,1274,438,inception4,209,41
44,1,1315,607,resnet101,158,32
45,2,1347,258,resnet101,121,34
46,1,1381,521,inception3,121,33
47,8,1414,272,alexnet,121,33
48,8,1447,147,inception4,126,24
49,1,1471,4861,resnet152,1800,28
50,4,1499,242,resnet101,126,38
51,1,1537,272,inception4,130,43
52,4,1580,141,vgg19,121,32
53,1,1612,521,inception3,121,32
54,4,1644,362,inception3,171,32
55,8,1676,126,vgg16,135,25
56,2,1701,949,googlenet,133,28
57,4,1729,386,resnet50,128,21
58,4,1750,144,inception4,122,29
59,1,1779,525,inception3,122,33
77 changes: 73 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,77 @@
A GPU Cluster Simulator for Distributed Deep Learning Training using Deep Reinforcement Learning
====
GPU cluster simulator for distributed deep learning training
===
**NOTE**: Currently there are a couple of assumptions:
1. Homogeneous cluster setup
2. model gradients transfer is the same as the model size saved in ckpts (model_factory)
3. Parameter Server / Worker frameworks (All-reduce not yet implemented)

**Execution**
1. Before the execution, what's needed?
1. Infrastructure details
Define the hierarchy and resource capacity of the infrastructure in ``cluster_spec.csv``. For example, we have a cluster with 4 racks (switches). Under each rack (switch), there are 32 nodes. And each node has 128 CPU cores, 256 GB memory, and 8 GPUs. Then ``cluster_spec.csv`` will look like this:
```csv
num_switch,num_node_p_switch,num_gpu_p_node,num_cpu_p_node,mem_p_node
4,32,8,128,256
```
2. Job trace
The job trace to simulate. For each job, the simulator needs the following information:
* ``job_id``: for tracking
* ``num_gpu``: gpu requirement
    * ``submit_time``: when the job is submitted. The simulator is event-based and discrete-time; time values therefore start from ``0`` and are measured in seconds.
    * ``iterations``: the number of iterations to train. Used for network-cost calculation in data-parallel jobs.
* ``model_name``: what's the model in that job. This is used to estimate GPU memory usage, and network costs.
* ``duration``: how long this job will run. This information is used to generate job completion event by the simulator.
* ``interval``: job submission interval from this job to the next job


3. How to run the simulator?
A simple example of the execution command is:
```
python execute.py
```
Inside the execute file, the following options are necessary:
* ``--cluster_spec``: infrastructure spec file
* ``--trace_file``: job trace
* ``--scheme``: **placement scheme**
* ``--schedule``: **scheduler**
What's in this repository?
-----------
Optional inputs:
* ``--print``: print debug information
* ``--log_path``: the output path of the log (cluster, job). The default will be ``time-stamp`` folder under current path
4. What are the placement and scheduling algorithms provided?
*Placement*:
* ``yarn``: get GPUs from the same server nodes under the same switch
*Scheduling*
* ``fifo``
* ``sjf``: Smallest-job-first, in terms of GPU requirement
* **TODO BELOW**
* ``lpjf``: longest pending job first
* ``shorest``: shortest remaining time job first
* ``shorest-gpu``: shortest-remaining-gputime job first
* ``dlas``: discretized LAS (just time-based)
In ``jobs.py``, you need to specify ``num_queue`` and ``queue_limit`` for ``MLFQ`` (also for ``dlas-gpu``, and ``gittins``)
```python
# Example1: there are two queues, and the threshold for Q1 is 3600 seconds
self.queue_limit = [3600]
# Example2: there are four queues, and the threshold for queues is 3600, 7200, 18000 seconds
self.queue_limit = [3600, 7200, 18000]
```
* ``dlas-gpu``: discretized LAS (gpu-time-based)
* ``gittins``: discretized Gittins Index (gpu-time-based)
5. What's the output?
Based on the ``--log_path``, all the output files are in that folder (e.g., ``result-20190210-12-20-37``), including:
1. ``cluster.csv``: cluster-level resource utilization info at each event point
2. ``jobs.csv``: the job execution information
3. ``cpu.csv``, ``gpu.csv``, ``memory.csv``, ``network.csv``: those are the utilization details of each resource unit at event points. However, those logs are not accurate under some combinations of placement and scheduler. When ``count`` is chosen, those files are not generated.
The output logs are defined in ``log.py``; You can modify that file to adjust the output information.
Others
--------------
[email protected]
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
37 changes: 16 additions & 21 deletions simulator/core/job.py → core/jobs/job.py
Original file line number Diff line number Diff line change
@@ -1,21 +1,10 @@
'''
JOB status:
ADDED: add job into JOBS
EVENT: init job events into event list
PENDING:
RUNNING: running job
END: completed
ERROR
'''
import numpy
import math
from core import util
from core import models
from model import model_factory
import csv
import time
import sys
import os

class Task(object):
"""NOTE:
Expand Down Expand Up @@ -51,16 +40,23 @@ def execute(self):

class Job(object):
"""
NOTE:
NOTE:
Assumption:
1. each GPU is a worker, in reality, this could be different.
2.if number of gpu required by a job is less than 1,
assume only 1 gpu, no worker , no ps.
3. if number of gpu required by a job is greater than 1,
assumed ps is the mod of num_gpu_p_node,
if less than 4, then it is between model replica, no need ps.
4. assume each task (ps, workers) have same amount of cpu.
5. assume each task (ps, workers) have same amount of mem.
1. each GPU is a worker, in reality, this could be different.
2. all job is a parameter server approach.
3. if number of gpu required by a job is less than 1,
assume only 1 gpu, no worker , no ps.
4. if number of gpu required by a job is greater than 1,
assumed ps is the mod of num_gpu_p_node,
if less than 4, then it is between model replica, no need ps.
5. assume each task (ps, workers) have same amount of cpu.
6. assume each task (ps, workers) have same amount of mem.
TODO:
#http://arxiv.org/abs/1807.11205
1. All reduce jobs
#http://arxiv.org/abs/1712.01887
2. Maybe Deep Gradient Compression (DGC)
"""
def __init__(self,
job_id,
Expand All @@ -80,7 +76,6 @@ def __init__(self,
self.submit_time = int(submit_time)
self.pending_time = 0.0
self.model = model
# TODO: Network problem
self.model_size = model_factory.model_sizes[model]
self.migration_count = 0
self.ps_count = gpu // 4 if gpu > 1 else 0
Expand Down
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import os
import csv
from core import util
from core import job
from core.jobs import job

class JobQueueManager(object):
"""A job queue object
that host all the jobs instead of wacky list, or dictionaries"""
"""
A job queue object
that host all the jobs instead of wacky lists, or dictionaries"""
def __init__(self, flags, file_path=None, num_queue=1):
self.flags = flags
if file_path is None:
Expand Down Expand Up @@ -80,19 +81,19 @@ def total_jobs(self, delta_time=-1):
# num += len(q)
return num

def _add(self, queue_idx, job):
def _add(self, queue_idx, new_job):
if self._can_add(queue_idx):
self.queues[queue_idx].append(job)
self.queues[queue_idx].append(new_job)
else:
raise ArithmeticError()

def _add_to_job_queue(self, job, queue_idx=None):
def _add_to_job_queue(self, new_job, queue_idx=None):
"""Args:
queue_idx: if specified, added to specific queue"""
if queue_idx is not None:
self._add(queue_idx, job)
self._add(queue_idx, new_job)
else:
self._add(0, job)
self._add(0, new_job)

def _setup(self):
self.parse_job_file()
Expand Down
2 changes: 2 additions & 0 deletions simulator/core/jobs_manager.py → core/jobs/jobs_manager.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@
class JobsManager(object):
"""
This acts like the Application/Framework master
NOTE: All jobs right now follows parameter server frameworks.
TODO: Allreduce NCCL jobs
"""
def __init__(self, job_queue_manager):
self.job_queue_manager = job_queue_manager
Expand Down
File renamed without changes.
File renamed without changes.
Empty file added core/network/__init__.py
Empty file.
Loading

0 comments on commit 90f6660

Please sign in to comment.