Skip to content

Commit

Permalink
SINGA-45 Set openblas num threads in job configuration
Browse files Browse the repository at this point in the history
test and merge to master
  • Loading branch information
wangsheng1001 committed Aug 13, 2015
2 parents 2da5e23 + 2c7edd7 commit da844af
Show file tree
Hide file tree
Showing 12 changed files with 156 additions and 104 deletions.
2 changes: 1 addition & 1 deletion bin/singa-cleanup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,5 @@ $SINGA_BIN/singa-stop.sh || exit 1

# close zookeeper
if [ $SINGA_MANAGES_ZK = true ]; then
$SINGA_BIN/zk-service.sh stop
$SINGA_BIN/zk-service.sh stop || exit 1
fi
14 changes: 6 additions & 8 deletions bin/singa-console.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,10 @@
# console to list/view/kill singa jobs
#

usage="Usage:\n
# singa-console.sh list : list running singa jobs\n
# singa-console.sh view JOB_ID : view procs of a singa job\n
# singa-console.sh kill JOB_ID : kill a singa job"
usage="Usage: singa-console.sh <command> <args>\n
list : list running singa jobs\n
view JOB_ID : view procs of a singa job\n
kill JOB_ID : kill a singa job"

if [ $# == 0 ]; then
echo -e $usage
Expand Down Expand Up @@ -59,12 +59,11 @@ case $1 in
echo -e $usage
exit 1
fi
host_file="job-$2.tmp"
./singatool view $2 1>$host_file || exit 1
hosts=`./singatool view "$2"`
[ $? == 0 ] || exit 1
ssh_options="-oStrictHostKeyChecking=no \
-oUserKnownHostsFile=/dev/null \
-oLogLevel=quiet"
hosts=`cat $host_file | cut -d ' ' -f 1`
if [ `head -1 "$SINGA_CONF"/hostfile` == localhost ]; then
local_procs=1
fi
Expand All @@ -79,7 +78,6 @@ case $1 in
$singa_kill
fi
done
rm $host_file
./singatool clean $2 || exit 1
;;

Expand Down
13 changes: 13 additions & 0 deletions bin/singa-env.sh
Original file line number Diff line number Diff line change
Expand Up @@ -24,10 +24,14 @@
# * SINGA_HOME
# * SINGA_BIN
# * SINGA_CONF
# * SINGA_LOG
# * ZK_HOME
# * SINGA_MANAGES_ZK
#

# exit if variables already set
[ -z $SINGA_ENV_DONE ] || exit 0

# set SINGA_BIN
if [ -z $SINGA_BIN ]; then
SINGA_BIN=`dirname "${BASH_SOURCE-$0}"`
Expand All @@ -44,6 +48,13 @@ if [ -z $SINGA_CONF ]; then
SINGA_CONF=$SINGA_HOME/conf
fi

# set SINGA_LOG
if [ -z $SINGA_LOG ]; then
# add -global arg, so no need to run under SINGA_HOME
SINGA_LOG=`"$SINGA_HOME"/singatool getlogdir -global="$SINGA_CONF"/singa.conf`
[ $? == 0 ] || exit 1
fi

# set ZK_HOME
if [ -z $ZK_HOME ]; then
ZK_HOME=$SINGA_HOME/thirdparty/zookeeper-3.4.6
Expand All @@ -55,3 +66,5 @@ if [ -z $SINGA_MANAGES_ZK ]; then
SINGA_MANAGES_ZK=false
fi

# mark that all environment variables have been set
SINGA_ENV_DONE=1
46 changes: 26 additions & 20 deletions bin/singa-run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,7 @@
# run a Singa job
#

usage="Usage: singa-run.sh -workspace=YOUR_WORKSPACE [ --resume ]\n
# workspace should contain job.conf\n
usage="Usage: singa-run.sh -conf=JOB_CONF [ --resume ]\n
# set --resume if want to recover a job\n
### NOTICE ###\n
# if you are using model.conf + cluster.conf,\n
Expand All @@ -33,28 +32,30 @@ usage="Usage: singa-run.sh -workspace=YOUR_WORKSPACE [ --resume ]\n

# check arguments
while [ $# != 0 ]; do
if [[ $1 == "-workspace="* ]]; then
workspace=$1
if [[ $1 == "-conf="* ]]; then
conf=$1
elif [ $1 == "--resume" ]; then
resume=1
else
echo -e $usage
exit 1
echo -e $usage && exit 1
fi
shift
done
if [ -z $workspace ]; then
if [ -z $conf ]; then
echo -e $usage
exit 1
fi

# get environment variables
. `dirname "${BASH_SOURCE-$0}"`/singa-env.sh
# get workspace path
workspace=`cd "${workspace:11}">/dev/null; pwd`
job_conf=$workspace/job.conf

# change conf to an absolute path
conf_dir=`dirname "${conf:6}"`
conf_dir=`cd "$conf_dir">/dev/null; pwd`
conf_base=`basename "${conf:6}"`
job_conf=$conf_dir/$conf_base
if [ ! -f $job_conf ]; then
echo job.conf not exists in $workspace
echo $job_conf not exists
exit 1
fi
cd $SINGA_HOME
Expand All @@ -64,20 +65,26 @@ if [ $SINGA_MANAGES_ZK = true ]; then
$SINGA_BIN/zk-service.sh start || exit 1
fi

# generate unique job id
job_id=`./singatool create`
[ $? == 0 ] || exit 1
echo Unique JOB_ID is $job_id

# generate job info dir
# format: job-JOB_ID-YYYYMMDD-HHMMSS
log_dir=$SINGA_LOG/job-info/job-$job_id-$(date '+%Y%m%d-%H%M%S');
mkdir -p $log_dir
echo Record job information to $log_dir

# generate host file
host_file=$workspace/job.hosts
host_file=$log_dir/job.hosts
python $SINGA_HOME/tool/gen_hosts.py -conf=$job_conf \
-hosts=$SINGA_CONF/hostfile \
-output=$host_file \
|| exit 1

# generate unique job id
./singatool create 1>$workspace/job.id || exit 1
job_id=`cat $workspace/job.id`
echo Generate job id to $workspace/job.id [job_id = $job_id]

# set command to run singa
singa_run="./singa -workspace=$workspace -job=$job_id"
singa_run="./singa -conf=$job_conf -job=$job_id"
if [ ! -z $resume ]; then
singa_run="$singa_run --resume"
fi
Expand All @@ -100,6 +107,5 @@ done

# generate pid list for this job
sleep 2
./singatool view $job_id 1>$workspace/job.pids || exit
echo Generate pid list to $workspace/job.pids
./singatool view $job_id 1>$log_dir/job.pids || exit 1
wait
2 changes: 1 addition & 1 deletion bin/singa-stop.sh
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ ssh_options="-oStrictHostKeyChecking=no \
-oUserKnownHostsFile=/dev/null \
-oLogLevel=quiet"
hosts=`cat $host_file | cut -d ' ' -f 1`
singa_kill="killall -s SIGKILL -r singa"
singa_kill="killall -q -s SIGKILL -r singa"
for i in ${hosts[@]}; do
echo Kill singa @ $i ...
if [ $i == localhost ]; then
Expand Down
1 change: 1 addition & 0 deletions examples/cifar10/job.conf
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
cluster {
nworker_groups: 1
nserver_groups: 1
workspace: "examples/cifar10"
}

model {
Expand Down
4 changes: 2 additions & 2 deletions include/utils/cluster_rt.h
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ class ZKService {
RTCallback *cb);

private:
const int kNumRetry = 10;
const int kNumRetry = 5;
const int kSleepSec = 1;

static void WatcherGlobal(zhandle_t* zh, int type, int state,
Expand Down Expand Up @@ -139,7 +139,7 @@ class JobManager {
JobManager(const std::string& host, int timeout);

bool Init();
int GenerateJobID();
bool GenerateJobID(int* id);
bool ListJobs(std::vector<JobInfo>* jobs);
bool ListJobProcs(int job, std::vector<std::string>* procs);
bool Clean(int job);
Expand Down
6 changes: 2 additions & 4 deletions src/main.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@

DEFINE_int32(job, -1, "Unique job ID generated from singa-run.sh");
DEFINE_bool(resume, false, "Resume from checkpoint passed at cmd line");
DEFINE_string(workspace, "./workspace", "workspace passed at cmd line");
DEFINE_string(conf, "./job.conf", "job conf passed at cmd line");

/**
* Register layers, and other customizable classes.
Expand All @@ -31,12 +31,10 @@ int main(int argc, char **argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);

singa::JobProto jobConf;
std::string job_file = FLAGS_workspace + "/job.conf";
std::string job_file = FLAGS_conf;
singa::ReadProtoFromTextFile(job_file.c_str(), &jobConf);
CHECK(jobConf.has_cluster());
CHECK(jobConf.has_model());
if (!jobConf.cluster().has_workspace())
jobConf.mutable_cluster()->set_workspace(FLAGS_workspace);

RegisterClasses();
singa::SubmitJob(FLAGS_job, FLAGS_resume, jobConf);
Expand Down
2 changes: 1 addition & 1 deletion src/proto/job.proto
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ message ClusterProto {
// port number is used by ZeroMQ
optional int32 start_port = 13 [default = 6723];
// local workspace, train/val/test shards, checkpoint files
optional string workspace = 14 [default = "workspace"];
required string workspace = 14;

// conduct updates at server side; otherwise do it at worker side
optional bool server_update = 40 [default = true];
Expand Down
4 changes: 2 additions & 2 deletions src/proto/singa.proto
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,6 @@ package singa;
message SingaProto {
// ip/hostname:port[,ip/hostname:port]
required string zookeeper_host = 1;
// if not set, use the default dir of glog
optional string log_dir = 2;
// log dir for singa binary and job information (job id, host list, pid list)
optional string log_dir = 2 [default = "/tmp/singa-log/"];
}
7 changes: 4 additions & 3 deletions src/utils/cluster_rt.cc
Original file line number Diff line number Diff line change
Expand Up @@ -297,14 +297,15 @@ bool JobManager::Init() {
return true;
}

int JobManager::GenerateJobID() {
bool JobManager::GenerateJobID(int* id) {
char buf[kZKBufSize];
string lock = kZKPathJLock + "/lock-";
if (!zk_.CreateNode(lock.c_str(), nullptr,
ZOO_EPHEMERAL | ZOO_SEQUENCE, buf)) {
return -1;
return false;
}
return atoi(buf+strlen(buf)-10);
*id = atoi(buf+strlen(buf)-10);
return true;
}

bool JobManager::ListJobProcs(int job, vector<string>* procs) {
Expand Down
Loading

0 comments on commit da844af

Please sign in to comment.