diff --git a/README.md b/README.md
index b4f551b..c8c21d1 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
# Hadoop and Yarn Setup
-## 1. set passwordless login
+### Set passwordless login
To create user
```
@@ -20,289 +20,78 @@ For other hosts
ssh-copy-id -i ~/.ssh/id_rsa.pub user@host
ssh user@host
```
-## 2. Download and install hadoop
-http://hadoop.apache.org/releases.html#Download
+### Pre-requisites:
+1. JAVA Setup should be completed and JAVA_HOME should be set in the ~/.bashrc file (environment variable).
+2. Make sure the nodes are set up for password-less SSH both ways (master->slaves).
+3. Since we use the environment variables a lot in our scripts, make sure to comment out the portion following this statement in your ~/.bashrc,
+`If not running interactively, don't do anything`. Update .bashrc
-```
-#Choose the right mirror, below link is for US machines.
-wget http://www-us.apache.org/dist/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz
-tar xf hadoop-2.7.3.tar.gz --gzip
-export HADOOP_HOME=$HOME/hadoop-2.7.3
-```
-
-## 3. Update slaves file
-
-Add data nodes, don't add master node.
-```bash
-vi $HADOOP_HOME/etc/hadoop/slaves
-user@host1
-user@host2
-```
-
-## 4. Hadoop utils setup
-```
-git clone https://github.com/kmadhugit/hadoop-cluster-utils.git
-cd hadoop-cluster-utils
-vi add-this-to-dot-profile.sh #update correct path to env variables.
-. add-this-to-dot-profile.sh
-```
-
-check whether cluster scripts are working
-
-```
-AN hostname
-```
-
-Update .bashrc
-
- 1. Delete the following check.
+ Delete/comment the following check.
```
# If not running interactively, don't do anything
-case $- in
- *i*) ;;
- *) return;;
-esac
+ case $- in
+ *i*) ;;
+ *) return;;
+ esac
```
-
- 2. Read add-this-to-dot-profile.sh at the end of .bashrc
+4. Install curl `sudo apt-get install curl` and install wget `sudo apt-get install wget`.
+5. The same username/user account should be used on the `master` and `slave` nodes for a multinode installation.
- ```
- vi $HOME/.bashrc
- Gi
- :r $HOME/hadoop-cluster-utils/add-this-to-dot-profile.sh
- G
- set -o vi
- ```
-
- 3. copy .bashrc to all other data nodes
-
- ```
- CP $HOME/.bashrc $HOME
- ```
-
-
-## 5. Install Hadoop on all nodes
-```
-CP $HOME/hadoop-2.7.3.tar.gz $HOME
-DN "tar xf hadoop-2.7.3.tar.gz --gzip"
-```
-
-## 6. HDFS configuration
-
-You need to modify 2 config files for HDFS
+### Installations:
-1. core-site.xml #Modify the Hostname for the Name node
- ```
- cd $HOME/hadoop-cluster-utils/conf
- cp core-site.xml.template core-site.xml
- vi core-site.xml
- cp core-site.xml $HADOOP_HOME/etc/hadoop
- CP core-site.xml $HADOOP_HOME/etc/hadoop
- ```
-
-2. hdfs-site.xml
+* To automate the hadoop installation, follow these steps:
- create local dir in name node for meta-data (
+ ```bash
+ git clone https://github.com/kmadhugit/hadoop-cluster-utils.git
- ``` mkdir -p /data/user/hdfs-meta-data ```
+ cd hadoop-cluster-utils
+ ```
- create local dir in all data-nodes for hdfs-data
+* Configuration
+
+ 1. To configure `hadoop-cluster-utils`, run `./autogen.sh` which will create `config.sh` with appropriate field values.
+ 2. User can enter SLAVEIPs (if more than one, use comma-separated values) interactively while running the `./autogen.sh` file.
+ 3. Default `Spark-2.0.1` and `Hadoop-2.7.1` versions are available for installation.
+ 4. User can edit default port values, `spark` and `hadoop` versions in config.sh
+ 5. Before executing `./setup.sh` file, user can verify or edit `config.sh`
+ 6. Once the setup script has completed, source the `~/.bashrc` file.
+
+* Ensure that the following java process is running in master. If not, check the log files
- ``` DN "mkdir -p /data/user/hdfs-data" ```
-
- update dir path
- ```
- cd $HOME/hadoop-cluster-utils/conf
- cp hdfs-site.xml.template hdfs-site.xml
- vi hdfs-site.xml #update dir path
+ ```bash
+ checkall.sh
```
- Copy the files to all nodes
- ```
- cp hdfs-site.xml $HADOOP_HOME/etc/hadoop
- CP hdfs-site.xml $HADOOP_HOME/etc/hadoop
- ```
-
-3. Start HDFS as fresh FS
-
- ```
-$HADOOP_PREFIX/bin/hdfs namenode -format mycluster
-start-hdfs.sh
-AN jps
-# use stop-hdfs.sh for stopping
- ```
+ Invoke `checkall.sh` to ensure all services are started on the Master & slaves
-4. Start HDFS on existing cluster data
- You need to modify ownership to self to use already created data
-
- ```
- AN "sudo chown user:user /data/hdfs-meta-data"
- AN "sudo chown user:user /data/hdfs-data"
- start-hdfs.sh
- AN jps
- ```
-
- Ensure that the following java process is running in master. If not, check the log files
-
```
NameNode
- ```
- Ensure that the following java process is running in slaves. If not, check the log files
- ```
- DataNode
- ```
-
-5. HDFS web address
-
- ```
- http://localhost:50070
- ```
-
-## 7. Yarn configuration
-
-You need to modify 2 config files for HDFS
-
-1. capacity-scheduler.xml #Modify resource-calculator property to DominantResourceCalculator
-
- ```bash
- vi $HADOOP_HOME/etc/hadoop/capacity-scheduler.xml
- ```
- ```xml
-
- yarn.scheduler.capacity.resource-calculator
- org.apache.hadoop.yarn.util.resource.DominantResourceCalculator
-
- ```
-2. yarn-site.xml # Modify the properties as per the description provided in the template
-
- ```
- cd $HOME/hadoop-cluster-utils/conf
- cp yarn-site.xml.template yarn-site.xml
- vi yarn-site.xml
- cp yarn-site.xml $HADOOP_HOME/etc/hadoop
- CP yarn-site.xml $HADOOP_HOME/etc/hadoop
- AN jps
- ```
-
- Ensure that the following java process is started in master. If not, check the log files
-
- ```
JobHistoryServer
ResourceManager
```
- Ensure that the following java process is started in slaves. If not, check the log files
+ Ensure that the following java process is running in slaves. If not, check the hadoop log files
```
+ DataNode
NodeManager
```
-
-3. Start Yarn
- ```
- start-yarn.sh
- AN jps
- ```
-
-3. Resource Manager and Node Manager web Address
- ```
- Resource Manager : http://localhost:8088/cluster
- Node Manager : http://datanode:8042/node (For each node)
- ```
-
-## 8. Useful scripts
-
- ```
- > stop-all.sh #stop HDFS and Yarn
- > start-all.sh #start HDFS and Yarn
- > CP #Copy file from name nodes to all slaves
- > AN #execute a given command in all nodes including master
- > DN #execute a given command in all nodes excluding master
- ```
-
-## 9. Spark Installation.
-
-### a. Download Binary
-
-```
-http://spark.apache.org/downloads.html
-#Choose the right mirror, below link is for US machines.
-wget http://www-us.apache.org/dist/spark/spark-2.0.1/spark-2.0.1-bin-hadoop2.7.tgz
-tar -zvf spark-2.0.1-bin-hadoop2.7.tgz
-```
-
-### b. Build it yourself
-
-```
-git clone https://github.com/apache/spark.git
-git checkout -b v2.0.1 v2.0.1
-export MAVEN_OPTS="-Xmx32G -XX:MaxPermSize=8G -XX:ReservedCodeCacheSize=2G"
-./build/mvn -T40 -Pyarn -Phadoop-2.7 -Dhadoop.version=2.7.3 -Phive -Phive-thriftserver -DskipTests -Dmaven.javadoc.skip=true install
-```
-
-### c. Test (pre-built spark version)
-```
-#Add in ~/.bashrc
-export SPARK_HOME=$HOME/spark-2.0.1-bin-hadoop2.7
-
-. ~/.bashrc
-
-${SPARK_HOME}bin/spark-submit --class org.apache.spark.examples.SparkPi --master yarn-client --driver-memory 1024M --num-executors 2 --executor-memory 1g --executor-cores 1 ${SPARK_HOME}/examples/jars/spark-examples_2.11-2.0.1.jar 10
-```
-
-### d. Test (manual spark build)
-
-```
-#Add in ~/.bashrc
-export SPARK_HOME=$HOME/spark
-
-. ~/.bashrc
-
-$SPARK_HOME/bin/spark-submit --class org.apache.spark.examples.SparkPi --master yarn-client --driver-memory 1024M --num-executors 2 --executor-memory 1g --executor-cores 1 /home/testuser/spark/examples/target/scala-2.11/jars/spark-examples_2.11-2.0.1.jar
-
-```
-
-### e. Enable EventLogging & additional settings by adding the following content to $SPARK_HOME/conf/spark-defaults.conf
-```
-spark.eventLog.enabled true
-spark.eventLog.dir /tmp/spark-events
-spark.eventLog.compress true
-spark.history.fs.logDirectory /tmp/spark-events
-spark.serializer org.apache.spark.serializer.KryoSerializer
-```
-
-### f. Start/Stop All Services.
-
- The below scripts are used to start/stop the following services in an automated way,
-
- - namenode daemon (only on hdfs master)
- - datanode daemon (on all slave nodes)
- - resource manager daemon (only on yarn master)
- - node manager daemon (on all slave nodes)
- - job history server (only on yarn master)
- - Spark history server (on yarn master)
-
-```
- # Start
- start-all.sh
+* HDFS, Resource Manager, Node Manager and Spark web Address
+
+ ```
+ HDFS web address : http://localhost:50070
+ Resource Manager : http://localhost:8088/cluster
+ Node Manager : http://datanode:8042/node (For each node)
+ Spark : http://localhost:8080 (Default)
+ ```
- # Stop
+* Useful scripts
- stop-all.sh
-=======
-```
-
-## 10. Spark command line options for Yarn Scheduler.
-
-
-| Option | Description |
-|--------|-------------|
-| --num-executors | Total number of executor JVMs to spawn across Yarn Cluster |
-| --executor-cores | Total number of cores in each executor JVM |
-| --executor-memory | Memory to be allocated for each JVM 1024M/1G|
-| --driver-memory | Memory to be allocated for driver JVM |
-| --driver-cores | Total number of vcores for driver JVM |
-| | Total vcores = num-executors * executor-vcores + driver-cores |
-| | Total Memory = num-executors * executor-memory + driver-memory |
-|--driver-java-options | To pass driver JVM, useful in local mode for profiling |
-
------------------------------------------------------------------
+ ```
+ > stop-all.sh #stop HDFS and Yarn
+ > start-all.sh #start HDFS and Yarn
+ > CP #Copy file from name nodes to all slaves
+ > AN #execute a given command in all nodes including master
+ > DN #execute a given command in all nodes excluding master
+ > checkall.sh #ensure all services are started on the Master & slaves
+ ```
diff --git a/add-this-to-dot-profile.sh b/add-this-to-dot-profile.sh
deleted file mode 100644
index 71402e2..0000000
--- a/add-this-to-dot-profile.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-
-export PATH=$HOME/hadoop-cluster-utils/utils:$HOME/hadoop-cluster-utils/hadoop:$PATH
-
-export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-ppc64el
-export HADOOP_HOME=$HOME/hadoop-2.7.3
-
-
-export HADOOP_PREFIX=$HADOOP_HOME
-export HADOOP_MAPRED_HOME=$HADOOP_HOME
-export HADOOP_COMMON_HOME=$HADOOP_HOME
-export HADOOP_HDFS_HOME=$HADOOP_HOME
-export YARN_HOME=$HADOOP_HOME
-export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
-export YARN_CONF_DIR=$HADOOP_HOME/etc/hadoop
-
-export PATH=${HADOOP_HOME}/bin:$PATH
-set -o vi
-
-# Some convenient aliases and functions for running Hadoop-related commands
-unalias fs &> /dev/null
-alias fs="hadoop fs"
-unalias hls &> /dev/null
-alias hls="fs -ls"
-
diff --git a/autogen.sh b/autogen.sh
new file mode 100755
index 0000000..2519811
--- /dev/null
+++ b/autogen.sh
@@ -0,0 +1,89 @@
+#!/bin/bash -l
+
+
+# Creating new config.sh
+echo -en '# Default hdfs configuration properties\n' > config.sh
+echo -en 'HADOOP_TMP_DIR=/tmp/'"${USER}"'/app-hadoop\n' >> config.sh
+echo -en 'REPLICATION_VALUE=3\n' >> config.sh
+echo -en 'NAMENODE_DIR=/tmp/'"${USER}"'/hdfs-meta\n' >> config.sh
+echo -en 'DATANODE_DIR=/tmp/'"${USER}"'/hdfs-data\n\n' >> config.sh
+
+echo -en '# Master Details\n' >> config.sh
+MASTER=`host $HOSTNAME | cut -f4 -d " "`
+echo -en 'MASTER='$MASTER'\n\n' >> config.sh
+
+echo -en 'Please enter slave IP detail in format slave1IP,slave2IP \n'
+read SLAVEIP
+
+echo -en '# Using these format to save SLAVE Details: slave1IP,slave1cpu,slave1memory....\n' >> config.sh
+echo -e
+
+j=0
+for i in `echo $SLAVEIP |tr ',' ' '`
+do
+echo -en 'Collecting memory details from SLAVE machine '$i' \n'
+freememory=$(ssh $i free -m | awk '{print $4}'| head -2 | tail -1)
+memorypercent=$(awk "BEGIN { pc=80*$freememory/100; i=int(pc); print (pc-i<0.5)?i:i+1 }")
+ncpu=$(ssh $i nproc --all)
+if [ $j -eq 0 ]
+then
+SLAVE=`echo ''$i','$ncpu','$memorypercent''`
+else
+SLAVE=`echo ''$SLAVE'%'$i','$ncpu','$memorypercent''`
+fi
+((j=j+1))
+done
+
+echo -en 'SLAVES='$SLAVE'\n\n' >> config.sh
+
+echo -en '#Node Manager properties (Default yarn cpu and memory value for all nodes)\n' >> config.sh
+echo -en 'YARN_SCHEDULER_MIN_ALLOCATION_MB=128\n' >> config.sh
+echo -en 'YARN_SCHEDULER_MIN_ALLOCATION_VCORES=1\n\n' >> config.sh
+echo -e
+echo -en 'Default Spark version : 2.0.1\n'
+sparkver="2.0.1"
+echo -en 'Default hadoop version : 2.7.1\n'
+hadoopver="2.7.1"
+
+echo -en '#Hadoop and Spark versions and setup zip download urls\n' >> config.sh
+echo -e
+echo -en 'sparkver='"$sparkver"'\n' >> config.sh
+echo -en 'hadoopver='"$hadoopver"'\n\n' >> config.sh
+
+HADOOP_URL="http://www-us.apache.org/dist/hadoop/common/hadoop-${hadoopver}/hadoop-${hadoopver}.tar.gz"
+SPARK_URL="http://www-us.apache.org/dist/spark/spark-${sparkver}/spark-${sparkver}-bin-hadoop${hadoopver:0:3}.tgz"
+
+echo -en 'SPARK_URL='$SPARK_URL'\n' >> config.sh
+echo -en 'HADOOP_URL='$HADOOP_URL'\n\n' >> config.sh
+
+
+echo -en '# Default port values\n' >> config.sh
+
+echo -en 'NAMENODE_PORT=9000\n' >> config.sh
+echo -en 'NAMENODE_HTTP_ADDRESS=50070\n' >> config.sh
+echo -en 'NAMENODE_SECONDARY_HTTP_ADDRESS=50090\n' >> config.sh
+echo -en 'NAMENODE_SECONDARY_HTTPS_ADDRESS=50091\n\n' >> config.sh
+
+echo -en 'DATANODE_ADDRESS=50010\n' >> config.sh
+echo -en 'DATANODE_HTTP_ADDRESS=50075\n' >> config.sh
+echo -en 'DATANODE_IPC_ADDRESS=50020\n\n' >> config.sh
+
+echo -en 'MAPREDUCE_JOBHISTORY_ADDRESS=10020\n' >> config.sh
+echo -en 'MAPREDUCE_JOBHISTORY_ADMIN_ADDRESS=10039\n' >> config.sh
+echo -en 'MAPREDUCE_JOBHISTORY_WEBAPP_ADDRESS=19883\n\n' >> config.sh
+
+echo -en 'RESOURCEMANAGER_SCHEDULER_ADDRESS=8034\n' >> config.sh
+echo -en 'RESOURCEMANAGER_RESOURCE_TRACKER_ADDRESS=8039\n' >> config.sh
+echo -en 'RESOURCEMANAGER_ADDRESS=8038\n' >> config.sh
+echo -en 'RESOURCEMANAGER_ADMIN_ADDRESS=8033\n' >> config.sh
+echo -en 'RESOURCEMANAGER_WEBAPP_ADDRESS=8089\n\n' >> config.sh
+
+echo -en 'NODEMANAGER_LOCALIZER_ADDRESS=8043\n' >> config.sh
+echo -en 'NODEMANAGER_WEBAPP_ADDRESS=8045\n\n' >> config.sh
+echo -en 'SPARKHISTORY_HTTP_ADDRESS=18080\n\n' >> config.sh
+
+echo -e 'Please check configuration (config.sh file) once before run (setup.sh file).'
+echo -e 'You can modify hadoop or spark versions in config.sh file'
+echo -e
+chmod +x config.sh
+
diff --git a/conf/core-site.xml.template b/conf/core-site.xml.template
index 64262ec..af9ffb2 100644
--- a/conf/core-site.xml.template
+++ b/conf/core-site.xml.template
@@ -17,13 +17,16 @@
-
- fs.defaultFS
- hdfs://namenode
- Name node URL
-
-
- hadoop.tmp.dir
- file:/spark1/data/baidu/tmp
-
-
+
+
+ hadoop.tmp.dir
+ HADOOP_TMP_DIR
+
+
+
+ fs.defaultFS
+ hdfs://MASTER:NAMENODE_PORT
+ Name node URL
+
+
+
\ No newline at end of file
diff --git a/conf/hdfs-site.xml.template b/conf/hdfs-site.xml.template
index 70bdf64..7281241 100644
--- a/conf/hdfs-site.xml.template
+++ b/conf/hdfs-site.xml.template
@@ -18,21 +18,59 @@
-
-dfs.replication
-3
-
-
-
-dfs.namenode.name.dir
-file:/data/madhu/hdfs-meta-data
-Meta data dir - can be RAM FS only on Namename
-
-
-
-dfs.datanode.data.dir
-file:/data/madhu/hdfs-data
-Data dir - on all data nodes
-
+
+ dfs.replication
+ REPLICATION_VALUE
+
+
+
+
+ dfs.namenode.name.dir
+ NAMENODE_DIR
+ Meta data dir - can be RAM FS only on Namename
+
+
+
+ dfs.namenode.http-address
+ 0.0.0.0:NAMENODE_HTTP_ADDRESS
+ The address and the base port where the dfs namenode web ui will listen on.
+
+
+
+ dfs.namenode.secondary.http-address
+ 0.0.0.0:NAMENODE_SECONDARY_HTTP_ADDRESS
+ The secondary namenode http server address and port.
+
+
+
+ dfs.namenode.secondary.https-address
+ 0.0.0.0:NAMENODE_SECONDARY_HTTPS_ADDRESS
+ The secondary namenode HTTPS server address and port.
+
+
+
+
+ dfs.datanode.data.dir
+ DATANODE_DIR
+ Data dir - on all data nodes
+
+
+
+ dfs.datanode.address
+ 0.0.0.0:DATANODE_ADDRESS
+ The datanode server address and port for data transfer.
+
+
+
+ dfs.datanode.http.address
+ 0.0.0.0:DATANODE_HTTP_ADDRESS
+ The datanode http server address and port.
+
+
+
+ dfs.datanode.ipc.address
+ 0.0.0.0:DATANODE_IPC_ADDRESS
+ The datanode ipc server address and port.
+
diff --git a/conf/mapred-site.xml.template b/conf/mapred-site.xml.template
new file mode 100644
index 0000000..1bf8576
--- /dev/null
+++ b/conf/mapred-site.xml.template
@@ -0,0 +1,39 @@
+
+
+
+
+
+
+
+
+
+ mapreduce.jobhistory.address
+ 0.0.0.0:MAPREDUCE_JOBHISTORY_ADDRESS
+ MapReduce JobHistory Server IPC host:port
+
+
+
+ mapreduce.jobhistory.admin.address
+ 0.0.0.0:MAPREDUCE_JOBHISTORY_ADMIN_ADDRESS
+ The address of the History server admin interface.
+
+
+
+ mapreduce.jobhistory.webapp.address
+ 0.0.0.0:MAPREDUCE_JOBHISTORY_WEBAPP_ADDRESS
+ MapReduce JobHistory Server Web UI host:port
+
+
+
diff --git a/conf/yarn-site.xml.template b/conf/yarn-site.xml.template
index 37f6325..2bf8c80 100644
--- a/conf/yarn-site.xml.template
+++ b/conf/yarn-site.xml.template
@@ -16,55 +16,98 @@
-
- yarn.resourcemanager.hostname
- n001
-
-
- yarn.resourcemanager.webapp.address
- 0.0.0.0:8088
-
+
+ yarn.resourcemanager.hostname
+ MASTER
+
+
+
+ yarn.resourcemanager.scheduler.address
+ 0.0.0.0:RESOURCEMANAGER_SCHEDULER_ADDRESS
+ The address of the scheduler interface.
+
+
+
+ yarn.resourcemanager.resource-tracker.address
+ 0.0.0.0:RESOURCEMANAGER_RESOURCE_TRACKER_ADDRESS
+
+
+
+ yarn.resourcemanager.address
+ 0.0.0.0:RESOURCEMANAGER_ADDRESS
+ The address of the RM web application.
+
+
+
+ yarn.resourcemanager.admin.address
+ 0.0.0.0:RESOURCEMANAGER_ADMIN_ADDRESS
+ The address of the RM admin interface.
+
+
+
+ yarn.resourcemanager.webapp.address
+ 0.0.0.0:RESOURCEMANAGER_WEBAPP_ADDRESS
+ The address of the applications manager interface in the RM.
+
-
- yarn.scheduler.minimum-allocation-mb
- 128
- Min value for --executor-memory
-
-
- yarn.scheduler.maximum-allocation-mb
- 204800
- Max value for --executor-memory
-
-
- yarn.scheduler.minimum-allocation-vcores
- 1
- Min value for —executor-vcore
-
-
- yarn.scheduler.maximum-allocation-vcores
- 40
- Max value for —executor-vcore
-
+
+ yarn.scheduler.minimum-allocation-mb
+ YARN_SCHEDULER_MIN_ALLOCATION_MB
+ Min value for --executor-memory
+
+
+
+ yarn.scheduler.maximum-allocation-mb
+ YARN_SCHEDULER_MAX_ALLOCATION_MB
+ Max value for --executor-memory
+
+
+
+ yarn.scheduler.minimum-allocation-vcores
+ YARN_SCHEDULER_MIN_ALLOCATION_VCORES
+ Min value for —executor-vcore
+
+
+
+ yarn.scheduler.maximum-allocation-vcores
+ YARN_SCHEDULER_MAX_ALLOCATION_VCORES
+ Max value for —executor-vcore
+
-
- yarn.nodemanager.resource.cpu-vcores
- 160
- Vcore capacity of this node
-
-
- yarn.nodemanager.resource.memory-mb
- 204800
- Memory Capacity of this node
-
-
- yarn.nodemanager.vmem-check-enabled
- false
-
-
- yarn.nodemanager.pmem-check-enabled
- false
-
+
+ yarn.nodemanager.resource.cpu-vcores
+ YARN_NODEMANAGER_RESOURCE_CPU_VCORES
+ Vcore capacity of this node
+
+
+
+ yarn.nodemanager.resource.memory-mb
+ YARN_NODEMANAGER_RESOURCE_MEMORY_MB
+ Memory Capacity of this node
+
+
+
+ yarn.nodemanager.vmem-check-enabled
+ false
+
+
+
+ yarn.nodemanager.pmem-check-enabled
+ false
+
+
+
+ yarn.nodemanager.localizer.address
+ 0.0.0.0:NODEMANAGER_LOCALIZER_ADDRESS
+ Address where the localizer IPC is.
+
+
+
+ yarn.nodemanager.webapp.address
+ 0.0.0.0:NODEMANAGER_WEBAPP_ADDRESS
+ NM Webapp address.
+
-
+
\ No newline at end of file
diff --git a/hadoop/start-all.sh b/hadoop/start-all.sh
index f9d66a5..c146d6f 100755
--- a/hadoop/start-all.sh
+++ b/hadoop/start-all.sh
@@ -6,3 +6,6 @@ $HADOOP_HOME/sbin/yarn-daemons.sh start nodemanager
$HADOOP_HOME/sbin/mr-jobhistory-daemon.sh start historyserver
$SPARK_HOME/sbin/start-history-server.sh
+
+DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )"
+${DIR}/../utils/checkall.sh
\ No newline at end of file
diff --git a/setup.sh b/setup.sh
new file mode 100755
index 0000000..dc021a1
--- /dev/null
+++ b/setup.sh
@@ -0,0 +1,434 @@
+#!/bin/bash -l
+
+# Need to create user manually
+# Need to set JAVA_HOME in .bashrc files on all machines
+# Need to complete ssh setup for all servers
+
+ul=`tput smul`
+nul=`tput rmul`
+
+CURDIR=`pwd` # Inside hadoop-cluster-utils directory where run.sh is exist
+WORKDIR=${HOME} # where hadoop and spark package will download
+
+current_time=$(date +"%Y.%m.%d.%S")
+
+if [ ! -d $CURDIR/logs ];
+then
+ mkdir logs
+fi
+
+log=`pwd`/logs/hadoop_cluster_utils_$current_time.log
+echo -e | tee -a $log
+if [[ -n "$JAVA_HOME" ]] && [[ -x "$JAVA_HOME/bin/java" ]]
+then
+ echo JAVA_HOME found on MASTER, java executable in $JAVA_HOME | tee $log
+ echo "---------------------------------------------" | tee -a $log
+else
+ echo "JAVA_HOME not found in your environment, please set the JAVA_HOME variable in your environment then continue to run this script." | tee -a $log
+ exit 1
+fi
+
+grep '#case $- in' $HOME/.bashrc &>>/dev/null
+ if [ $? -ne 0 ]
+then
+ echo 'Prerequisite not completed. Please comment below lines in .bashrc file' | tee -a $log
+ echo "# If not running interactively, don't do anything" | tee -a $log
+ echo "case \$- in" | tee -a $log
+ echo "*i*) ;;" | tee -a $log
+ echo "*) return;;" | tee -a $log
+ echo "esac" | tee -a $log
+ exit 1
+fi
+
+## Validation for config file
+
+if [ -f ${CURDIR}/config.sh ];
+then
+ ## First time permission set for config.sh file
+ chmod +x config.sh
+ source config.sh
+
+ ## Checking config file for all required fields
+
+ { cat ${CURDIR}/config.sh; echo; } | while read -r line; do
+ if [[ $line =~ "=" ]] ;
+ then
+ confvalue=`echo $line |grep = | cut -d "=" -f2`
+ if [[ -z "$confvalue" ]];
+ then
+ echo "Configuration vlaue not set properly for $line, please check config.sh file" | tee -a $log
+ exit 1
+ fi
+ fi
+ done
+
+ ## Validation for hadoop port instances
+
+ declare -a port_name=("NAMENODE_PORT" "NAMENODE_HTTP_ADDRESS" "NAMENODE_SECONDARY_HTTP_ADDRESS" "NAMENODE_SECONDARY_HTTPS_ADDRESS" "DATANODE_ADDRESS" "DATANODE_HTTP_ADDRESS" "DATANODE_IPC_ADDRESS" "MAPREDUCE_JOBHISTORY_ADDRESS" "MAPREDUCE_JOBHISTORY_ADMIN_ADDRESS" "MAPREDUCE_JOBHISTORY_WEBAPP_ADDRESS" "RESOURCEMANAGER_SCHEDULER_ADDRESS" "RESOURCEMANAGER_RESOURCE_TRACKER_ADDRESS" "RESOURCEMANAGER_ADDRESS" "RESOURCEMANAGER_ADMIN_ADDRESS" "RESOURCEMANAGER_WEBAPP_ADDRESS" "NODEMANAGER_LOCALIZER_ADDRESS" "NODEMANAGER_WEBAPP_ADDRESS" "SPARKHISTORY_HTTP_ADDRESS")
+
+ declare -a port_list=("$NAMENODE_PORT" "$NAMENODE_HTTP_ADDRESS" "$NAMENODE_SECONDARY_HTTP_ADDRESS" "$NAMENODE_SECONDARY_HTTPS_ADDRESS" "$DATANODE_ADDRESS" "$DATANODE_HTTP_ADDRESS" "$DATANODE_IPC_ADDRESS" "$MAPREDUCE_JOBHISTORY_ADDRESS" "$MAPREDUCE_JOBHISTORY_ADMIN_ADDRESS" "$MAPREDUCE_JOBHISTORY_WEBAPP_ADDRESS" "$RESOURCEMANAGER_SCHEDULER_ADDRESS" "$RESOURCEMANAGER_RESOURCE_TRACKER_ADDRESS" "$RESOURCEMANAGER_ADDRESS" "$RESOURCEMANAGER_ADMIN_ADDRESS" "$RESOURCEMANAGER_WEBAPP_ADDRESS" "$NODEMANAGER_LOCALIZER_ADDRESS" "$NODEMANAGER_WEBAPP_ADDRESS" "$SPARKHISTORY_HTTP_ADDRESS")
+
+ i=0
+ for j in "${port_list[@]}";
+ do
+ sudo netstat -pnlt | grep $j > /dev/null
+ if [ $? -eq 0 ];
+ then
+ echo "${port_name[i]} running on port $j" >> temp
+ fi
+ i=$i+1
+ done
+
+ if [ -f temp ];
+ then
+ cat temp
+ cat temp >> $log
+ echo "Kindly kill above running instance(s) else change port number in config.sh file, then continue to run this script." | tee -a $log
+ rm temp &>/dev/null
+ exit 1
+ fi
+
+ ## Adding slave machine names to slave file
+ cat ${CURDIR}/config.sh | grep SLAVES | grep -v "^#" |cut -d "=" -f2 | tr "%" "\n" | cut -d "," -f1 >${CURDIR}/conf/slaves
+
+
+
+ SLAVES=`cat ${CURDIR}/config.sh | grep SLAVES | grep -v "^#" |cut -d "=" -f2`
+
+ cat ${CURDIR}/config.sh | grep SLAVES | grep -v "^#" | tr "%" "\n" | grep -E ''$MASTER'|'$HOSTNAME'' &>>/dev/null
+ if [ $? -eq 0 ]
+ then
+ #if master is also used as data machine
+ SERVERS=$SLAVES
+ else
+ ## Getting details for Master machine
+
+ freememory_master="$(free -m | awk '{print $4}'| head -2 | tail -1)"
+ memorypercent_master=$(awk "BEGIN { pc=80*${freememory_master}/100; i=int(pc); print (pc-i<0.5)?i:i+1 }")
+ ncpu_master="$(nproc --all)"
+ MASTER_DETAILS=''$HOSTNAME','$ncpu_master','$memorypercent_master''
+ SERVERS=`echo ''$MASTER_DETAILS'%'$SLAVES''`
+ fi
+
+ ## Validation for Slaves IPs
+ echo -e "${ul}Validation for slave IPs${nul}" | tee -a $log
+ while IFS= read -r ip; do
+ if ping -q -c2 "$ip" &>/dev/null;
+ then
+ echo "$ip is Pingable" | tee -a $log
+ else
+ echo "$ip Not Pingable" | tee -a $log
+ echo 'Please check your config.sh file. '$ip' is not pingalbe. \n' | tee -a $log
+ exit 1
+ fi
+ done <${CURDIR}/conf/slaves
+
+
+ ## Download and install hadoop For Master machine installation
+
+ echo "---------------------------------------------" | tee -a $log
+ echo "${ul}Downloading and installing hadoop...${nul}" | tee -a $log
+ echo -e | tee -a $log
+ cd ${WORKDIR}
+ if [ ! -f ${WORKDIR}/hadoop-${hadoopver}.tar.gz ];
+ then
+ if curl --output /dev/null --silent --head --fail $HADOOP_URL
+ then
+ echo 'Hadoop file Downloading on Master- '$MASTER'' | tee -a $log
+ wget $HADOOP_URL | tee -a $log
+ else
+ echo "This URL Not Exist. Please check your hadoop version then continue to run this script." | tee -a $log
+ exit 1
+ fi
+ fi
+
+
+ ## Copying hadoop tgz file , unzipping and exporting paths in the .bashrc file on all machines
+
+ for i in `echo $SERVERS |cut -d "=" -f2 | tr "%" "\n" | cut -d "," -f1`
+ do
+
+ if [ $i != $MASTER ]
+ then
+ echo 'Copying Hadoop setup file on '$i'' | tee -a $log
+ scp ${WORKDIR}/hadoop-${hadoopver}.tar.gz @$i:${WORKDIR} | tee -a $log
+ fi
+ echo 'Unzipping Hadoop setup file on '$i'' | tee -a $log
+ ssh $i "tar xf hadoop-${hadoopver}.tar.gz --gzip"
+
+ echo 'Updating hadoop variables on '$i'' | tee -a $log
+
+ export HADOOP_HOME="${WORKDIR}"/hadoop-${hadoopver}
+ echo "#StartHadoopEnv"> tmp_b
+ echo "export CURDIR="${CURDIR}"" >> tmp_b
+ echo "export PATH="${CURDIR}":"${CURDIR}"/hadoop:\$PATH" >> tmp_b
+ echo "export PATH="${CURDIR}":"${CURDIR}"/utils:\$PATH" >> tmp_b
+ echo "export HADOOP_HOME="${WORKDIR}"/hadoop-${hadoopver}" >> tmp_b
+ echo "export HADOOP_PREFIX=$HADOOP_HOME" >> tmp_b
+ echo "export HADOOP_MAPRED_HOME=$HADOOP_HOME" >> tmp_b
+ echo "export HADOOP_COMMON_HOME=$HADOOP_HOME" >> tmp_b
+ echo "export HADOOP_HDFS_HOME=$HADOOP_HOME" >> tmp_b
+ echo "export YARN_HOME=$HADOOP_HOME" >> tmp_b
+ echo "export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop" >> tmp_b
+ echo "export YARN_CONF_DIR=$HADOOP_HOME/etc/hadoop" >> tmp_b
+ echo "export PATH=$HADOOP_HOME/bin:\$PATH" >> tmp_b
+ echo "#StopHadoopEnv">> tmp_b
+
+ scp tmp_b @$i:${WORKDIR} &>>/dev/null
+
+ ssh $i "grep -q '#StartHadoopEnv' $HOME/.bashrc"
+ if [ $? -ne 0 ];
+ then
+ ssh $i "cat tmp_b>>$HOME/.bashrc"
+ ssh $i "rm tmp_b"
+ else
+ ssh $i "sed -i '/#StartHadoopEnv/,/#StopHadoopEnv/d' $HOME/.bashrc"
+ ssh $i "cat tmp_b>>$HOME/.bashrc"
+ ssh $i "rm tmp_b"
+ fi
+ echo 'Sourcing updated .bashrc file on '$i'' | tee -a $log
+ ssh $i "source ~/.bashrc" &>>/dev/null
+ echo "---------------------------------------------" | tee -a $log
+ done
+ rm -rf tmp_b
+
+
+ ## Configuration changes in hadoop-clusterfor Core-site,hdfs-site and mapred-site xml
+
+ echo 'Updating configuration properties in hadoop-cluster CURDIR for Core-site,hdfs-site and mapred-site xml ' | tee -a $log
+
+ if [ ! -f ${CURDIR}/conf/core-site.xml ];
+ then
+ #Copying xml templates for editing
+ cp ${CURDIR}/conf/core-site.xml.template ${CURDIR}/conf/core-site.xml
+ cp ${CURDIR}/conf/hdfs-site.xml.template ${CURDIR}/conf/hdfs-site.xml
+ cp ${CURDIR}/conf/mapred-site.xml.template ${CURDIR}/conf/mapred-site.xml
+
+
+ #core-site.xml configuration configuration properties
+ sed -i 's|HADOOP_TMP_DIR|'"$HADOOP_TMP_DIR"'|g' ${CURDIR}/conf/core-site.xml
+ sed -i 's|MASTER|'"$MASTER"'|g' ${CURDIR}/conf/core-site.xml
+ sed -i 's|NAMENODE_PORT|'"$NAMENODE_PORT"'|g' ${CURDIR}/conf/core-site.xml
+
+
+ # hdfs-site.xml configuration properties
+ sed -i 's|REPLICATION_VALUE|'"$REPLICATION_VALUE"'|g' ${CURDIR}/conf/hdfs-site.xml
+ sed -i 's|NAMENODE_DIR|'"$NAMENODE_DIR"'|g' ${CURDIR}/conf/hdfs-site.xml
+ sed -i 's|DATANODE_DIR|'"$DATANODE_DIR"'|g' ${CURDIR}/conf/hdfs-site.xml
+ sed -i 's|NAMENODE_HTTP_ADDRESS|'"$NAMENODE_HTTP_ADDRESS"'|g' ${CURDIR}/conf/hdfs-site.xml
+ sed -i 's|NAMENODE_SECONDARY_HTTP_ADDRESS|'"$NAMENODE_SECONDARY_HTTP_ADDRESS"'|g' ${CURDIR}/conf/hdfs-site.xml
+ sed -i 's|NAMENODE_SECONDARY_HTTPS_ADDRESS|'"$NAMENODE_SECONDARY_HTTPS_ADDRESS"'|g' ${CURDIR}/conf/hdfs-site.xml
+ sed -i 's|DATANODE_ADDRESS|'"$DATANODE_ADDRESS"'|g' ${CURDIR}/conf/hdfs-site.xml
+ sed -i 's|DATANODE_HTTP_ADDRESS|'"$DATANODE_HTTP_ADDRESS"'|g' ${CURDIR}/conf/hdfs-site.xml
+ sed -i 's|DATANODE_IPC_ADDRESS|'"$DATANODE_IPC_ADDRESS"'|g' ${CURDIR}/conf/hdfs-site.xml
+
+
+ # mapred-site.xml configuration properties
+ sed -i 's|MAPREDUCE_JOBHISTORY_ADDRESS|'"$MAPREDUCE_JOBHISTORY_ADDRESS"'|g' ${CURDIR}/conf/mapred-site.xml
+ sed -i 's|MAPREDUCE_JOBHISTORY_ADMIN_ADDRESS|'"$MAPREDUCE_JOBHISTORY_ADMIN_ADDRESS"'|g' ${CURDIR}/conf/mapred-site.xml
+ sed -i 's|MAPREDUCE_JOBHISTORY_WEBAPP_ADDRESS|'"$MAPREDUCE_JOBHISTORY_WEBAPP_ADDRESS"'|g' ${CURDIR}/conf/mapred-site.xml
+
+ fi
+
+
+ echo "---------------------------------------------" | tee -a $log
+
+ ## yarn-site.xml configuration properties and hadoop-env.sh file updates for all machines
+
+ for i in `echo $SERVERS |cut -d "=" -f2 | tr "%" "\n" `
+ do
+
+ memorypercent=`echo $i| cut -d "," -f3`
+ ncpu=`echo $i| cut -d "," -f2`
+ slaveip=`echo $i| cut -d "," -f1`
+
+ echo 'Updating configuration properties for yarn-sites and hadoop.env.sh for '$slaveip'' | tee -a $log
+
+ cp ${CURDIR}/conf/yarn-site.xml.template ${CURDIR}/conf/yarn-site.xml
+
+ sed -i 's|MASTER|'"$MASTER"'|g' ${CURDIR}/conf/yarn-site.xml
+ sed -i 's|YARN_SCHEDULER_MIN_ALLOCATION_MB|'"$YARN_SCHEDULER_MIN_ALLOCATION_MB"'|g' ${CURDIR}/conf/yarn-site.xml
+ sed -i 's|YARN_SCHEDULER_MAX_ALLOCATION_MB|'"$memorypercent"'|g' ${CURDIR}/conf/yarn-site.xml
+ sed -i 's|YARN_SCHEDULER_MIN_ALLOCATION_VCORES|'"$YARN_SCHEDULER_MIN_ALLOCATION_VCORES"'|g' ${CURDIR}/conf/yarn-site.xml
+ sed -i 's|YARN_SCHEDULER_MAX_ALLOCATION_VCORES|'"$ncpu"'|g' ${CURDIR}/conf/yarn-site.xml
+ sed -i 's|YARN_NODEMANAGER_RESOURCE_CPU_VCORES|'"$ncpu"'|g' ${CURDIR}/conf/yarn-site.xml
+ sed -i 's|YARN_NODEMANAGER_RESOURCE_MEMORY_MB|'"$memorypercent"'|g' ${CURDIR}/conf/yarn-site.xml
+ sed -i 's|RESOURCEMANAGER_SCHEDULER_ADDRESS|'"$RESOURCEMANAGER_SCHEDULER_ADDRESS"'|g' ${CURDIR}/conf/yarn-site.xml
+ sed -i 's|RESOURCEMANAGER_RESOURCE_TRACKER_ADDRESS|'"$RESOURCEMANAGER_RESOURCE_TRACKER_ADDRESS"'|g' ${CURDIR}/conf/yarn-site.xml
+ sed -i 's|RESOURCEMANAGER_ADDRESS|'"$RESOURCEMANAGER_ADDRESS"'|g' ${CURDIR}/conf/yarn-site.xml
+ sed -i 's|RESOURCEMANAGER_ADMIN_ADDRESS|'"$RESOURCEMANAGER_ADMIN_ADDRESS"'|g' ${CURDIR}/conf/yarn-site.xml
+ sed -i 's|RESOURCEMANAGER_WEBAPP_ADDRESS|'"$RESOURCEMANAGER_WEBAPP_ADDRESS"'|g' ${CURDIR}/conf/yarn-site.xml
+ sed -i 's|NODEMANAGER_LOCALIZER_ADDRESS|'"$NODEMANAGER_LOCALIZER_ADDRESS"'|g' ${CURDIR}/conf/yarn-site.xml
+ sed -i 's|NODEMANAGER_WEBAPP_ADDRESS|'"$NODEMANAGER_WEBAPP_ADDRESS"'|g' ${CURDIR}/conf/yarn-site.xml
+
+
+ scp ${CURDIR}/conf/*site.xml @$slaveip:$HADOOP_HOME/etc/hadoop | tee -a $log
+
+ ## Updating java version in hadoop-env.sh file on all machines
+
+ JAVA_HOME_SLAVE=$(ssh $slaveip 'grep JAVA_HOME ~/.bashrc | grep -v "PATH" | cut -d"=" -f2')
+ echo "sed -i 's|"\${JAVA_HOME}"|"${JAVA_HOME_SLAVE}"|g' $HADOOP_HOME/etc/hadoop/hadoop-env.sh" | ssh $slaveip bash
+ echo "---------------------------------------------" | tee -a $log
+
+ done
+ rm -rf ${CURDIR}/conf/*site.xml
+
+ ##Updating the slave file on master
+
+ cp ${CURDIR}/conf/slaves ${HADOOP_HOME}/etc/hadoop
+
+else
+ echo "Config file does not exist. Please check README.md for installation steps." | tee -a $log
+ exit 1
+fi
+
+##Spark installation: fetch the Spark tarball on the master (skipped if already cached)
+
+echo "${ul}Downloading and installing Spark...${nul}" | tee -a $log
+
+cd ${WORKDIR}
+
+if [ ! -f ${WORKDIR}/spark-${sparkver}-bin-hadoop${hadoopver:0:3}.tgz ];  # skip download when the tarball is already present
+then
+    if curl --output /dev/null --silent --head --fail $SPARK_URL  # HEAD request: verify the URL exists before committing to a download
+    then
+       echo 'SPARK file Downloading on Master - '$MASTER'' | tee -a $log
+       wget $SPARK_URL | tee -a $log  # NOTE(review): wget writes progress to stderr, so tee captures little here - confirm intent
+    else
+       echo "This URL Not Exist. Please check your spark version then continue to run this script." | tee -a $log
+       exit 1
+    fi
+echo "***********************************************"
+fi
+
+## Copy and unpack Spark on every node, then export SPARK_HOME/PATH via each node's ~/.bashrc
+
+for i in `echo $SERVERS |cut -d "=" -f2 | tr "%" "\n" | cut -d "," -f1`  # iterate over every host parsed out of $SERVERS
+do
+       if [ $i != $MASTER ]  # master already holds the tarball; copy only to slaves
+       then
+               echo 'Copying Spark setup file on '$i'' | tee -a $log
+               scp ${WORKDIR}/spark-${sparkver}-bin-hadoop${hadoopver:0:3}.tgz @$i:${WORKDIR} | tee -a $log
+       fi
+       echo 'Unzipping Spark setup file on '$i'' | tee -a $log
+       ssh $i "tar xf spark*.tgz --gzip" | tee -a $log
+
+       echo 'Updating .bashrc file on '$i' with Spark variables '
+       echo '#StartSparkEnv' >tmp_b
+       echo "export SPARK_HOME="${WORKDIR}"/spark-"${sparkver}"-bin-hadoop"${hadoopver:0:3}"" >>tmp_b
+       echo "export PATH=\$SPARK_HOME/bin:\$PATH">>tmp_b
+       echo '#StopSparkEnv'>>tmp_b
+
+       scp tmp_b @$i:${WORKDIR}&>>/dev/null
+
+       ssh $i "grep -q "SPARK_HOME" ~/.bashrc"  # NOTE(review): nested quotes happen to concatenate correctly, but are fragile
+       if [ $? -ne 0 ];
+       then
+               ssh $i "cat tmp_b>>$HOME/.bashrc"
+               ssh $i "rm tmp_b"
+       else
+               ssh $i "sed -i '/#StartSparkEnv/,/#StopSparkEnv/ d' $HOME/.bashrc"
+               ssh $i "cat tmp_b>>$HOME/.bashrc"
+               ssh $i "rm tmp_b"
+       fi
+
+       ssh $i "source $HOME/.bashrc"  # NOTE(review): sources only within this throwaway ssh shell; has no lasting effect - confirm intent
+
+done
+rm -rf tmp_b
+echo "---------------------------------------------" | tee -a $log
+
+## Build Spark's conf/slaves from the Hadoop slaves list (template's 'localhost' entry removed)
+echo 'Updating Slave file for Spark setup'| tee -a $log
+
+cp spark-${sparkver}-bin-hadoop${hadoopver:0:3}/conf/slaves.template spark-${sparkver}-bin-hadoop${hadoopver:0:3}/conf/slaves
+sed -i 's|localhost||g' spark-${sparkver}-bin-hadoop${hadoopver:0:3}/conf/slaves
+cat ${CURDIR}/conf/slaves>>spark-${sparkver}-bin-hadoop${hadoopver:0:3}/conf/slaves
+
+echo -e "Configuring Spark history server" | tee -a $log
+
+cp $SPARK_HOME/conf/spark-defaults.conf.template $SPARK_HOME/conf/spark-defaults.conf
+grep -q "#StartSparkconf" $SPARK_HOME/conf/spark-defaults.conf
+if [ $? -ne 0 ];  # both branches append identical settings; else-branch first strips the stale marker section
+then
+       echo "#StartSparkconf" >> $SPARK_HOME/conf/spark-defaults.conf
+       echo "spark.eventLog.enabled true" >> $SPARK_HOME/conf/spark-defaults.conf
+       echo "spark.eventLog.dir /tmp/spark-events" >> $SPARK_HOME/conf/spark-defaults.conf
+       echo "spark.eventLog.compress true" >> $SPARK_HOME/conf/spark-defaults.conf
+       echo "spark.history.fs.logDirectory /tmp/spark-events-history" >> $SPARK_HOME/conf/spark-defaults.conf
+       echo "#StopSparkconf">> $SPARK_HOME/conf/spark-defaults.conf
+else
+       sed -i '/#StartSparkconf/,/#StopSparkconf/ d' $SPARK_HOME/conf/spark-defaults.conf
+       echo "#StartSparkconf" >> $SPARK_HOME/conf/spark-defaults.conf
+       echo "spark.eventLog.enabled true" >> $SPARK_HOME/conf/spark-defaults.conf
+       echo "spark.eventLog.dir /tmp/spark-events" >> $SPARK_HOME/conf/spark-defaults.conf
+       echo "spark.eventLog.compress true" >> $SPARK_HOME/conf/spark-defaults.conf
+       echo "spark.history.fs.logDirectory /tmp/spark-events-history" >> $SPARK_HOME/conf/spark-defaults.conf
+       echo "#StopSparkconf">> $SPARK_HOME/conf/spark-defaults.conf
+fi
+
+CP $SPARK_HOME/conf/spark-defaults.conf $SPARK_HOME/conf &>/dev/null  # NOTE(review): 'CP' is presumably the cluster-utils copy-to-all-nodes helper; lowercase cp would be a no-op here - confirm
+
+echo -e "Spark installation done..!!\n" | tee -a $log
+
+
+source ${HOME}/.bashrc  # pick up HADOOP/SPARK variables just written to .bashrc
+
+##Create HDFS/Spark working directories on all nodes, format the namenode and start the cluster
+
+if [ ! -d "$HADOOP_TMP_DIR" ]  # first-run detection: directories missing means cluster was never initialised
+then
+    # Creating directories on every node via the cluster-utils AN (all-nodes) helper
+    AN "mkdir -p $HADOOP_TMP_DIR" &>/dev/null
+    AN "mkdir -p $DFS_NAMENODE_NAME_DIR" &>/dev/null
+    AN "mkdir -p $DFS_DATANODE_NAME_DIR" &>/dev/null  # NOTE(review): name suggests the datanode data dir; confirm variable spelling matches the conf templates
+    AN "mkdir -p /tmp/spark-events" &>/dev/null
+    AN "mkdir -p /tmp/spark-events-history" &>/dev/null
+    echo "Finished creating directories"
+fi
+
+echo 'Formatting NAMENODE'| tee -a $log
+
+$HADOOP_PREFIX/bin/hdfs namenode -format mycluster >> $log  # destructive on an existing namenode dir; relies on the first-run guard above
+echo -e | tee -a $log
+$CURDIR/hadoop/start-all.sh | tee -a $log
+echo -e | tee -a $log
+$CURDIR/utils/checkall.sh | tee -a $log
+
+## use stop-all.sh for stopping the cluster later
+
+echo -e | tee -a $log
+echo "${ul}Web URL link${nul}" | tee -a $log
+echo "HDFS web address : http://"$MASTER":"$NAMENODE_HTTP_ADDRESS"" | tee -a $log
+echo "Resource Manager : http://"$MASTER":"$RESOURCEMANAGER_WEBAPP_ADDRESS"/cluster" | tee -a $log
+echo "SPARK history server : http://"$MASTER":"$SPARKHISTORY_HTTP_ADDRESS"" | tee -a $log
+echo -e | tee -a $log
+
+echo "---------------------------------------------" | tee -a $log
+echo "${ul}Ensure SPARK running correctly using following command${nul}" | tee -a $log
+echo "${SPARK_HOME}/bin/spark-submit --class org.apache.spark.examples.SparkPi --master yarn-client --driver-memory 1024M --num-executors 2 --executor-memory 1g --executor-cores 1 ${SPARK_HOME}/examples/jars/spark-examples_2.11-2.0.1.jar 10" | tee -a $log
+echo -e
+read -p "Do you wish to run above command ? [y/N] " prompt  # interactive confirmation before the SparkPi smoke test
+
+
+if [[ $prompt == "y" || $prompt == "Y" || $prompt == "yes" || $prompt == "Yes" ]]
+then
+        ${SPARK_HOME}/bin/spark-submit --class org.apache.spark.examples.SparkPi --master yarn-client --driver-memory 1024M --num-executors 2 --executor-memory 1g --executor-cores 1 ${SPARK_HOME}/examples/jars/spark-examples_2.11-2.0.1.jar 10 &>> $log  # NOTE(review): examples jar version 2.11-2.0.1 is hard-coded and ignores $sparkver - confirm
+
+else
+        echo "Thanks for your response"
+fi
+
+echo -e | tee -a $log
+echo "---------------------------------------------" | tee -a $log
+grep -r 'Pi is roughly' ${log}  # SparkPi prints this line on success; its presence in the log proves the job ran
+if [ $? -eq 0 ];
+then
+        echo 'Spark services running.' | tee -a $log
+        echo 'Please check log file '$log' for more details.'
+
+else
+        echo 'Expected output not found.' | tee -a $log
+        echo 'Please check log file '$log' for more details'
+fi
+
diff --git a/utils/checkall.sh b/utils/checkall.sh
new file mode 100755
index 0000000..fc83068
--- /dev/null
+++ b/utils/checkall.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+# checkall.sh - verify that the expected Hadoop/Spark daemons are running
+# on the master (NameNode) and on every slave listed in the slaves file.
+
+RED='\033[0;31m'
+YEL='\033[1;33m'
+CYAN='\033[1;36m'
+GRE='\033[0;32m'
+NC='\033[0m'
+
+# check_service <jps-output> <daemon-name>
+# Sets error=1 and appends the daemon name to errmsg if it is not running.
+check_service() {
+  # -w: whole-word match, so "HistoryServer" does not also match "JobHistoryServer"
+  if ! grep -qw "$2" <<<"$1"; then
+    error=1
+    errmsg="$errmsg $2,"
+  fi
+}
+
+namenode=$(hostname)
+echo -en "Check Services on NameNode ${YEL}($namenode)${NC} .. "
+dlist=$(jps)
+error=0
+errmsg=""
+check_service "$dlist" "NameNode"
+check_service "$dlist" "ResourceManager"
+check_service "$dlist" "JobHistoryServer"
+check_service "$dlist" "HistoryServer"
+
+if [[ $error == 1 ]]; then
+  echo -e "${RED}NOT OK ${NC}"
+  echo -e "${CYAN}$errmsg${NC} not active in $namenode"
+else
+  echo -e "${GRE}OK${NC}"
+fi
+
+for userhost in $(grep -v '^#' "${HADOOP_HOME}/etc/hadoop/slaves")
+do
+  # Reset per host so one failing slave does not mark the rest as NOT OK.
+  error=0
+  errmsg=""
+  echo -en "Check Services on DataNode ${YEL}($userhost)${NC} .. "
+  dlist=$(ssh "$userhost" jps)
+  check_service "$dlist" "DataNode"
+  check_service "$dlist" "NodeManager"
+  if [[ $error == 1 ]]; then
+    echo -e "${RED}NOT OK ${NC}"
+    echo -e "${CYAN}$errmsg${NC} not active in $userhost"
+  else
+    echo -e "${GRE}OK${NC}"
+  fi
+done