diff --git a/README.md b/README.md
index b4f551b..b62ab8e 100644
--- a/README.md
+++ b/README.md
@@ -1,308 +1,52 @@
 # Hadoop and Yarn Setup
-## 1. set passwordless login
+### Prerequisites:
+1. The Java setup should be completed and JAVA_HOME should be set as an environment variable.
+2. Make sure the nodes are set up for password-less SSH both ways (master->slaves & slaves->master).
+3. Since the scripts rely heavily on environment variables, make sure to comment out the early-return check that follows this statement in your ~/.bashrc:
+`If not running interactively, don't do anything`

-To create user
-```
-sudo adduser testuser
-sudo adduser testuser sudo
-```
+### Installation:

-For local host
+* To automate the hadoop installation, follow the steps below:

-```
-ssh-keygen -t rsa -P '' -f ~/.ssh/id_rsa
-cat ~/.ssh/id_rsa.pub >> ~/.ssh/authorized_keys
- ```
-For other hosts
-
-```
-ssh-copy-id -i ~/.ssh/id_rsa.pub user@host
-ssh user@host
-```
-## 2. Download and install hadoop
-
-http://hadoop.apache.org/releases.html#Download
-
-```
-#Choose the right mirror, below link is for US machines.
-wget http://www-us.apache.org/dist/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz
-tar xf hadoop-2.7.3.tar.gz --gzip
-export HADOOP_HOME=$HOME/hadoop-2.7.3
-```
-
-## 3. Update slaves file
-
-Add data nodes, don't add master node.
-```bash
-vi $HADOOP_HOME/etc/hadoop/slaves
-user@host1
-user@host2
-```
-
-## 4. Hadoop utils setup
-```
-git clone https://github.com/kmadhugit/hadoop-cluster-utils.git
-cd hadoop-cluster-utils
-vi add-this-to-dot-profile.sh #update correct path to env variables.
-. add-this-to-dot-profile.sh
-```
-
-check whether cluster scripts are working
-
-```
-AN hostname
-```
-
-Update .bashrc
-
- 1. Delete the following check.
- ```
- # If not running interactively, don't do anything
-case $- in
- *i*) ;;
- *) return;;
-esac
- ```
-
- 2. Read add-this-to-dot-profile.sh at the end of .bashrc
-
- ```
- vi $HOME/.bashrc
- Gi
- :r $HOME/hadoop-cluster-utils/add-this-to-dot-profile.sh
- G
- set -o vi
- ```
-
- 3. copy .bashrc to all other data nodes
-
- ```
- CP $HOME/.bashrc $HOME
- ```
-
-
-## 5. Install Hadoop on all nodes
-```
-CP $HOME/hadoop-2.7.3.tar.gz $HOME
-DN "tar xf hadoop-2.7.3.tar.gz --gzip"
-```
-
-## 6. HDFS configuration
-
-You need to modify 2 config files for HDFS
-
-1. core-site.xml #Modify the Hostname for the Name node
- ```
- cd $HOME/hadoop-cluster-utils/conf
- cp core-site.xml.template core-site.xml
- vi core-site.xml
- cp core-site.xml $HADOOP_HOME/etc/hadoop
- CP core-site.xml $HADOOP_HOME/etc/hadoop
- ```
-
-2. hdfs-site.xml
-
- create local dir in name node for meta-data
-
- ``` mkdir -p /data/user/hdfs-meta-data ```
-
- create local dir in all data-nodes for hdfs-data
+  ```bash
+  git clone https://github.com/kmadhugit/hadoop-cluster-utils
-	``` DN "mkdir -p /data/user/hdfs-data" ```
-
- update dir path
+  cd hadoop-cluster-utils
  ```
- cd $HOME/hadoop-cluster-utils/conf
- cp hdfs-site.xml.template hdfs-site.xml
- vi hdfs-site.xml #update dir path
- ```
- Copy the files to all nodes
- ```
- cp hdfs-site.xml $HADOOP_HOME/etc/hadoop
- CP hdfs-site.xml $HADOOP_HOME/etc/hadoop
- ```
+
+* Configuration

-3. Start HDFS as fresh FS
+  To configure `hadoop-cluster-utils`, copy `config.sh.template` to `config.sh` and edit that file. See `config.sh.template` for detailed configuration instructions. After editing `config.sh`, execute `./run.sh` to run the automated hadoop installation.
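+
+  For example, a minimal run on the master node might look like this (the editor and the values you fill in are placeholders; `config.sh.template` documents every setting):
+
+  ```bash
+  cp config.sh.template config.sh
+  vi config.sh   # set MASTER, SLAVES, the HDFS directories and the YARN limits
+  ./run.sh       # downloads hadoop-2.7.3, fills in the XML templates and starts the cluster
+  ```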
- ```
-$HADOOP_PREFIX/bin/hdfs namenode -format mycluster
-start-hdfs.sh
-AN jps
-# use stop-hdfs.sh for stopping
- ```
-
-4. Start HDFS on existing cluster data
- You need to modify ownership to self to use already created data
-
- ```
- AN "sudo chown user:user /data/hdfs-meta-data"
- AN "sudo chown user:user /data/hdfs-data"
- start-hdfs.sh
- AN jps
- ```
-
- Ensure that the following java process is running in master. If not, check the log files
+* Ensure that the following java process is running in master. If not, check the log files
 ```
 NameNode
+ JobHistoryServer
+ ResourceManager
 ```
 Ensure that the following java process is running in slaves. If not, check the log files
 ```
 DataNode
+ NodeManager
 ```
-
-5. HDFS web address
-
- ```
- http://localhost:50070
- ```
-
-## 7. Yarn configuration
-
-You need to modify 2 config files for HDFS
-
-1. capacity-scheduler.xml #Modify resource-calculator property to DominantResourceCalculator
-
- ```bash
- vi $HADOOP_HOME/etc/hadoop/capacity-scheduler.xml
- ```
- ```xml
- yarn.scheduler.capacity.resource-calculator
- org.apache.hadoop.yarn.util.resource.DominantResourceCalculator
- ```
-2. yarn-site.xml # Modify the properties as per the description provided in the template
-
- ```
- cd $HOME/hadoop-cluster-utils/conf
- cp yarn-site.xml.template yarn-site.xml
- vi yarn-site.xml
- cp yarn-site.xml $HADOOP_HOME/etc/hadoop
- CP yarn-site.xml $HADOOP_HOME/etc/hadoop
- AN jps
- ```
-
- Ensure that the following java process is started in master. If not, check the log files
+
+* HDFS, Resource Manager and Node Manager web Address
 ```
- JobHistoryServer
- ResourceManager
+ HDFS web address : http://localhost:50070
+ Resource Manager : http://localhost:8088/cluster
+ Node Manager : http://datanode:8042/node (For each node)
 ```
- Ensure that the following java process is started in slaves. If not, check the log files
- ```
- NodeManager
- ```
-
-3. Start Yarn
- ```
- start-yarn.sh
- AN jps
- ```
-
-3. Resource Manager and Node Manager web Address
- ```
- Resource Manager : http://localhost:8088/cluster
- Node Manager : http://datanode:8042/node (For each node)
- ```
-
-## 8. Useful scripts
-
- ```
- > stop-all.sh #stop HDFS and Yarn
- > start-all.sh #start HDFS and Yarn
- > CP #Copy file from name nodes to all slaves
- > AN #execute a given command in all nodes including master
- > DN #execute a given command in all nodes excluding master
- ```
-
-## 9. Spark Installation.
-
-### a. Download Binary
-
-```
-http://spark.apache.org/downloads.html
-#Choose the right mirror, below link is for US machines.
-wget http://www-us.apache.org/dist/spark/spark-2.0.1/spark-2.0.1-bin-hadoop2.7.tgz
-tar -zvf spark-2.0.1-bin-hadoop2.7.tgz
-```
-
-### b. Build it yourself
-
-```
-git clone https://github.com/apache/spark.git
-git checkout -b v2.0.1 v2.0.1
-export MAVEN_OPTS="-Xmx32G -XX:MaxPermSize=8G -XX:ReservedCodeCacheSize=2G"
-./build/mvn -T40 -Pyarn -Phadoop-2.7 -Dhadoop.version=2.7.3 -Phive -Phive-thriftserver -DskipTests -Dmaven.javadoc.skip=true install
-```
-
-### c. Test (pre-built spark version)
-```
-#Add in ~/.bashrc
-export SPARK_HOME=$HOME/spark-2.0.1-bin-hadoop2.7
-
-. ~/.bashrc
-
-${SPARK_HOME}bin/spark-submit --class org.apache.spark.examples.SparkPi --master yarn-client --driver-memory 1024M --num-executors 2 --executor-memory 1g --executor-cores 1 ${SPARK_HOME}/examples/jars/spark-examples_2.11-2.0.1.jar 10
-```
-
-### d. Test (manual spark build)
-
-```
-#Add in ~/.bashrc
-export SPARK_HOME=$HOME/spark
-
-. ~/.bashrc
-
-$SPARK_HOME/bin/spark-submit --class org.apache.spark.examples.SparkPi --master yarn-client --driver-memory 1024M --num-executors 2 --executor-memory 1g --executor-cores 1 /home/testuser/spark/examples/target/scala-2.11/jars/spark-examples_2.11-2.0.1.jar
-
-```
-
-### e. Enable EventLogging & additional settings by adding the following content to $SPARK_HOME/conf/spark-defaults.conf
-```
-spark.eventLog.enabled true
-spark.eventLog.dir /tmp/spark-events
-spark.eventLog.compress true
-spark.history.fs.logDirectory /tmp/spark-events
-spark.serializer org.apache.spark.serializer.KryoSerializer
-```
-
-### f. Start/Stop All Services.
-
- The below scripts are used to start/stop the following services in an automated way,
-
- - namenode daemon (only on hdfs master)
- - datanode daemon (on all slave nodes)
- - resource manager daemon (only on yarn master)
- - node manager daemon (on all slave nodes)
- - job history server (only on yarn master)
- - Spark history server (on yarn master)
-
-```
- # Start
-
- start-all.sh
- # Stop
+* Useful scripts
-
- stop-all.sh
-=======
-```
-
-## 10. Spark command line options for Yarn Scheduler.
-
-
-| Option | Description |
-|--------|-------------|
-| --num-executors | Total number of executor JVMs to spawn across Yarn Cluster |
-| --executor-cores | Total number of cores in each executor JVM |
-| --executor-memory | Memory to be allocated for each JVM 1024M/1G|
-| --driver-memory | Memory to be allocated for driver JVM |
-| --driver-cores | Total number of vcores for driver JVM |
-| | Total vcores = num-executors * executor-vcores + driver-cores |
-| | Total Memory = num-executors * executor-memory + driver-memory |
-|--driver-java-options | To pass driver JVM, useful in local mode for profiling |
-
------------------------------------------------------------------
+
+  ```
+  > stop-all.sh #stop HDFS and Yarn
+  > start-all.sh #start HDFS and Yarn
+  > CP #Copy file from name nodes to all slaves
+  > AN #execute a given command in all nodes including master
+  > DN #execute a given command in all nodes excluding master
+  ```
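+
+  For example (illustrative only; `AN`, `DN` and `CP` ship with this repository):
+
+  ```
+  AN jps                               #check the java processes on every node
+  DN "mkdir -p /data/user/hdfs-data"   #run a command on the data nodes only
+  CP $HOME/.bashrc $HOME               #copy a file from the master to the same path on all slaves
+  ```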
\ No newline at end of file
diff --git a/add-this-to-dot-profile.sh b/add-this-to-dot-profile.sh
deleted file mode 100644
index 71402e2..0000000
--- a/add-this-to-dot-profile.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-
-export PATH=$HOME/hadoop-cluster-utils/utils:$HOME/hadoop-cluster-utils/hadoop:$PATH
-
-export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-ppc64el
-export HADOOP_HOME=$HOME/hadoop-2.7.3
-
-
-export HADOOP_PREFIX=$HADOOP_HOME
-export HADOOP_MAPRED_HOME=$HADOOP_HOME
-export HADOOP_COMMON_HOME=$HADOOP_HOME
-export HADOOP_HDFS_HOME=$HADOOP_HOME
-export YARN_HOME=$HADOOP_HOME
-export HADOOP_CONF_DIR=$HADOOP_HOME/etc/hadoop
-export YARN_CONF_DIR=$HADOOP_HOME/etc/hadoop
-
-export PATH=${HADOOP_HOME}/bin:$PATH
-set -o vi
-
-# Some convenient aliases and functions for running Hadoop-related commands
-unalias fs &> /dev/null
-alias fs="hadoop fs"
-unalias hls &> /dev/null
-alias hls="fs -ls"
-
diff --git a/conf/core-site.xml.template b/conf/core-site.xml.template
index 64262ec..86b6c64 100644
--- a/conf/core-site.xml.template
+++ b/conf/core-site.xml.template
@@ -17,13 +17,16 @@
 <configuration>
-<property>
-<name>fs.defaultFS</name>
-<value>hdfs://namenode</value>
-<description>Name node URL</description>
-</property>
-
-<property>
-<name>hadoop.tmp.dir</name>
-<value>file:/spark1/data/baidu/tmp</value>
-</property>
-</configuration>
+
+  <property>
+    <name>hadoop.tmp.dir</name>
+    <value>HADOOP.TMP.DIR</value>
+  </property>
+
+  <property>
+    <name>fs.defaultFS</name>
+    <value>hdfs://YARN_RESOURCEMANAGER_HOSTNAME:9000</value>
+    <description>Name node URL</description>
+  </property>
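+
+  <!-- run.sh replaces the two placeholders above with the values from config.sh;
+       with the defaults shipped in config.sh.template, hadoop.tmp.dir becomes
+       /home/testuser/app/tmp and fs.defaultFS becomes hdfs://10.88.67.158:9000
+       (the MASTER address). -->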
+</configuration>
\ No newline at end of file
diff --git a/conf/hdfs-site.xml.template b/conf/hdfs-site.xml.template
index 70bdf64..2e870d4 100644
--- a/conf/hdfs-site.xml.template
+++ b/conf/hdfs-site.xml.template
@@ -18,21 +18,21 @@
 <configuration>
-<property>
-<name>dfs.replication</name>
-<value>3</value>
-</property>
-
-<property>
-<name>dfs.namenode.name.dir</name>
-<value>file:/data/madhu/hdfs-meta-data</value>
-<description>Meta data dir - can be RAM FS only on Namename</description>
-</property>
-
-<property>
-<name>dfs.datanode.data.dir</name>
-<value>file:/data/madhu/hdfs-data</value>
-<description>Data dir - on all data nodes</description>
-</property>
+
+  <property>
+    <name>dfs.replication</name>
+    <value>REPLICATION_VALUE</value>
+  </property>
+
+  <property>
+    <name>dfs.namenode.name.dir</name>
+    <value>NAMENODE_DIR</value>
+    <description>Meta data dir - can be a RAM FS, only on the Namenode</description>
+  </property>
+
+  <property>
+    <name>dfs.datanode.data.dir</name>
+    <value>DATANODE_DIR</value>
+    <description>Data dir - on all data nodes</description>
+  </property>
 </configuration>
diff --git a/conf/yarn-site.xml.template b/conf/yarn-site.xml.template
index 37f6325..e8213e9 100644
--- a/conf/yarn-site.xml.template
+++ b/conf/yarn-site.xml.template
@@ -16,55 +16,62 @@
 <configuration>
-<property>
-<name>yarn.resourcemanager.hostname</name>
-<value>n001</value>
-</property>
-<property>
-<name>yarn.resourcemanager.webapp.address</name>
-<value>0.0.0.0:8088</value>
-</property>
+
+  <property>
+    <name>yarn.resourcemanager.hostname</name>
+    <value>YARN_RESOURCEMANAGER_HOSTNAME</value>
+  </property>
+
+  <property>
+    <name>yarn.resourcemanager.webapp.address</name>
+    <value>0.0.0.0:8088</value>
+  </property>

-<property>
-<name>yarn.scheduler.minimum-allocation-mb</name>
-<value>128</value>
-<description>Min value for --executor-memory</description>
-</property>
-<property>
-<name>yarn.scheduler.maximum-allocation-mb</name>
-<value>204800</value>
-<description>Max value for --executor-memory</description>
-</property>
-<property>
-<name>yarn.scheduler.minimum-allocation-vcores</name>
-<value>1</value>
-<description>Min value for --executor-vcore</description>
-</property>
-<property>
-<name>yarn.scheduler.maximum-allocation-vcores</name>
-<value>40</value>
-<description>Max value for --executor-vcore</description>
-</property>
+
+  <property>
+    <name>yarn.scheduler.minimum-allocation-mb</name>
+    <value>YARN_SCHEDULER_MIN_ALLOCATION_MB</value>
+    <description>Min value for --executor-memory</description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.maximum-allocation-mb</name>
+    <value>YARN_SCHEDULER_MAX_ALLOCATION_MB</value>
+    <description>Max value for --executor-memory</description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.minimum-allocation-vcores</name>
+    <value>YARN_SCHEDULER_MIN_ALLOCATION_VCORES</value>
+    <description>Min value for --executor-vcores</description>
+  </property>
+
+  <property>
+    <name>yarn.scheduler.maximum-allocation-vcores</name>
+    <value>YARN_SCHEDULER_MAX_ALLOCATION_VCORES</value>
+    <description>Max value for --executor-vcores</description>
+  </property>

-<property>
-<name>yarn.nodemanager.resource.cpu-vcores</name>
-<value>160</value>
-<description>Vcore capacity of this node</description>
-</property>
-<property>
-<name>yarn.nodemanager.resource.memory-mb</name>
-<value>204800</value>
-<description>Memory Capacity of this node</description>
-</property>
-<property>
-<name>yarn.nodemanager.vmem-check-enabled</name>
-<value>false</value>
-</property>
-<property>
-<name>yarn.nodemanager.pmem-check-enabled</name>
-<value>false</value>
-</property>
+
+  <property>
+    <name>yarn.nodemanager.resource.cpu-vcores</name>
+    <value>YARN_NODEMANAGER_RESOURCE_CPU_VCORES</value>
+    <description>Vcore capacity of this node</description>
+  </property>
+
+  <property>
+    <name>yarn.nodemanager.resource.memory-mb</name>
+    <value>YARN_NODEMANAGER_RESOURCE_MEMORY_MB</value>
+    <description>Memory Capacity of this node</description>
+  </property>
+
+  <property>
+    <name>yarn.nodemanager.vmem-check-enabled</name>
+    <value>false</value>
+  </property>
+
+  <property>
+    <name>yarn.nodemanager.pmem-check-enabled</name>
+    <value>false</value>
+  </property>
 </configuration>
diff --git a/config.sh.template b/config.sh.template
new file mode 100644
index 0000000..36fd1b9
--- /dev/null
+++ b/config.sh.template
@@ -0,0 +1,25 @@
+
+# Default hdfs configuration properties
+
+HADOOP_TMP_DIR=/home/testuser/app/tmp
+REPLICATION_FACTOR=3
+DFS_NAMENODE_NAME_DIR=/home/testuser/data/hdfs-meta
+DFS_DATANODE_NAME_DIR=/home/testuser/data/hdfs-data
+
+# Site specific YARN configuration properties
+
+MASTER=10.88.67.158
+SLAVES=10.88.67.158,19,40g%10.88.67.113,20,50g%10.88.67.150,19,40g
+# Use this format to set SLAVE IPs : slave1IP,slave1cpu,slave1memory%slave2IP,slave2cpu,slave2memory%....%slaveNIP,slaveNcpu,slaveNmemory
+
+# Scheduler properties
+
+YARN_SCHEDULER_MIN_ALLOCATION_MB=128
+YARN_SCHEDULER_MAX_ALLOCATION_MB=4048
+YARN_SCHEDULER_MIN_ALLOCATION_VCORES=1
+YARN_SCHEDULER_MAX_ALLOCATION_VCORES=4
+
+# Node Manager properties (Default yarn cpu and memory value for all nodes)
+
+YARN_NODEMANAGER_RESOURCE_CPU_VCORES=4
+YARN_NODEMANAGER_RESOURCE_MEMORY_MB=4048
diff --git a/run.sh b/run.sh
new file mode 100755
index 0000000..b553ee2
--- /dev/null
+++ b/run.sh
@@ -0,0 +1,202 @@
+#!/bin/bash -l
+
+# Need to create user manually
+# Need to set JAVA_HOME
+
+if [ -z "$JAVA_HOME" ];
+then
+    echo "JAVA_HOME not found in your environment, please set the JAVA_HOME variable in your environment and then re-run this script."
+    exit 1
+else
+    echo "JAVA_HOME found"
+fi
+
+CURDIR=`pwd`     # the hadoop-cluster-utils directory where run.sh lives
+WORKDIR=${HOME}  # where the hadoop and spark packages will be downloaded
+
+# Validation for config file
+
+if [ -f ${CURDIR}/config.sh ];
+then
+    # First time permission set for config.sh file
+    chmod +x config.sh
+    source config.sh
+
+    # Checking the config file whether all fields are filled
+
+    while read -r line; do
+        if [[ $line =~ "=" ]] ;
+        then
+            confvalue=`echo $line | grep = | cut -d "=" -f2`
+            if [[ -z "$confvalue" ]];
+            then
+                echo "Configuration value not set properly for $line, please check the config.sh file"
+                exit 1
+            fi
+        fi
+    done < <(cat ${CURDIR}/config.sh; echo)
+
+    # Slicing MASTERIP
+    MASTERIP=`cat ${CURDIR}/config.sh | grep MASTER | cut -d "=" -f2`
+
+    # Counting number of SLAVEIP
+    SLAVELIST=`cat ${CURDIR}/config.sh | grep SLAVES | cut -d "=" -f2 | tr "%" "\n" | wc -l`
+
+    # Slicing SLAVEIP in list and save into slaves file
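+    # For example (hypothetical addresses): with SLAVES=10.0.0.1,19,40g%10.0.0.2,20,50g
+    # the loop below keeps only the IP field of every %-separated entry, so
+    # ${CURDIR}/conf/slaves ends up with one IP per line (10.0.0.1 and 10.0.0.2).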
+    declare -a SLAVEIP=()
+    cd ${CURDIR}/conf
+    rm -f slaves
+    for (( i=1; i<${SLAVELIST}+1; i++ ));
+    do
+        echo "`cat ${CURDIR}/config.sh | grep SLAVES | cut -d "=" -f2 | cut -d'%' -f$i | cut -d',' -f1`" >> ${CURDIR}/conf/slaves
+    done
+
+    # Validation for IP
+    while IFS= read -r ip; do
+        if ping -q -c2 "$ip" &>/dev/null;
+        then
+            echo "$ip is Pingable"
+        else
+            echo "$ip Not Pingable"
+            echo "Please check your config.sh file. $ip is not pingable."
+            exit 1
+        fi
+    done <${CURDIR}/conf/slaves
+
+
+    # Download and install hadoop
+
+    echo -n "Download and install hadoop ... "
+    cd ${WORKDIR}
+    if [ ! -d ${WORKDIR}/hadoop-2.7.3 ];
+    then
+        wget http://www-us.apache.org/dist/hadoop/common/hadoop-2.7.3/hadoop-2.7.3.tar.gz
+        tar xf hadoop-2.7.3.tar.gz --gzip
+
+        # export path to the .bashrc file
+        grep "CURDIR" $HOME/.bashrc
+        if [ $? -ne 0 ];
+        then
+            echo "export CURDIR="${CURDIR}"" >> $HOME/.bashrc
+            echo "export PATH="${CURDIR}"/utils:"${CURDIR}"/hadoop:\$PATH" >> $HOME/.bashrc
+            echo "export HADOOP_HOME="${WORKDIR}"/hadoop-2.7.3" >> $HOME/.bashrc
+            echo "export HADOOP_PREFIX=\$HADOOP_HOME" >> $HOME/.bashrc
+            echo "export HADOOP_MAPRED_HOME=\$HADOOP_HOME" >> $HOME/.bashrc
+            echo "export HADOOP_COMMON_HOME=\$HADOOP_HOME" >> $HOME/.bashrc
+            echo "export HADOOP_HDFS_HOME=\$HADOOP_HOME" >> $HOME/.bashrc
+            echo "export YARN_HOME=\$HADOOP_HOME" >> $HOME/.bashrc
+            echo "export HADOOP_CONF_DIR=\$HADOOP_HOME/etc/hadoop" >> $HOME/.bashrc
+            echo "export YARN_CONF_DIR=\$HADOOP_HOME/etc/hadoop" >> $HOME/.bashrc
+            echo "export PATH=\$HADOOP_HOME/bin:\$PATH" >> $HOME/.bashrc
+        fi
+
+        source $HOME/.bashrc
+
+        # copy .bashrc to all other data nodes
+        #CP $HOME/.bashrc $HOME
+        #CP ${WORKDIR}/hadoop-2.7.3.tar.gz ${WORKDIR}
+        #DN "tar xf hadoop-2.7.3.tar.gz --gzip"
+        #scp -r /path/to/file username@hostname:/path/to/destination
+
+        echo "Started filling in the configuration properties under ${CURDIR}/conf"
+
+        if [ ! -f ${CURDIR}/conf/core-site.xml ];
+        then
+            cp ${CURDIR}/conf/core-site.xml.template ${CURDIR}/conf/core-site.xml
+            cp ${CURDIR}/conf/hdfs-site.xml.template ${CURDIR}/conf/hdfs-site.xml
+            cp ${CURDIR}/conf/yarn-site.xml.template ${CURDIR}/conf/yarn-site.xml
+
+            # Copy slaves file into HADOOP_HOME
+            cp ${CURDIR}/conf/slaves $HADOOP_HOME/etc/hadoop
+            #CP ${CURDIR}/conf/slaves $HADOOP_HOME/etc/hadoop
+
+            # core-site.xml configuration properties
+            # (the MASTER address from config.sh is used as the NameNode / ResourceManager host)
+            sed -i 's|HADOOP.TMP.DIR|'"$HADOOP_TMP_DIR"'|g' ${CURDIR}/conf/core-site.xml
+            sed -i 's|YARN_RESOURCEMANAGER_HOSTNAME|'"$MASTERIP"'|g' ${CURDIR}/conf/core-site.xml
+            cp ${CURDIR}/conf/core-site.xml $HADOOP_HOME/etc/hadoop
+            #CP ${CURDIR}/conf/core-site.xml $HADOOP_HOME/etc/hadoop
+
+            # hdfs-site.xml configuration properties
+            sed -i 's|REPLICATION_VALUE|'"$REPLICATION_FACTOR"'|g' ${CURDIR}/conf/hdfs-site.xml
+            sed -i 's|NAMENODE_DIR|'"$DFS_NAMENODE_NAME_DIR"'|g' ${CURDIR}/conf/hdfs-site.xml
+            sed -i 's|DATANODE_DIR|'"$DFS_DATANODE_NAME_DIR"'|g' ${CURDIR}/conf/hdfs-site.xml
+            cp ${CURDIR}/conf/hdfs-site.xml $HADOOP_HOME/etc/hadoop
+            #CP ${CURDIR}/conf/hdfs-site.xml $HADOOP_HOME/etc/hadoop
+
+            # yarn-site.xml configuration properties
+            sed -i 's|YARN_RESOURCEMANAGER_HOSTNAME|'"$MASTERIP"'|g' ${CURDIR}/conf/yarn-site.xml
+            sed -i 's|YARN_SCHEDULER_MIN_ALLOCATION_MB|'"$YARN_SCHEDULER_MIN_ALLOCATION_MB"'|g' ${CURDIR}/conf/yarn-site.xml
+            sed -i 's|YARN_SCHEDULER_MAX_ALLOCATION_MB|'"$YARN_SCHEDULER_MAX_ALLOCATION_MB"'|g' ${CURDIR}/conf/yarn-site.xml
+            sed -i 's|YARN_SCHEDULER_MIN_ALLOCATION_VCORES|'"$YARN_SCHEDULER_MIN_ALLOCATION_VCORES"'|g' ${CURDIR}/conf/yarn-site.xml
+            sed -i 's|YARN_SCHEDULER_MAX_ALLOCATION_VCORES|'"$YARN_SCHEDULER_MAX_ALLOCATION_VCORES"'|g' ${CURDIR}/conf/yarn-site.xml
+            sed -i 's|YARN_NODEMANAGER_RESOURCE_CPU_VCORES|'"$YARN_NODEMANAGER_RESOURCE_CPU_VCORES"'|g' ${CURDIR}/conf/yarn-site.xml
+            sed -i 's|YARN_NODEMANAGER_RESOURCE_MEMORY_MB|'"$YARN_NODEMANAGER_RESOURCE_MEMORY_MB"'|g' ${CURDIR}/conf/yarn-site.xml
+            cp ${CURDIR}/conf/yarn-site.xml $HADOOP_HOME/etc/hadoop
+            #CP ${CURDIR}/conf/yarn-site.xml $HADOOP_HOME/etc/hadoop
+
+            echo "Finished filling in the configuration properties under ${CURDIR}/conf and copied them to $HADOOP_HOME/etc/hadoop"
+        fi
+
+        # Change the JAVA_HOME variable in hadoop-env.sh
+        sed -i 's|${JAVA_HOME}|'"${JAVA_HOME}"'|g' $HADOOP_HOME/etc/hadoop/hadoop-env.sh
+
+        echo "Started creating directories"
+
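+        # The HDFS directories are only created (and the namenode only formatted) on
+        # the very first run: if $HADOOP_TMP_DIR already exists, the block below is
+        # skipped and the existing namenode data is left untouched.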
+        if [ ! -d "$HADOOP_TMP_DIR" ];
+        then
+            # Creating directories
+            mkdir -p $HADOOP_TMP_DIR
+            #DN "mkdir -p $HADOOP_TMP_DIR"
+            mkdir -p $DFS_NAMENODE_NAME_DIR
+            #DN "mkdir -p $DFS_NAMENODE_NAME_DIR"
+            mkdir -p $DFS_DATANODE_NAME_DIR
+            #DN "mkdir -p $DFS_DATANODE_NAME_DIR"
+            echo "Finished creating directories"
+
+            echo "Formatting the NAMENODE"
+            $HADOOP_PREFIX/bin/hdfs namenode -format mycluster
+        fi
+    fi
+else
+    echo "Config file does not exist. Please check README.md for the installation steps."
+    exit 1
+fi  # end of the config.sh validation condition
+
+$HADOOP_PREFIX/sbin/start-all.sh
+# use stop-all.sh for stopping
+
+
+# Ensure all nodes are running correctly.
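+# Per the README, the master should now show NameNode, ResourceManager and
+# JobHistoryServer in the jps output, and every slave should show DataNode and
+# NodeManager; if something is missing, check the hadoop log files.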
+AN jps
+echo "HDFS web address : http://localhost:50070"
+echo "Resource Manager : http://localhost:8088/cluster"
+echo "Node Manager : http://datanode:8042/node (For each node)"
+
+# Spark installation
+
+echo -n "Download and install Spark ... "
+cd ${WORKDIR}
+if [ ! -d ${WORKDIR}/spark-2.0.1-bin-hadoop2.7 ];
+then
+    wget http://www-us.apache.org/dist/spark/spark-2.0.1/spark-2.0.1-bin-hadoop2.7.tgz
+    tar xf spark-2.0.1-bin-hadoop2.7.tgz --gzip
+fi
+
+echo "Export SPARK_HOME to the PATH"
+
+# Add scripts to the PATH
+grep "SPARK_HOME" ~/.bashrc
+if [ $? -ne 0 ]; then
+    echo "export SPARK_HOME="${WORKDIR}"/spark-2.0.1-bin-hadoop2.7" >> ~/.bashrc
+    echo "export PATH=\$SPARK_HOME/bin:\$PATH" >> ~/.bashrc
+fi
+
+source $HOME/.bashrc
+
+echo "Finished exporting SPARK_HOME in the .bashrc file"
+echo "Spark installation done..!!"
+echo "Fully completed..!!"
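+
+# Optional smoke test (not run by this script): the old manual instructions verified
+# Spark on YARN with the bundled SparkPi example; something similar can be run by hand
+# once the cluster is up, e.g.
+#   $SPARK_HOME/bin/spark-submit --class org.apache.spark.examples.SparkPi \
+#     --master yarn-client --num-executors 2 --executor-memory 1g --executor-cores 1 \
+#     $SPARK_HOME/examples/jars/spark-examples_2.11-2.0.1.jar 10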