diff --git a/README.md b/README.md
index 523ecd1a..433594d9 100644
--- a/README.md
+++ b/README.md
@@ -197,6 +197,15 @@ EC2.
 These scripts are intended to be used by the default Spark AMI and is *not*
 expected to work on other AMIs. If you wish to start a cluster using Spark,
 please refer to http://spark-project.org/docs/latest/ec2-scripts.html
 
+## Using S3 with Hadoop 2.6 or newer
+
+Starting with Hadoop 2.6.0, the S3 filesystem connector lives in a separate library called `hadoop-aws`.
+
+- To make the package available, add it as a dependency, e.g. `libraryDependencies += "org.apache.hadoop" % "hadoop-aws" % "2.6.4"`.
+- It can also be passed directly to `spark-submit`, e.g. `spark-submit --packages org.apache.hadoop:hadoop-aws:2.6.4 SimpleApp.py`.
+
+On a related note, starting with Hadoop 2.6.0 the `s3a` filesystem is recommended over `s3n`.
+
 ## spark-ec2 Internals
 The Spark cluster setup is guided by the values set in `ec2-variables.sh`.`setup.sh`
@@ -237,3 +246,4 @@ after the templates have been configured. You can use the environment variables
 get a list of slave hostnames and `/root/spark-ec2/copy-dir` to sync a directory across
 machines.
 5. Modify `spark_ec2.py` to add your module to the list of enabled modules.
+
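Note (not part of the patch): a usage sketch to accompany the README section above. Assuming `hadoop-aws` is on the classpath via `--packages` or `libraryDependencies`, and that AWS credentials come from the usual provider chain, reading through `s3a` from PySpark looks roughly like this. The bucket, key, and app name are placeholders.

```python
# Hypothetical example for the README section above; assumes hadoop-aws was
# supplied via --packages or libraryDependencies and that AWS credentials are
# available through the standard configuration chain.
from pyspark import SparkContext

sc = SparkContext(appName="S3AExample")

# s3a:// is the scheme recommended over s3n:// on Hadoop 2.6.0 and newer.
lines = sc.textFile("s3a://my-bucket/path/to/input.txt")
print(lines.count())

sc.stop()
```

It could be submitted with, e.g., `spark-submit --packages org.apache.hadoop:hadoop-aws:2.6.4 s3a_example.py`.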
diff --git a/deploy.generic/root/spark-ec2/ec2-variables.sh b/deploy.generic/root/spark-ec2/ec2-variables.sh
index 4f3e8da8..b170c9c4 100644
--- a/deploy.generic/root/spark-ec2/ec2-variables.sh
+++ b/deploy.generic/root/spark-ec2/ec2-variables.sh
@@ -27,6 +27,7 @@ export MODULES="{{modules}}"
 export SPARK_VERSION="{{spark_version}}"
 export TACHYON_VERSION="{{tachyon_version}}"
 export HADOOP_MAJOR_VERSION="{{hadoop_major_version}}"
+export HADOOP_MINOR_VERSION="{{hadoop_minor_version}}"
 export SWAP_MB="{{swap}}"
 export SPARK_WORKER_INSTANCES="{{spark_worker_instances}}"
 export SPARK_MASTER_OPTS="{{spark_master_opts}}"
diff --git a/deploy_templates.py b/deploy_templates.py
index 895e55a4..883cda4c 100755
--- a/deploy_templates.py
+++ b/deploy_templates.py
@@ -73,6 +73,7 @@
     "spark_version": os.getenv("SPARK_VERSION"),
     "tachyon_version": os.getenv("TACHYON_VERSION"),
     "hadoop_major_version": os.getenv("HADOOP_MAJOR_VERSION"),
+    "hadoop_minor_version": os.getenv("HADOOP_MINOR_VERSION"),
     "java_home": os.getenv("JAVA_HOME"),
     "default_tachyon_mem": "%dMB" % tachyon_mb,
     "system_ram_mb": "%d" % system_ram_mb,
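Note (not part of the patch): for context on the two hunks above, `ec2-variables.sh` is generated from a template, and `deploy_templates.py` fills each `{{...}}` placeholder from the matching environment variable. A minimal sketch of that substitution, assuming plain string replacement; the helper name and example line are illustrative.

```python
import os

# Mirrors the template_vars entries touched in this diff.
template_vars = {
    "hadoop_major_version": os.getenv("HADOOP_MAJOR_VERSION", "yarn"),
    "hadoop_minor_version": os.getenv("HADOOP_MINOR_VERSION", "2.4"),
}

def fill_template(text, variables):
    # Replace each {{key}} with its value; unknown placeholders are left intact.
    for key, value in variables.items():
        if value is not None:
            text = text.replace("{{" + key + "}}", value)
    return text

line = 'export HADOOP_MINOR_VERSION="{{hadoop_minor_version}}"'
print(fill_template(line, template_vars))  # -> export HADOOP_MINOR_VERSION="2.4"
```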
diff --git a/ephemeral-hdfs/init.sh b/ephemeral-hdfs/init.sh
index 0e18bca8..d3d97099 100755
--- a/ephemeral-hdfs/init.sh
+++ b/ephemeral-hdfs/init.sh
@@ -30,11 +30,27 @@ case "$HADOOP_MAJOR_VERSION" in
     cp /root/hadoop-native/* /root/ephemeral-hdfs/lib/native/
     ;;
   yarn)
-    wget http://s3.amazonaws.com/spark-related-packages/hadoop-2.4.0.tar.gz
-    echo "Unpacking Hadoop"
-    tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log
-    rm hadoop-*.tar.gz
-    mv hadoop-2.4.0/ ephemeral-hdfs/
+    if [[ "$HADOOP_MINOR_VERSION" == "2.4" ]]; then
+      wget http://s3.amazonaws.com/spark-related-packages/hadoop-2.4.0.tar.gz
+      echo "Unpacking Hadoop"
+      tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log
+      rm hadoop-*.tar.gz
+      mv hadoop-2.4.0/ ephemeral-hdfs/
+    elif [[ "$HADOOP_MINOR_VERSION" == "2.6" ]]; then
+      wget http://s3.amazonaws.com/spark-related-packages/hadoop-2.6.5.tar.gz
+      echo "Unpacking Hadoop"
+      tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log
+      rm hadoop-*.tar.gz
+      mv hadoop-2.6.5/ ephemeral-hdfs/
+    elif [[ "$HADOOP_MINOR_VERSION" == "2.7" ]]; then
+      wget http://s3.amazonaws.com/spark-related-packages/hadoop-2.7.3.tar.gz
+      echo "Unpacking Hadoop"
+      tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log
+      rm hadoop-*.tar.gz
+      mv hadoop-2.7.3/ ephemeral-hdfs/
+    else
+      echo "ERROR: Unsupported Hadoop minor version"
+    fi
 
   # Have single conf dir
   rm -rf /root/ephemeral-hdfs/etc/hadoop/
diff --git a/persistent-hdfs/init.sh b/persistent-hdfs/init.sh
index 735cebcc..5224835b 100755
--- a/persistent-hdfs/init.sh
+++ b/persistent-hdfs/init.sh
@@ -29,11 +29,27 @@ case "$HADOOP_MAJOR_VERSION" in
     cp /root/hadoop-native/* /root/persistent-hdfs/lib/native/
     ;;
   yarn)
-    wget http://s3.amazonaws.com/spark-related-packages/hadoop-2.4.0.tar.gz
-    echo "Unpacking Hadoop"
-    tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log
-    rm hadoop-*.tar.gz
-    mv hadoop-2.4.0/ persistent-hdfs/
+    if [[ "$HADOOP_MINOR_VERSION" == "2.4" ]]; then
+      wget http://s3.amazonaws.com/spark-related-packages/hadoop-2.4.0.tar.gz
+      echo "Unpacking Hadoop"
+      tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log
+      rm hadoop-*.tar.gz
+      mv hadoop-2.4.0/ persistent-hdfs/
+    elif [[ "$HADOOP_MINOR_VERSION" == "2.6" ]]; then
+      wget http://s3.amazonaws.com/spark-related-packages/hadoop-2.6.5.tar.gz
+      echo "Unpacking Hadoop"
+      tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log
+      rm hadoop-*.tar.gz
+      mv hadoop-2.6.5/ persistent-hdfs/
+    elif [[ "$HADOOP_MINOR_VERSION" == "2.7" ]]; then
+      wget http://s3.amazonaws.com/spark-related-packages/hadoop-2.7.3.tar.gz
+      echo "Unpacking Hadoop"
+      tar xvzf hadoop-*.tar.gz > /tmp/spark-ec2_hadoop.log
+      rm hadoop-*.tar.gz
+      mv hadoop-2.7.3/ persistent-hdfs/
+    else
+      echo "ERROR: Unsupported Hadoop minor version"
+    fi
 
   # Have single conf dir
   rm -rf /root/persistent-hdfs/etc/hadoop/
diff --git a/scala/init.sh b/scala/init.sh
index 73a299f5..8d0a3ac7 100755
--- a/scala/init.sh
+++ b/scala/init.sh
@@ -11,10 +11,12 @@ SCALA_VERSION="2.10.3"
 
 if [[ "0.7.3 0.8.0 0.8.1" =~ $SPARK_VERSION ]]; then
   SCALA_VERSION="2.9.3"
+elif [[ "2.0.0 2.0.0-preview 2.0.1" =~ $SPARK_VERSION ]]; then
+  SCALA_VERSION="2.11.8"
 fi
 
 echo "Unpacking Scala"
 wget http://s3.amazonaws.com/spark-related-packages/scala-$SCALA_VERSION.tgz
 tar xvzf scala-*.tgz > /tmp/spark-ec2_scala.log
 rm scala-*.tgz
 mv `ls -d scala-* | grep -v ec2` scala
diff --git a/spark/init.sh b/spark/init.sh
index 71fbc7bf..b096b07b 100755
--- a/spark/init.sh
+++ b/spark/init.sh
@@ -24,119 +24,99 @@
 then
   # Pre-packaged spark version:
 else
-  case "$SPARK_VERSION" in
-    0.7.3)
-      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
-        wget http://s3.amazonaws.com/spark-related-packages/spark-0.7.3-prebuilt-hadoop1.tgz
-      else
-        wget http://s3.amazonaws.com/spark-related-packages/spark-0.7.3-prebuilt-cdh4.tgz
-      fi
-      ;;
-    0.8.0)
-      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
-        wget http://s3.amazonaws.com/spark-related-packages/spark-0.8.0-incubating-bin-hadoop1.tgz
-      else
-        wget http://s3.amazonaws.com/spark-related-packages/spark-0.8.0-incubating-bin-cdh4.tgz
-      fi
-      ;;
-    0.8.1)
-      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
-        wget http://s3.amazonaws.com/spark-related-packages/spark-0.8.1-incubating-bin-hadoop1.tgz
-      else
-        wget http://s3.amazonaws.com/spark-related-packages/spark-0.8.1-incubating-bin-cdh4.tgz
-      fi
-      ;;
-    0.9.0)
-      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
-        wget http://s3.amazonaws.com/spark-related-packages/spark-0.9.0-incubating-bin-hadoop1.tgz
-      else
-        wget http://s3.amazonaws.com/spark-related-packages/spark-0.9.0-incubating-bin-cdh4.tgz
-      fi
-      ;;
-    0.9.1)
-      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
-        wget http://s3.amazonaws.com/spark-related-packages/spark-0.9.1-bin-hadoop1.tgz
-      else
-        wget http://s3.amazonaws.com/spark-related-packages/spark-0.9.1-bin-cdh4.tgz
-      fi
-      ;;
-    0.9.2)
-      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
-        wget http://s3.amazonaws.com/spark-related-packages/spark-0.9.2-bin-hadoop1.tgz
-      else
-        wget http://s3.amazonaws.com/spark-related-packages/spark-0.9.2-bin-cdh4.tgz
-      fi
-      ;;
-    1.0.0)
-      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
-        wget http://s3.amazonaws.com/spark-related-packages/spark-1.0.0-bin-hadoop1.tgz
-      else
-        wget http://s3.amazonaws.com/spark-related-packages/spark-1.0.0-bin-cdh4.tgz
-      fi
-      ;;
-    1.0.1)
-      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
-        wget http://s3.amazonaws.com/spark-related-packages/spark-1.0.1-bin-hadoop1.tgz
-      else
-        wget http://s3.amazonaws.com/spark-related-packages/spark-1.0.1-bin-cdh4.tgz
-      fi
-      ;;
-    1.0.2)
-      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
-        wget http://s3.amazonaws.com/spark-related-packages/spark-1.0.2-bin-hadoop1.tgz
-      else
-        wget http://s3.amazonaws.com/spark-related-packages/spark-1.0.2-bin-cdh4.tgz
-      fi
-      ;;
-    1.1.0)
-      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
-        wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.0-bin-hadoop1.tgz
-      elif [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then
-        wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.0-bin-cdh4.tgz
-      else
-        wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.0-bin-hadoop2.4.tgz
-      fi
-      ;;
-    1.1.1)
-      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
-        wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.1-bin-hadoop1.tgz
-      elif [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then
-        wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.1-bin-cdh4.tgz
-      else
-        wget http://s3.amazonaws.com/spark-related-packages/spark-1.1.1-bin-hadoop2.4.tgz
-      fi
-      ;;
-    1.2.0)
-      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
-        wget http://s3.amazonaws.com/spark-related-packages/spark-1.2.0-bin-hadoop1.tgz
-      elif [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then
-        wget http://s3.amazonaws.com/spark-related-packages/spark-1.2.0-bin-cdh4.tgz
-      else
-        wget http://s3.amazonaws.com/spark-related-packages/spark-1.2.0-bin-hadoop2.4.tgz
-      fi
-      ;;
-    1.2.1)
-      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
-        wget http://s3.amazonaws.com/spark-related-packages/spark-1.2.1-bin-hadoop1.tgz
-      elif [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then
-        wget http://s3.amazonaws.com/spark-related-packages/spark-1.2.1-bin-cdh4.tgz
-      else
-        wget http://s3.amazonaws.com/spark-related-packages/spark-1.2.1-bin-hadoop2.4.tgz
-      fi
-      ;;
-    *)
-      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
-        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-hadoop1.tgz
-      elif [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then
-        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-cdh4.tgz
-      else
-        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-hadoop2.4.tgz
-      fi
-      if [ $? != 0 ]; then
-        echo "ERROR: Unknown Spark version"
-        return -1
-      fi
-  esac
+  case "$SPARK_VERSION" in
+    0.7.3)
+      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
+        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-prebuilt-hadoop1.tgz
+      elif [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then
+        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-prebuilt-cdh4.tgz
+      else
+        echo "ERROR: Unsupported Hadoop major version"
+        return 1
+      fi
+      ;;
+    0\.8\.0|0\.8\.1|0\.9\.0)
+      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
+        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-incubating-bin-hadoop1.tgz
+      elif [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then
+        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-incubating-bin-cdh4.tgz
+      else
+        echo "ERROR: Unsupported Hadoop major version"
+        return 1
+      fi
+      ;;
+    # 0.9.1 - 1.0.2
+    0\.9\.[1-2]|1\.0\.[0-2])
+      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
+        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-hadoop1.tgz
+      elif [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then
+        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-cdh4.tgz
+      else
+        echo "ERROR: Unsupported Hadoop major version"
+        return 1
+      fi
+      ;;
+    # 1.1.0 - 1.3.0
+    1\.[1-2]\.[0-9]*|1\.3\.0)
+      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
+        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-hadoop1.tgz
+      elif [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then
+        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-cdh4.tgz
+      elif [[ "$HADOOP_MAJOR_VERSION" == "yarn" ]]; then
+        if [[ "$HADOOP_MINOR_VERSION" == "2.4" ]]; then
+          wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-hadoop2.4.tgz
+        else
+          echo "ERROR: Unsupported Hadoop minor version"
+          return 1
+        fi
+      else
+        echo "ERROR: Unsupported Hadoop major version"
+        return 1
+      fi
+      ;;
+    # 1.3.1 - 1.6.2
+    1\.[3-6]\.[0-2])
+      if [[ "$HADOOP_MAJOR_VERSION" == "1" ]]; then
+        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-hadoop1.tgz
+      elif [[ "$HADOOP_MAJOR_VERSION" == "2" ]]; then
+        wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-cdh4.tgz
+      elif [[ "$HADOOP_MAJOR_VERSION" == "yarn" ]]; then
+        if [[ "$HADOOP_MINOR_VERSION" == "2.4" ]]; then
+          wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-hadoop2.4.tgz
+        elif [[ "$HADOOP_MINOR_VERSION" == "2.6" ]]; then
+          wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-hadoop2.6.tgz
+        else
+          echo "ERROR: Unsupported Hadoop minor version"
+          return 1
+        fi
+      else
+        echo "ERROR: Unsupported Hadoop major version"
+        return 1
+      fi
+      ;;
+    # 2.0.0 - 2.0.1
+    2\.0\.[0-1]|2\.0\.0-preview)
+      if [[ "$HADOOP_MAJOR_VERSION" == "yarn" ]]; then
+        if [[ "$HADOOP_MINOR_VERSION" == "2.4" ]]; then
+          wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-hadoop2.4.tgz
+        elif [[ "$HADOOP_MINOR_VERSION" == "2.6" ]]; then
+          wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-hadoop2.6.tgz
+        elif [[ "$HADOOP_MINOR_VERSION" == "2.7" ]]; then
+          wget http://s3.amazonaws.com/spark-related-packages/spark-$SPARK_VERSION-bin-hadoop2.7.tgz
+        else
+          echo "ERROR: Unsupported Hadoop minor version"
+          return 1
+        fi
+      else
+        echo "ERROR: Unsupported Hadoop major version"
+        return 1
+      fi
+      ;;
+    *)
+      echo "ERROR: Unknown Spark version"
+      return 1
+      ;;
+  esac
 
   echo "Unpacking Spark"
   tar xvzf spark-*.tgz > /tmp/spark-ec2_spark.log
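Note (not part of the patch): the case ladder above and the `validate_spark_hadoop_version()` checks in the `spark_ec2.py` hunk below encode the same availability table for prebuilt packages. Here is a hedged Python sketch of that table, illustrative only. It compares versions as numeric tuples rather than as strings, since the lexicographic comparisons used in the patch would misorder a hypothetical version such as `1.10.0` relative to `1.3.1`.

```python
def version_tuple(v):
    # "1.3.1" -> (1, 3, 1); "2.0.0-preview" -> (2, 0, 0).
    return tuple(int(part) for part in v.split("-")[0].split("."))

# (min_spark, max_spark) -> Hadoop flavor -> package suffix, per the case ladder above.
PREBUILT = [
    (("0.7.3", "0.7.3"), {"1": "prebuilt-hadoop1", "2": "prebuilt-cdh4"}),
    (("0.8.0", "0.9.0"), {"1": "incubating-bin-hadoop1", "2": "incubating-bin-cdh4"}),
    (("0.9.1", "1.0.2"), {"1": "bin-hadoop1", "2": "bin-cdh4"}),
    (("1.1.0", "1.3.0"), {"1": "bin-hadoop1", "2": "bin-cdh4", "yarn": "bin-hadoop2.4"}),
    (("1.3.1", "1.6.2"), {"1": "bin-hadoop1", "2": "bin-cdh4", "yarn": "bin-hadoop2.4 or 2.6"}),
    (("2.0.0", "2.0.1"), {"yarn": "bin-hadoop2.4, 2.6 or 2.7"}),
]

def suffix_for(spark_version, hadoop_flavor):
    sv = version_tuple(spark_version)
    for (lo, hi), flavors in PREBUILT:
        if version_tuple(lo) <= sv <= version_tuple(hi):
            return flavors.get(hadoop_flavor)  # None -> unsupported combination
    return None  # unknown Spark version

assert suffix_for("1.6.2", "yarn") == "bin-hadoop2.4 or 2.6"
assert suffix_for("2.0.1", "1") is None
```

If the version set ever grows double-digit components, a tuple comparison like this is the safer basis for the checks below.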
diff --git a/spark_ec2.py b/spark_ec2.py
index 075c863d..67c1e2ff 100644
--- a/spark_ec2.py
+++ b/spark_ec2.py
@@ -82,6 +82,12 @@
     "2.0.1"
 ])
 
+VALID_HADOOP_MINOR_VERSIONS = set([
+    "2.4",
+    "2.6",
+    "2.7"
+])
+
 SPARK_TACHYON_MAP = {
     "1.0.0": "0.4.1",
     "1.0.1": "0.4.1",
@@ -241,7 +247,11 @@ def parse_args():
     parser.add_option(
         "--hadoop-major-version", default="yarn",
         help="Major version of Hadoop. Valid options are 1 (Hadoop 1.0.4), 2 (CDH 4.2.0), yarn " +
-             "(Hadoop 2.4.0) (default: %default)")
+             "(Hadoop 2.x) (default: %default)")
+    parser.add_option(
+        "--hadoop-minor-version", default="2.4",
+        help="Minor version of Hadoop. Valid options are 2.4 (Hadoop 2.4.0), 2.6 (Hadoop 2.6.5) and 2.7 (Hadoop 2.7.3). " +
+             "This only has an effect if yarn is specified as the Hadoop major version (default: %default)")
     parser.add_option(
         "-D", metavar="[ADDRESS:]PORT", dest="proxy_port",
         help="Use SSH dynamic port forwarding to create a SOCKS proxy at " +
@@ -371,18 +381,42 @@ def get_or_make_group(conn, name, vpc_id):
     print("Creating security group " + name)
     return conn.create_security_group(name, "Spark EC2 group", vpc_id)
 
+
-def validate_spark_hadoop_version(spark_version, hadoop_version):
+def validate_spark_hadoop_version(spark_version, hadoop_version, hadoop_minor_version):
     if "." in spark_version:
-        parts = spark_version.split(".")
-        if parts[0].isdigit():
-            spark_major_version = float(parts[0])
-            if spark_major_version > 1.0 and hadoop_version != "yarn":
-                print("Spark version: {v}, does not support Hadoop version: {hv}".
-                      format(v=spark_version, hv=hadoop_version), file=stderr)
-                sys.exit(1)
+        if hadoop_version == "1" or hadoop_version == "2":
+            if spark_version >= "2.0.0":
+                print("Spark version {v} does not support Hadoop major version {hv}".
+                      format(v=spark_version, hv=hadoop_version), file=stderr)
+                sys.exit(1)
+        elif hadoop_version == "yarn":
+            if spark_version < "1.1.0":
+                print("Spark version {v} does not support Hadoop major version {hv}".
+                      format(v=spark_version, hv=hadoop_version), file=stderr)
+                sys.exit(1)
+
+            if hadoop_minor_version not in VALID_HADOOP_MINOR_VERSIONS:
+                print("Invalid Hadoop minor version: {hm}, supported versions: {sv}".
+                      format(hm=hadoop_minor_version,
+                             sv=", ".join(sorted(VALID_HADOOP_MINOR_VERSIONS))), file=stderr)
+                sys.exit(1)
+
+            if hadoop_minor_version == "2.6" and spark_version < "1.3.1":
+                print("Spark version {v} does not support Hadoop minor version {hm}".
+                      format(v=spark_version, hm=hadoop_minor_version), file=stderr)
+                sys.exit(1)
+
+            if hadoop_minor_version == "2.7" and spark_version < "2.0.0":
+                print("Spark version {v} does not support Hadoop minor version {hm}".
+                      format(v=spark_version, hm=hadoop_minor_version), file=stderr)
+                sys.exit(1)
+        else:
+            print("Invalid Hadoop major version: {hv}".format(hv=hadoop_version), file=stderr)
+            sys.exit(1)
     else:
         print("Invalid Spark version: {v}".format(v=spark_version), file=stderr)
         sys.exit(1)
+
 
 def get_validate_spark_version(version, repo):
     if "." in version:
@@ -1086,7 +1120,7 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules):
     if "." in opts.spark_version:
         # Pre-built Spark deploy
         spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo)
-        validate_spark_hadoop_version(spark_v, opts.hadoop_major_version)
+        validate_spark_hadoop_version(spark_v, opts.hadoop_major_version, opts.hadoop_minor_version)
         tachyon_v = get_tachyon_version(spark_v)
     else:
         # Spark-only custom deploy
@@ -1113,6 +1147,7 @@ def deploy_files(conn, root_dir, opts, master_nodes, slave_nodes, modules):
         "spark_version": spark_v,
         "tachyon_version": tachyon_v,
         "hadoop_major_version": opts.hadoop_major_version,
+        "hadoop_minor_version": opts.hadoop_minor_version,
         "spark_worker_instances": worker_instances_str,
         "spark_master_opts": opts.master_opts
     }
@@ -1297,7 +1332,7 @@ def real_main():
 
     # Input parameter validation
     spark_v = get_validate_spark_version(opts.spark_version, opts.spark_git_repo)
-    validate_spark_hadoop_version(spark_v, opts.hadoop_major_version)
+    validate_spark_hadoop_version(spark_v, opts.hadoop_major_version, opts.hadoop_minor_version)
 
     if opts.wait is not None:
         # NOTE: DeprecationWarnings are silent in 2.7+ by default.
diff --git a/tachyon/init.sh b/tachyon/init.sh
index d5f1e481..b01c71b1 100755
--- a/tachyon/init.sh
+++ b/tachyon/init.sh
@@ -13,6 +13,9 @@
 then
   # Not yet supported
   echo "Tachyon git hashes are not yet supported. Please specify a Tachyon release version."
 # Pre-package tachyon version
+elif [[ "$HADOOP_MAJOR_VERSION" == "yarn" || "$SPARK_VERSION" > "2" ]]
+then
+  echo "Tachyon is not supported with yarn or Spark 2.0.0 and newer."
 else
   case "$TACHYON_VERSION" in
     0.3.0)