/**
 * Stand-alone fallback downloader for {@code maven-wrapper.jar}.
 *
 * <p>Invoked by the {@code mvnw} script when neither {@code wget} nor {@code curl} is
 * available. It takes the project base directory as its single argument, resolves the
 * download URL (the {@code wrapperUrl} entry of {@code .mvn/wrapper/maven-wrapper.properties}
 * if present, otherwise the built-in default) and stores the jar under {@code .mvn/wrapper/}.
 * The process exits with status 0 on success and 1 on any failure.
 */
public class MavenWrapperDownloader {

    private static final String WRAPPER_VERSION = "0.5.6";

    /**
     * Default URL to download the maven-wrapper.jar from, if no 'wrapperUrl' is provided.
     */
    private static final String DEFAULT_DOWNLOAD_URL = "https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/"
        + WRAPPER_VERSION + "/maven-wrapper-" + WRAPPER_VERSION + ".jar";

    /**
     * Path to the maven-wrapper.properties file, which might contain a wrapperUrl property to
     * use instead of the default one.
     */
    private static final String MAVEN_WRAPPER_PROPERTIES_PATH =
            ".mvn/wrapper/maven-wrapper.properties";

    /**
     * Path where the maven-wrapper.jar will be saved to.
     */
    private static final String MAVEN_WRAPPER_JAR_PATH =
            ".mvn/wrapper/maven-wrapper.jar";

    /**
     * Name of the property which should be used to override the default download url for the wrapper.
     */
    private static final String PROPERTY_NAME_WRAPPER_URL = "wrapperUrl";

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the project base directory that contains {@code .mvn/wrapper}
     */
    public static void main(String[] args) {
        System.out.println("- Downloader started");
        if (args.length < 1) {
            // Previously this surfaced as an ArrayIndexOutOfBoundsException stack trace.
            System.out.println("- ERROR missing base directory argument");
            System.exit(1);
        }
        File baseDirectory = new File(args[0]);
        System.out.println("- Using base directory: " + baseDirectory.getAbsolutePath());

        // If the maven-wrapper.properties exists, read it and check if it contains a custom
        // wrapperUrl parameter.
        File mavenWrapperPropertyFile = new File(baseDirectory, MAVEN_WRAPPER_PROPERTIES_PATH);
        String url = DEFAULT_DOWNLOAD_URL;
        if (mavenWrapperPropertyFile.exists()) {
            // try-with-resources closes the stream on every path, including a failed load().
            try (FileInputStream propertiesStream = new FileInputStream(mavenWrapperPropertyFile)) {
                Properties mavenWrapperProperties = new Properties();
                mavenWrapperProperties.load(propertiesStream);
                url = mavenWrapperProperties.getProperty(PROPERTY_NAME_WRAPPER_URL, url);
            } catch (IOException e) {
                // Best effort: fall back to whatever URL we have so far.
                System.out.println("- ERROR loading '" + MAVEN_WRAPPER_PROPERTIES_PATH + "'");
            }
        }
        System.out.println("- Downloading from: " + url);

        File outputFile = new File(baseDirectory.getAbsolutePath(), MAVEN_WRAPPER_JAR_PATH);
        if (!outputFile.getParentFile().exists()) {
            if (!outputFile.getParentFile().mkdirs()) {
                // Not fatal here: the download below fails (and exits 1) if the directory is unusable.
                System.out.println(
                        "- ERROR creating output directory '" + outputFile.getParentFile().getAbsolutePath() + "'");
            }
        }
        System.out.println("- Downloading to: " + outputFile.getAbsolutePath());
        try {
            downloadFileFromURL(url, outputFile);
            System.out.println("Done");
            System.exit(0);
        } catch (Throwable e) {
            System.out.println("- Error downloading");
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Copies the content behind {@code urlString} into {@code destination}.
     *
     * <p>When both {@code MVNW_USERNAME} and {@code MVNW_PASSWORD} are set in the environment,
     * they are installed as the JVM-wide default {@link Authenticator} before connecting.
     *
     * @param urlString   URL to read from
     * @param destination file to (over)write with the downloaded bytes
     * @throws Exception if the URL is malformed, cannot be opened, or the file cannot be written
     */
    private static void downloadFileFromURL(String urlString, File destination) throws Exception {
        if (System.getenv("MVNW_USERNAME") != null && System.getenv("MVNW_PASSWORD") != null) {
            String username = System.getenv("MVNW_USERNAME");
            char[] password = System.getenv("MVNW_PASSWORD").toCharArray();
            Authenticator.setDefault(new Authenticator() {
                @Override
                protected PasswordAuthentication getPasswordAuthentication() {
                    return new PasswordAuthentication(username, password);
                }
            });
        }
        URL website = new URL(urlString);
        // The source is opened first so an unreachable URL never truncates an existing destination.
        // Both resources are closed even if the transfer throws (the original leaked them).
        try (ReadableByteChannel source = Channels.newChannel(website.openStream());
             FileOutputStream sink = new FileOutputStream(destination)) {
            sink.getChannel().transferFrom(source, 0, Long.MAX_VALUE);
        }
    }

}
+distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.8.1/apache-maven-3.8.1-bin.zip +wrapperUrl=https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar diff --git a/adamic-adar/dependency-reduced-pom.xml b/adamic-adar/dependency-reduced-pom.xml new file mode 100644 index 0000000..2c71361 --- /dev/null +++ b/adamic-adar/dependency-reduced-pom.xml @@ -0,0 +1,341 @@ + + + 4.0.0 + org.hua + adamicadar + AdamicAdar + 0.1 + http://maven.apache.org + + + + maven-compiler-plugin + 2.5.1 + + 8 + 8 + + + + maven-shade-plugin + 2.3 + + + package + + shade + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + + + + + + junit + junit + 4.8.2 + test + + + org.apache.hadoop + hadoop-client + 3.2.0 + provided + + + servlet-api + javax.servlet + + + hadoop-common + org.apache.hadoop + + + hadoop-hdfs-client + org.apache.hadoop + + + hadoop-yarn-api + org.apache.hadoop + + + hadoop-yarn-client + org.apache.hadoop + + + hadoop-mapreduce-client-core + org.apache.hadoop + + + hadoop-mapreduce-client-jobclient + org.apache.hadoop + + + hadoop-annotations + org.apache.hadoop + + + + + org.apache.spark + spark-core_2.13 + 3.2.0 + provided + + + scala-parallel-collections_2.13 + org.scala-lang.modules + + + avro + org.apache.avro + + + avro-mapred + org.apache.avro + + + chill_2.13 + com.twitter + + + chill-java + com.twitter + + + xbean-asm9-shaded + org.apache.xbean + + + hadoop-client-api + org.apache.hadoop + + + hadoop-client-runtime + org.apache.hadoop + + + spark-launcher_2.13 + org.apache.spark + + + spark-kvstore_2.13 + org.apache.spark + + + spark-network-common_2.13 + org.apache.spark + + + spark-network-shuffle_2.13 + org.apache.spark + + + spark-unsafe_2.13 + org.apache.spark + + + activation + javax.activation + + + curator-recipes + org.apache.curator + + + zookeeper + org.apache.zookeeper + + + jakarta.servlet-api + jakarta.servlet + + + commons-codec + commons-codec + + + 
commons-lang3 + org.apache.commons + + + commons-math3 + org.apache.commons + + + commons-text + org.apache.commons + + + commons-io + commons-io + + + commons-collections + commons-collections + + + slf4j-api + org.slf4j + + + jul-to-slf4j + org.slf4j + + + jcl-over-slf4j + org.slf4j + + + log4j + log4j + + + slf4j-log4j12 + org.slf4j + + + compress-lzf + com.ning + + + snappy-java + org.xerial.snappy + + + lz4-java + org.lz4 + + + zstd-jni + com.github.luben + + + RoaringBitmap + org.roaringbitmap + + + commons-net + commons-net + + + scala-xml_2.13 + org.scala-lang.modules + + + scala-library + org.scala-lang + + + scala-reflect + org.scala-lang + + + json4s-jackson_2.13 + org.json4s + + + jersey-client + org.glassfish.jersey.core + + + jersey-common + org.glassfish.jersey.core + + + jersey-server + org.glassfish.jersey.core + + + jersey-container-servlet + org.glassfish.jersey.containers + + + jersey-container-servlet-core + org.glassfish.jersey.containers + + + jersey-hk2 + org.glassfish.jersey.inject + + + netty-all + io.netty + + + stream + com.clearspring.analytics + + + metrics-core + io.dropwizard.metrics + + + metrics-jvm + io.dropwizard.metrics + + + metrics-json + io.dropwizard.metrics + + + metrics-graphite + io.dropwizard.metrics + + + metrics-jmx + io.dropwizard.metrics + + + jackson-module-scala_2.13 + com.fasterxml.jackson.module + + + ivy + org.apache.ivy + + + oro + oro + + + pyrolite + net.razorvine + + + py4j + net.sf.py4j + + + spark-tags_2.13 + org.apache.spark + + + commons-crypto + org.apache.commons + + + unused + org.spark-project.spark + + + + + + UTF-8 + + + diff --git a/adamic-adar/mvnw b/adamic-adar/mvnw new file mode 100755 index 0000000..a16b543 --- /dev/null +++ b/adamic-adar/mvnw @@ -0,0 +1,310 @@ +#!/bin/sh +# ---------------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# ---------------------------------------------------------------------------- + +# ---------------------------------------------------------------------------- +# Maven Start Up Batch script +# +# Required ENV vars: +# ------------------ +# JAVA_HOME - location of a JDK home dir +# +# Optional ENV vars +# ----------------- +# M2_HOME - location of maven2's installed home dir +# MAVEN_OPTS - parameters passed to the Java VM when running Maven +# e.g. to debug Maven itself, use +# set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 +# MAVEN_SKIP_RC - flag to disable loading of mavenrc files +# ---------------------------------------------------------------------------- + +if [ -z "$MAVEN_SKIP_RC" ] ; then + + if [ -f /etc/mavenrc ] ; then + . /etc/mavenrc + fi + + if [ -f "$HOME/.mavenrc" ] ; then + . "$HOME/.mavenrc" + fi + +fi + +# OS specific support. $var _must_ be set to either true or false. 
+cygwin=false; +darwin=false; +mingw=false +case "`uname`" in + CYGWIN*) cygwin=true ;; + MINGW*) mingw=true;; + Darwin*) darwin=true + # Use /usr/libexec/java_home if available, otherwise fall back to /Library/Java/Home + # See https://developer.apple.com/library/mac/qa/qa1170/_index.html + if [ -z "$JAVA_HOME" ]; then + if [ -x "/usr/libexec/java_home" ]; then + export JAVA_HOME="`/usr/libexec/java_home`" + else + export JAVA_HOME="/Library/Java/Home" + fi + fi + ;; +esac + +if [ -z "$JAVA_HOME" ] ; then + if [ -r /etc/gentoo-release ] ; then + JAVA_HOME=`java-config --jre-home` + fi +fi + +if [ -z "$M2_HOME" ] ; then + ## resolve links - $0 may be a link to maven's home + PRG="$0" + + # need this for relative symlinks + while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG="`dirname "$PRG"`/$link" + fi + done + + saveddir=`pwd` + + M2_HOME=`dirname "$PRG"`/.. + + # make it fully qualified + M2_HOME=`cd "$M2_HOME" && pwd` + + cd "$saveddir" + # echo Using m2 at $M2_HOME +fi + +# For Cygwin, ensure paths are in UNIX format before anything is touched +if $cygwin ; then + [ -n "$M2_HOME" ] && + M2_HOME=`cygpath --unix "$M2_HOME"` + [ -n "$JAVA_HOME" ] && + JAVA_HOME=`cygpath --unix "$JAVA_HOME"` + [ -n "$CLASSPATH" ] && + CLASSPATH=`cygpath --path --unix "$CLASSPATH"` +fi + +# For Mingw, ensure paths are in UNIX format before anything is touched +if $mingw ; then + [ -n "$M2_HOME" ] && + M2_HOME="`(cd "$M2_HOME"; pwd)`" + [ -n "$JAVA_HOME" ] && + JAVA_HOME="`(cd "$JAVA_HOME"; pwd)`" +fi + +if [ -z "$JAVA_HOME" ]; then + javaExecutable="`which javac`" + if [ -n "$javaExecutable" ] && ! [ "`expr \"$javaExecutable\" : '\([^ ]*\)'`" = "no" ]; then + # readlink(1) is not available as standard on Solaris 10. + readLink=`which readlink` + if [ ! 
`expr "$readLink" : '\([^ ]*\)'` = "no" ]; then + if $darwin ; then + javaHome="`dirname \"$javaExecutable\"`" + javaExecutable="`cd \"$javaHome\" && pwd -P`/javac" + else + javaExecutable="`readlink -f \"$javaExecutable\"`" + fi + javaHome="`dirname \"$javaExecutable\"`" + javaHome=`expr "$javaHome" : '\(.*\)/bin'` + JAVA_HOME="$javaHome" + export JAVA_HOME + fi + fi +fi + +if [ -z "$JAVACMD" ] ; then + if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + else + JAVACMD="`which java`" + fi +fi + +if [ ! -x "$JAVACMD" ] ; then + echo "Error: JAVA_HOME is not defined correctly." >&2 + echo " We cannot execute $JAVACMD" >&2 + exit 1 +fi + +if [ -z "$JAVA_HOME" ] ; then + echo "Warning: JAVA_HOME environment variable is not set." +fi + +CLASSWORLDS_LAUNCHER=org.codehaus.plexus.classworlds.launcher.Launcher + +# traverses directory structure from process work directory to filesystem root +# first directory with .mvn subdirectory is considered project base directory +find_maven_basedir() { + + if [ -z "$1" ] + then + echo "Path not specified to find_maven_basedir" + return 1 + fi + + basedir="$1" + wdir="$1" + while [ "$wdir" != '/' ] ; do + if [ -d "$wdir"/.mvn ] ; then + basedir=$wdir + break + fi + # workaround for JBEAP-8937 (on Solaris 10/Sparc) + if [ -d "${wdir}" ]; then + wdir=`cd "$wdir/.."; pwd` + fi + # end of workaround + done + echo "${basedir}" +} + +# concatenates all lines of a file +concat_lines() { + if [ -f "$1" ]; then + echo "$(tr -s '\n' ' ' < "$1")" + fi +} + +BASE_DIR=`find_maven_basedir "$(pwd)"` +if [ -z "$BASE_DIR" ]; then + exit 1; +fi + +########################################################################################## +# Extension to allow automatically downloading the maven-wrapper.jar from Maven-central +# This allows using the maven wrapper in projects that prohibit 
checking in binary data. +########################################################################################## +if [ -r "$BASE_DIR/.mvn/wrapper/maven-wrapper.jar" ]; then + if [ "$MVNW_VERBOSE" = true ]; then + echo "Found .mvn/wrapper/maven-wrapper.jar" + fi +else + if [ "$MVNW_VERBOSE" = true ]; then + echo "Couldn't find .mvn/wrapper/maven-wrapper.jar, downloading it ..." + fi + if [ -n "$MVNW_REPOURL" ]; then + jarUrl="$MVNW_REPOURL/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar" + else + jarUrl="https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar" + fi + while IFS="=" read key value; do + case "$key" in (wrapperUrl) jarUrl="$value"; break ;; + esac + done < "$BASE_DIR/.mvn/wrapper/maven-wrapper.properties" + if [ "$MVNW_VERBOSE" = true ]; then + echo "Downloading from: $jarUrl" + fi + wrapperJarPath="$BASE_DIR/.mvn/wrapper/maven-wrapper.jar" + if $cygwin; then + wrapperJarPath=`cygpath --path --windows "$wrapperJarPath"` + fi + + if command -v wget > /dev/null; then + if [ "$MVNW_VERBOSE" = true ]; then + echo "Found wget ... using wget" + fi + if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then + wget "$jarUrl" -O "$wrapperJarPath" + else + wget --http-user=$MVNW_USERNAME --http-password=$MVNW_PASSWORD "$jarUrl" -O "$wrapperJarPath" + fi + elif command -v curl > /dev/null; then + if [ "$MVNW_VERBOSE" = true ]; then + echo "Found curl ... using curl" + fi + if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then + curl -o "$wrapperJarPath" "$jarUrl" -f + else + curl --user $MVNW_USERNAME:$MVNW_PASSWORD -o "$wrapperJarPath" "$jarUrl" -f + fi + + else + if [ "$MVNW_VERBOSE" = true ]; then + echo "Falling back to using Java to download" + fi + javaClass="$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.java" + # For Cygwin, switch paths to Windows format before running javac + if $cygwin; then + javaClass=`cygpath --path --windows "$javaClass"` + fi + if [ -e "$javaClass" ]; then + if [ ! 
-e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then + if [ "$MVNW_VERBOSE" = true ]; then + echo " - Compiling MavenWrapperDownloader.java ..." + fi + # Compiling the Java class + ("$JAVA_HOME/bin/javac" "$javaClass") + fi + if [ -e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then + # Running the downloader + if [ "$MVNW_VERBOSE" = true ]; then + echo " - Running MavenWrapperDownloader.java ..." + fi + ("$JAVA_HOME/bin/java" -cp .mvn/wrapper MavenWrapperDownloader "$MAVEN_PROJECTBASEDIR") + fi + fi + fi +fi +########################################################################################## +# End of extension +########################################################################################## + +export MAVEN_PROJECTBASEDIR=${MAVEN_BASEDIR:-"$BASE_DIR"} +if [ "$MVNW_VERBOSE" = true ]; then + echo $MAVEN_PROJECTBASEDIR +fi +MAVEN_OPTS="$(concat_lines "$MAVEN_PROJECTBASEDIR/.mvn/jvm.config") $MAVEN_OPTS" + +# For Cygwin, switch paths to Windows format before running java +if $cygwin; then + [ -n "$M2_HOME" ] && + M2_HOME=`cygpath --path --windows "$M2_HOME"` + [ -n "$JAVA_HOME" ] && + JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` + [ -n "$CLASSPATH" ] && + CLASSPATH=`cygpath --path --windows "$CLASSPATH"` + [ -n "$MAVEN_PROJECTBASEDIR" ] && + MAVEN_PROJECTBASEDIR=`cygpath --path --windows "$MAVEN_PROJECTBASEDIR"` +fi + +# Provide a "standardized" way to retrieve the CLI args that will +# work with both Windows and non-Windows executions. 
+MAVEN_CMD_LINE_ARGS="$MAVEN_CONFIG $@" +export MAVEN_CMD_LINE_ARGS + +WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain + +exec "$JAVACMD" \ + $MAVEN_OPTS \ + -classpath "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar" \ + "-Dmaven.home=${M2_HOME}" "-Dmaven.multiModuleProjectDirectory=${MAVEN_PROJECTBASEDIR}" \ + ${WRAPPER_LAUNCHER} $MAVEN_CONFIG "$@" diff --git a/adamic-adar/pom.xml b/adamic-adar/pom.xml new file mode 100644 index 0000000..65631d3 --- /dev/null +++ b/adamic-adar/pom.xml @@ -0,0 +1,95 @@ + + 4.0.0 + + org.hua + adamicadar + 0.1 + jar + + AdamicAdar + http://maven.apache.org + + + UTF-8 + + + + + junit + junit + 4.8.2 + test + + + org.apache.hadoop + hadoop-client + 3.2.0 + + + javax.servlet + servlet-api + + + provided + + + org.apache.spark + spark-core_2.13 + 3.2.0 + provided + + + com.google.guava + guava + 11.0.2 + + + com.fasterxml.jackson.core + jackson-databind + 2.12.0 + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 2.5.1 + + 8 + 8 + + + + org.apache.maven.plugins + maven-shade-plugin + 2.3 + + + + package + + shade + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + + + + + + diff --git a/adamic-adar/src/main/java/org/spark/AdamicAdar.java b/adamic-adar/src/main/java/org/spark/AdamicAdar.java new file mode 100644 index 0000000..686e712 --- /dev/null +++ b/adamic-adar/src/main/java/org/spark/AdamicAdar.java @@ -0,0 +1,139 @@ +package org.spark; + +import java.util.*; +import java.util.regex.Pattern; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; + +import scala.Tuple2; + +public class AdamicAdar { + + private static final Pattern SPACE = Pattern.compile("[ \\t\\x0B\\f\\r]+"); + + public static List> adamicAdar(JavaRDD lines, Integer numOfDisplayedScores) { + + //create an RDD of edges containing both (a, b) and (b, a) + JavaPairRDD edges = 
lines.mapToPair((s) -> { + String[] tokens = SPACE.split(s); + + return new Tuple2<>(tokens[0], tokens[1]); + + }); + + edges.cache(); + + //1) mapToPair + // transform the edges to instances of (node, 1) where "1" corresponds to 1 neighbor + //2) reduceByKey + //create pairs (node, total neighbors) + JavaPairRDD nodesWithNeighborsCount = edges.mapToPair((s) -> { + return (new Tuple2<>(s._1(), 1)); + }).reduceByKey((v1, v2) -> v1 + v2); + + //1) + //join the edges with a copy of themselves to create edges -> + //-> of nodes connected to the neighbors of their neighbors + // output is a set of pairs of (common-neighbor, [node1, node2]) + //2) + //this set of pairs also contain: + // 1. -> pairs of nodes with the type of (a, a) + // 2. -> all the reverse edges of the resulting edges e.g. both (a, b) and (b, a) + // 3. -> pairs of already connected nodes (due to exsiting connections of nodes with common neighbors) + JavaPairRDD> joinedEdges = edges.join(edges); + + //caching + joinedEdges.cache(); + + //1) flatMapToPair + distinct + //remove the reversed edges and the edges of type (a, a) and keep only the distinct values of the result + //2) subtract + //subtract the existing edges from the new filtered result in order to keep the unconnected edges only + JavaPairRDD unconnectedEdgesRaw = joinedEdges.flatMapToPair(s -> { + ArrayList> arrayList = new ArrayList<>(); + if (Integer.parseInt(s._2()._1()) < Integer.parseInt(s._2()._2())) { + arrayList.add(s._2()); + } + return arrayList.iterator(); + }).distinct().subtract(edges); + + + //write the unconnected edges in the form of (node1 node2, -1) to allow the future join action (explained below) + //-1 is dummy info + JavaPairRDD unconnectedEdges = unconnectedEdgesRaw + .mapToPair(s -> new Tuple2<>(s._1() + " <-> " + s._2(), -1)); + + //joinedEdges contain pairs of type (common-neighbor, (node1, node2)) + //filter again the joinedEdges RDD by removing pairs of (a,a) and the reverse edges + JavaPairRDD> 
filteredJoinedEdges = joinedEdges.filter(s -> { + return Integer.parseInt(s._2()._1()) < Integer.parseInt(s._2()._2()); + }); + + //1) join + //join the filteredJoinedEdges with the nodesWithNeighborsCount to create a set of pairs like below: + //for an edge (a, b) with common neighbors n1, n2 we create pairs of (n1, [(a,b), n1-total-neighbors]), + // (n2, [(a, b), n2-total-neighbors]) + //2) mapToPair + // create a new set of pairs with type (node1 node2, common-neighbor-total-neighbors) + // for the above example it will produce (a b, n1-total-neighbors), (a b, n2-total-neighbors) + //groupByKey + // groupByKey will transform the flatMapToPair output to pairs of type -> + // -> (a b, [n1-total-neighbors, n2-total-neighbors]) + JavaPairRDD> adamicAdarParameters = filteredJoinedEdges.join(nodesWithNeighborsCount) + .mapToPair(s -> { + return new Tuple2<>(s._2()._1()._1() + " <-> " + s._2()._1()._2(), s._2()._2()); + }).groupByKey(); + + //now we are ready to compute adamic adar scores! + //1) join + // the join with the previously computed RDD of unconnected edges of type (unconnected-edge, -1) -> + //-> will let us keep the unconnected edges only in the final result + //2) mapValues + //in the mapValues we compute the AdamicAdar Scores + JavaPairRDD adamicAdarScores = adamicAdarParameters.join(unconnectedEdges).mapValues(v -> { + Iterable countsOfNeighborsOfCommonNeighbors = v._1(); + double adamicAdar = 0; + for (Integer n : countsOfNeighborsOfCommonNeighbors) { + adamicAdar += (1 / Math.log10(n)); + } + return adamicAdar; + }); + + //reverse the resulting tuples for the sortByKey below + JavaPairRDD reversedTuples = adamicAdarScores.mapToPair(s -> { + return new Tuple2<>(s._2(), s._1()); + }); + + //return the sorted result + return reversedTuples.sortByKey(false).take(numOfDisplayedScores); + } + + public static void main(String[] args) throws Exception { + + if (args.length < 2) { + System.err.println("Usage: AdamicAdar "); + System.exit(1); + } + + SparkConf 
sparkConf = new SparkConf().setAppName("AdamicAdar").setMaster("local[*]"); + JavaSparkContext sc = new JavaSparkContext(sparkConf); + + //exclude the lines containing comments + JavaRDD lines = sc.textFile(args[0]).filter(l -> !l.contains("#")); + + //calculate adamic adar scores for every unconnected edge of two nodes with at least one common neighbor + List> aaScores = adamicAdar(lines, Integer.parseInt(args[1])); + + //print the result + for (Tuple2 score : aaScores) { + System.out.println(String.format("%.5f", score._1()) + ", " + score._2()); + } + + sc.stop(); + + } + +} diff --git a/common-neighbors/.mvn/wrapper/MavenWrapperDownloader.java b/common-neighbors/.mvn/wrapper/MavenWrapperDownloader.java new file mode 100644 index 0000000..e76d1f3 --- /dev/null +++ b/common-neighbors/.mvn/wrapper/MavenWrapperDownloader.java @@ -0,0 +1,117 @@ +/* + * Copyright 2007-present the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +import java.net.*; +import java.io.*; +import java.nio.channels.*; +import java.util.Properties; + +public class MavenWrapperDownloader { + + private static final String WRAPPER_VERSION = "0.5.6"; + /** + * Default URL to download the maven-wrapper.jar from, if no 'downloadUrl' is provided. 
/**
 * Stand-alone fallback downloader for {@code maven-wrapper.jar}.
 *
 * <p>Invoked by the {@code mvnw} script when neither {@code wget} nor {@code curl} is
 * available. It takes the project base directory as its single argument, resolves the
 * download URL (the {@code wrapperUrl} entry of {@code .mvn/wrapper/maven-wrapper.properties}
 * if present, otherwise the built-in default) and stores the jar under {@code .mvn/wrapper/}.
 * The process exits with status 0 on success and 1 on any failure.
 */
public class MavenWrapperDownloader {

    private static final String WRAPPER_VERSION = "0.5.6";

    /**
     * Default URL to download the maven-wrapper.jar from, if no 'wrapperUrl' is provided.
     */
    private static final String DEFAULT_DOWNLOAD_URL = "https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/"
        + WRAPPER_VERSION + "/maven-wrapper-" + WRAPPER_VERSION + ".jar";

    /**
     * Path to the maven-wrapper.properties file, which might contain a wrapperUrl property to
     * use instead of the default one.
     */
    private static final String MAVEN_WRAPPER_PROPERTIES_PATH =
            ".mvn/wrapper/maven-wrapper.properties";

    /**
     * Path where the maven-wrapper.jar will be saved to.
     */
    private static final String MAVEN_WRAPPER_JAR_PATH =
            ".mvn/wrapper/maven-wrapper.jar";

    /**
     * Name of the property which should be used to override the default download url for the wrapper.
     */
    private static final String PROPERTY_NAME_WRAPPER_URL = "wrapperUrl";

    /**
     * Entry point.
     *
     * @param args {@code args[0]} is the project base directory that contains {@code .mvn/wrapper}
     */
    public static void main(String[] args) {
        System.out.println("- Downloader started");
        if (args.length < 1) {
            // Previously this surfaced as an ArrayIndexOutOfBoundsException stack trace.
            System.out.println("- ERROR missing base directory argument");
            System.exit(1);
        }
        File baseDirectory = new File(args[0]);
        System.out.println("- Using base directory: " + baseDirectory.getAbsolutePath());

        // If the maven-wrapper.properties exists, read it and check if it contains a custom
        // wrapperUrl parameter.
        File mavenWrapperPropertyFile = new File(baseDirectory, MAVEN_WRAPPER_PROPERTIES_PATH);
        String url = DEFAULT_DOWNLOAD_URL;
        if (mavenWrapperPropertyFile.exists()) {
            // try-with-resources closes the stream on every path, including a failed load().
            try (FileInputStream propertiesStream = new FileInputStream(mavenWrapperPropertyFile)) {
                Properties mavenWrapperProperties = new Properties();
                mavenWrapperProperties.load(propertiesStream);
                url = mavenWrapperProperties.getProperty(PROPERTY_NAME_WRAPPER_URL, url);
            } catch (IOException e) {
                // Best effort: fall back to whatever URL we have so far.
                System.out.println("- ERROR loading '" + MAVEN_WRAPPER_PROPERTIES_PATH + "'");
            }
        }
        System.out.println("- Downloading from: " + url);

        File outputFile = new File(baseDirectory.getAbsolutePath(), MAVEN_WRAPPER_JAR_PATH);
        if (!outputFile.getParentFile().exists()) {
            if (!outputFile.getParentFile().mkdirs()) {
                // Not fatal here: the download below fails (and exits 1) if the directory is unusable.
                System.out.println(
                        "- ERROR creating output directory '" + outputFile.getParentFile().getAbsolutePath() + "'");
            }
        }
        System.out.println("- Downloading to: " + outputFile.getAbsolutePath());
        try {
            downloadFileFromURL(url, outputFile);
            System.out.println("Done");
            System.exit(0);
        } catch (Throwable e) {
            System.out.println("- Error downloading");
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Copies the content behind {@code urlString} into {@code destination}.
     *
     * <p>When both {@code MVNW_USERNAME} and {@code MVNW_PASSWORD} are set in the environment,
     * they are installed as the JVM-wide default {@link Authenticator} before connecting.
     *
     * @param urlString   URL to read from
     * @param destination file to (over)write with the downloaded bytes
     * @throws Exception if the URL is malformed, cannot be opened, or the file cannot be written
     */
    private static void downloadFileFromURL(String urlString, File destination) throws Exception {
        if (System.getenv("MVNW_USERNAME") != null && System.getenv("MVNW_PASSWORD") != null) {
            String username = System.getenv("MVNW_USERNAME");
            char[] password = System.getenv("MVNW_PASSWORD").toCharArray();
            Authenticator.setDefault(new Authenticator() {
                @Override
                protected PasswordAuthentication getPasswordAuthentication() {
                    return new PasswordAuthentication(username, password);
                }
            });
        }
        URL website = new URL(urlString);
        // The source is opened first so an unreachable URL never truncates an existing destination.
        // Both resources are closed even if the transfer throws (the original leaked them).
        try (ReadableByteChannel source = Channels.newChannel(website.openStream());
             FileOutputStream sink = new FileOutputStream(destination)) {
            sink.getChannel().transferFrom(source, 0, Long.MAX_VALUE);
        }
    }

}
+distributionUrl=https://repo.maven.apache.org/maven2/org/apache/maven/apache-maven/3.8.1/apache-maven-3.8.1-bin.zip +wrapperUrl=https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar diff --git a/common-neighbors/dependency-reduced-pom.xml b/common-neighbors/dependency-reduced-pom.xml new file mode 100644 index 0000000..b20595e --- /dev/null +++ b/common-neighbors/dependency-reduced-pom.xml @@ -0,0 +1,341 @@ + + + 4.0.0 + org.hua + commonneighbors + CommonNeighbors + 0.1 + http://maven.apache.org + + + + maven-compiler-plugin + 2.5.1 + + 8 + 8 + + + + maven-shade-plugin + 2.3 + + + package + + shade + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + + + + + + junit + junit + 4.8.2 + test + + + org.apache.hadoop + hadoop-client + 3.2.0 + provided + + + servlet-api + javax.servlet + + + hadoop-common + org.apache.hadoop + + + hadoop-hdfs-client + org.apache.hadoop + + + hadoop-yarn-api + org.apache.hadoop + + + hadoop-yarn-client + org.apache.hadoop + + + hadoop-mapreduce-client-core + org.apache.hadoop + + + hadoop-mapreduce-client-jobclient + org.apache.hadoop + + + hadoop-annotations + org.apache.hadoop + + + + + org.apache.spark + spark-core_2.13 + 3.2.0 + provided + + + scala-parallel-collections_2.13 + org.scala-lang.modules + + + avro + org.apache.avro + + + avro-mapred + org.apache.avro + + + chill_2.13 + com.twitter + + + chill-java + com.twitter + + + xbean-asm9-shaded + org.apache.xbean + + + hadoop-client-api + org.apache.hadoop + + + hadoop-client-runtime + org.apache.hadoop + + + spark-launcher_2.13 + org.apache.spark + + + spark-kvstore_2.13 + org.apache.spark + + + spark-network-common_2.13 + org.apache.spark + + + spark-network-shuffle_2.13 + org.apache.spark + + + spark-unsafe_2.13 + org.apache.spark + + + activation + javax.activation + + + curator-recipes + org.apache.curator + + + zookeeper + org.apache.zookeeper + + + jakarta.servlet-api + jakarta.servlet + + + commons-codec + 
commons-codec + + + commons-lang3 + org.apache.commons + + + commons-math3 + org.apache.commons + + + commons-text + org.apache.commons + + + commons-io + commons-io + + + commons-collections + commons-collections + + + slf4j-api + org.slf4j + + + jul-to-slf4j + org.slf4j + + + jcl-over-slf4j + org.slf4j + + + log4j + log4j + + + slf4j-log4j12 + org.slf4j + + + compress-lzf + com.ning + + + snappy-java + org.xerial.snappy + + + lz4-java + org.lz4 + + + zstd-jni + com.github.luben + + + RoaringBitmap + org.roaringbitmap + + + commons-net + commons-net + + + scala-xml_2.13 + org.scala-lang.modules + + + scala-library + org.scala-lang + + + scala-reflect + org.scala-lang + + + json4s-jackson_2.13 + org.json4s + + + jersey-client + org.glassfish.jersey.core + + + jersey-common + org.glassfish.jersey.core + + + jersey-server + org.glassfish.jersey.core + + + jersey-container-servlet + org.glassfish.jersey.containers + + + jersey-container-servlet-core + org.glassfish.jersey.containers + + + jersey-hk2 + org.glassfish.jersey.inject + + + netty-all + io.netty + + + stream + com.clearspring.analytics + + + metrics-core + io.dropwizard.metrics + + + metrics-jvm + io.dropwizard.metrics + + + metrics-json + io.dropwizard.metrics + + + metrics-graphite + io.dropwizard.metrics + + + metrics-jmx + io.dropwizard.metrics + + + jackson-module-scala_2.13 + com.fasterxml.jackson.module + + + ivy + org.apache.ivy + + + oro + oro + + + pyrolite + net.razorvine + + + py4j + net.sf.py4j + + + spark-tags_2.13 + org.apache.spark + + + commons-crypto + org.apache.commons + + + unused + org.spark-project.spark + + + + + + UTF-8 + + + diff --git a/common-neighbors/mvnw b/common-neighbors/mvnw new file mode 100755 index 0000000..a16b543 --- /dev/null +++ b/common-neighbors/mvnw @@ -0,0 +1,310 @@ +#!/bin/sh +# ---------------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# ---------------------------------------------------------------------------- + +# ---------------------------------------------------------------------------- +# Maven Start Up Batch script +# +# Required ENV vars: +# ------------------ +# JAVA_HOME - location of a JDK home dir +# +# Optional ENV vars +# ----------------- +# M2_HOME - location of maven2's installed home dir +# MAVEN_OPTS - parameters passed to the Java VM when running Maven +# e.g. to debug Maven itself, use +# set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 +# MAVEN_SKIP_RC - flag to disable loading of mavenrc files +# ---------------------------------------------------------------------------- + +if [ -z "$MAVEN_SKIP_RC" ] ; then + + if [ -f /etc/mavenrc ] ; then + . /etc/mavenrc + fi + + if [ -f "$HOME/.mavenrc" ] ; then + . "$HOME/.mavenrc" + fi + +fi + +# OS specific support. $var _must_ be set to either true or false. 
+cygwin=false; +darwin=false; +mingw=false +case "`uname`" in + CYGWIN*) cygwin=true ;; + MINGW*) mingw=true;; + Darwin*) darwin=true + # Use /usr/libexec/java_home if available, otherwise fall back to /Library/Java/Home + # See https://developer.apple.com/library/mac/qa/qa1170/_index.html + if [ -z "$JAVA_HOME" ]; then + if [ -x "/usr/libexec/java_home" ]; then + export JAVA_HOME="`/usr/libexec/java_home`" + else + export JAVA_HOME="/Library/Java/Home" + fi + fi + ;; +esac + +if [ -z "$JAVA_HOME" ] ; then + if [ -r /etc/gentoo-release ] ; then + JAVA_HOME=`java-config --jre-home` + fi +fi + +if [ -z "$M2_HOME" ] ; then + ## resolve links - $0 may be a link to maven's home + PRG="$0" + + # need this for relative symlinks + while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG="`dirname "$PRG"`/$link" + fi + done + + saveddir=`pwd` + + M2_HOME=`dirname "$PRG"`/.. + + # make it fully qualified + M2_HOME=`cd "$M2_HOME" && pwd` + + cd "$saveddir" + # echo Using m2 at $M2_HOME +fi + +# For Cygwin, ensure paths are in UNIX format before anything is touched +if $cygwin ; then + [ -n "$M2_HOME" ] && + M2_HOME=`cygpath --unix "$M2_HOME"` + [ -n "$JAVA_HOME" ] && + JAVA_HOME=`cygpath --unix "$JAVA_HOME"` + [ -n "$CLASSPATH" ] && + CLASSPATH=`cygpath --path --unix "$CLASSPATH"` +fi + +# For Mingw, ensure paths are in UNIX format before anything is touched +if $mingw ; then + [ -n "$M2_HOME" ] && + M2_HOME="`(cd "$M2_HOME"; pwd)`" + [ -n "$JAVA_HOME" ] && + JAVA_HOME="`(cd "$JAVA_HOME"; pwd)`" +fi + +if [ -z "$JAVA_HOME" ]; then + javaExecutable="`which javac`" + if [ -n "$javaExecutable" ] && ! [ "`expr \"$javaExecutable\" : '\([^ ]*\)'`" = "no" ]; then + # readlink(1) is not available as standard on Solaris 10. + readLink=`which readlink` + if [ ! 
`expr "$readLink" : '\([^ ]*\)'` = "no" ]; then + if $darwin ; then + javaHome="`dirname \"$javaExecutable\"`" + javaExecutable="`cd \"$javaHome\" && pwd -P`/javac" + else + javaExecutable="`readlink -f \"$javaExecutable\"`" + fi + javaHome="`dirname \"$javaExecutable\"`" + javaHome=`expr "$javaHome" : '\(.*\)/bin'` + JAVA_HOME="$javaHome" + export JAVA_HOME + fi + fi +fi + +if [ -z "$JAVACMD" ] ; then + if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + else + JAVACMD="`which java`" + fi +fi + +if [ ! -x "$JAVACMD" ] ; then + echo "Error: JAVA_HOME is not defined correctly." >&2 + echo " We cannot execute $JAVACMD" >&2 + exit 1 +fi + +if [ -z "$JAVA_HOME" ] ; then + echo "Warning: JAVA_HOME environment variable is not set." +fi + +CLASSWORLDS_LAUNCHER=org.codehaus.plexus.classworlds.launcher.Launcher + +# traverses directory structure from process work directory to filesystem root +# first directory with .mvn subdirectory is considered project base directory +find_maven_basedir() { + + if [ -z "$1" ] + then + echo "Path not specified to find_maven_basedir" + return 1 + fi + + basedir="$1" + wdir="$1" + while [ "$wdir" != '/' ] ; do + if [ -d "$wdir"/.mvn ] ; then + basedir=$wdir + break + fi + # workaround for JBEAP-8937 (on Solaris 10/Sparc) + if [ -d "${wdir}" ]; then + wdir=`cd "$wdir/.."; pwd` + fi + # end of workaround + done + echo "${basedir}" +} + +# concatenates all lines of a file +concat_lines() { + if [ -f "$1" ]; then + echo "$(tr -s '\n' ' ' < "$1")" + fi +} + +BASE_DIR=`find_maven_basedir "$(pwd)"` +if [ -z "$BASE_DIR" ]; then + exit 1; +fi + +########################################################################################## +# Extension to allow automatically downloading the maven-wrapper.jar from Maven-central +# This allows using the maven wrapper in projects that prohibit 
checking in binary data. +########################################################################################## +if [ -r "$BASE_DIR/.mvn/wrapper/maven-wrapper.jar" ]; then + if [ "$MVNW_VERBOSE" = true ]; then + echo "Found .mvn/wrapper/maven-wrapper.jar" + fi +else + if [ "$MVNW_VERBOSE" = true ]; then + echo "Couldn't find .mvn/wrapper/maven-wrapper.jar, downloading it ..." + fi + if [ -n "$MVNW_REPOURL" ]; then + jarUrl="$MVNW_REPOURL/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar" + else + jarUrl="https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar" + fi + while IFS="=" read key value; do + case "$key" in (wrapperUrl) jarUrl="$value"; break ;; + esac + done < "$BASE_DIR/.mvn/wrapper/maven-wrapper.properties" + if [ "$MVNW_VERBOSE" = true ]; then + echo "Downloading from: $jarUrl" + fi + wrapperJarPath="$BASE_DIR/.mvn/wrapper/maven-wrapper.jar" + if $cygwin; then + wrapperJarPath=`cygpath --path --windows "$wrapperJarPath"` + fi + + if command -v wget > /dev/null; then + if [ "$MVNW_VERBOSE" = true ]; then + echo "Found wget ... using wget" + fi + if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then + wget "$jarUrl" -O "$wrapperJarPath" + else + wget --http-user=$MVNW_USERNAME --http-password=$MVNW_PASSWORD "$jarUrl" -O "$wrapperJarPath" + fi + elif command -v curl > /dev/null; then + if [ "$MVNW_VERBOSE" = true ]; then + echo "Found curl ... using curl" + fi + if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then + curl -o "$wrapperJarPath" "$jarUrl" -f + else + curl --user $MVNW_USERNAME:$MVNW_PASSWORD -o "$wrapperJarPath" "$jarUrl" -f + fi + + else + if [ "$MVNW_VERBOSE" = true ]; then + echo "Falling back to using Java to download" + fi + javaClass="$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.java" + # For Cygwin, switch paths to Windows format before running javac + if $cygwin; then + javaClass=`cygpath --path --windows "$javaClass"` + fi + if [ -e "$javaClass" ]; then + if [ ! 
-e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then + if [ "$MVNW_VERBOSE" = true ]; then + echo " - Compiling MavenWrapperDownloader.java ..." + fi + # Compiling the Java class + ("$JAVA_HOME/bin/javac" "$javaClass") + fi + if [ -e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then + # Running the downloader + if [ "$MVNW_VERBOSE" = true ]; then + echo " - Running MavenWrapperDownloader.java ..." + fi + ("$JAVA_HOME/bin/java" -cp .mvn/wrapper MavenWrapperDownloader "$MAVEN_PROJECTBASEDIR") + fi + fi + fi +fi +########################################################################################## +# End of extension +########################################################################################## + +export MAVEN_PROJECTBASEDIR=${MAVEN_BASEDIR:-"$BASE_DIR"} +if [ "$MVNW_VERBOSE" = true ]; then + echo $MAVEN_PROJECTBASEDIR +fi +MAVEN_OPTS="$(concat_lines "$MAVEN_PROJECTBASEDIR/.mvn/jvm.config") $MAVEN_OPTS" + +# For Cygwin, switch paths to Windows format before running java +if $cygwin; then + [ -n "$M2_HOME" ] && + M2_HOME=`cygpath --path --windows "$M2_HOME"` + [ -n "$JAVA_HOME" ] && + JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` + [ -n "$CLASSPATH" ] && + CLASSPATH=`cygpath --path --windows "$CLASSPATH"` + [ -n "$MAVEN_PROJECTBASEDIR" ] && + MAVEN_PROJECTBASEDIR=`cygpath --path --windows "$MAVEN_PROJECTBASEDIR"` +fi + +# Provide a "standardized" way to retrieve the CLI args that will +# work with both Windows and non-Windows executions. 
+MAVEN_CMD_LINE_ARGS="$MAVEN_CONFIG $@" +export MAVEN_CMD_LINE_ARGS + +WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain + +exec "$JAVACMD" \ + $MAVEN_OPTS \ + -classpath "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar" \ + "-Dmaven.home=${M2_HOME}" "-Dmaven.multiModuleProjectDirectory=${MAVEN_PROJECTBASEDIR}" \ + ${WRAPPER_LAUNCHER} $MAVEN_CONFIG "$@" diff --git a/common-neighbors/pom.xml b/common-neighbors/pom.xml new file mode 100644 index 0000000..9d79c88 --- /dev/null +++ b/common-neighbors/pom.xml @@ -0,0 +1,95 @@ + + 4.0.0 + + org.hua + commonneighbors + 0.1 + jar + + CommonNeighbors + http://maven.apache.org + + + UTF-8 + + + + + junit + junit + 4.8.2 + test + + + org.apache.hadoop + hadoop-client + 3.2.0 + + + javax.servlet + servlet-api + + + provided + + + org.apache.spark + spark-core_2.13 + 3.2.0 + provided + + + com.google.guava + guava + 11.0.2 + + + com.fasterxml.jackson.core + jackson-databind + 2.12.0 + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 2.5.1 + + 8 + 8 + + + + org.apache.maven.plugins + maven-shade-plugin + 2.3 + + + + package + + shade + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + + + + + + diff --git a/common-neighbors/src/main/java/org/spark/CommonNeighbors.java b/common-neighbors/src/main/java/org/spark/CommonNeighbors.java new file mode 100644 index 0000000..e3d2cda --- /dev/null +++ b/common-neighbors/src/main/java/org/spark/CommonNeighbors.java @@ -0,0 +1,105 @@ +package org.spark; + +import java.util.*; +import java.util.regex.Pattern; + +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaRDD; +import org.apache.spark.api.java.JavaSparkContext; + +import scala.Tuple2; + +public class CommonNeighbors { + + private static final Pattern SPACE = Pattern.compile("[ \\t\\x0B\\f\\r]+"); + + public static List> commonNeighbors(JavaRDD lines, Integer numOfDisplayedScores) { + + //create an RDD of edges 
containing both (a, b) and (b, a) + //also ignore lines with comments + JavaPairRDD edges = lines.flatMapToPair(s -> { + String[] tokens = s.split(SPACE.pattern()); + ArrayList> arrayList = new ArrayList<>(); + if (!s.contains("#")) { + arrayList.add(new Tuple2<>(tokens[0], tokens[1])); + } + return arrayList.iterator(); + }); + + //cache the result + edges.cache(); + + //1) + //join the edges with a copy of themselves to create edges -> + //-> of nodes connected to the neighbors of their neighbors + // output is a set of pairs of (common-neighbor, [node1, node2]) + //2) + //this is not yet the desired outcome because the result will also contain: + // 1. -> pairs of nodes with the type of (a, a) + // 2. -> all the reverse edges of the resulting edges e.g. both (a, b) and (b, a) + // 3. -> pairs of already connected nodes (due to exsiting connections of nodes with common neighbors) + JavaPairRDD> joinedEdges = edges.join(edges); + + //filter the previous result by removing all the (a, a) pairs + //remove also the reverse edges by keeping only (a,b) where a < b + JavaPairRDD tempResults = joinedEdges.flatMapToPair(s -> { + ArrayList> arrayList = new ArrayList<>(); + if (Integer.parseInt(s._2()._1()) < Integer.parseInt(s._2()._2())) { + arrayList.add(s._2()); + } + return arrayList.iterator(); + }); + + //1) subtract + //subtract the existing edges of the graph from the filtered remaining edges of the previous result -> + //-> in order to keep the edges of unconnected nodes only + //2) mapToPair + // every instance of a pair of unconnected nodes is equal to a common neighbor of them + // Because of that, map every instance with value "1" for the future reduction + //3) + // result will now contain pairs of (a<->b, 1) + JavaPairRDD unconnectedEdgeInstances = tempResults.subtract(edges).mapToPair(s -> { + return new Tuple2(s._1() + " <-> " + s._2(), 1); + }); + + //1) reduceByKey + //this is the final result containing pairs of (edge-of-unconnected-nodes, 
number-of-common-neighbors) + JavaPairRDD scores = unconnectedEdgeInstances.reduceByKey((v1, v2) -> v1 + v2); + + //reverse the resulting tuples a for the future sortByKey + JavaPairRDD reversedTuples = scores.mapToPair(s -> { + return new Tuple2<>(s._2(), s._1()); + }); + + //return the top scores of the sorted results + //numOfDisplayedScores defines the number of scores to return + return reversedTuples.sortByKey(false).take(numOfDisplayedScores); + } + + + public static void main(String[] args) throws Exception { + + if (args.length < 2) { + System.err.println("Usage: Arguments must be "); + System.exit(1); + } + + SparkConf sparkConf = new SparkConf().setAppName("Common Neighbors").setMaster("local[*]"); + JavaSparkContext sc = new JavaSparkContext(sparkConf); + + JavaRDD lines = sc.textFile(args[0]); + + //calculate common neighbors metric + List> cnScores = commonNeighbors(lines, Integer.parseInt(args[1])); + + //print the result + for (Tuple2 score : cnScores) { + System.out.println(score); + } + + sc.stop(); + + } + +} diff --git a/jaccard-coefficient/.mvn/wrapper/MavenWrapperDownloader.java b/jaccard-coefficient/.mvn/wrapper/MavenWrapperDownloader.java new file mode 100644 index 0000000..e76d1f3 --- /dev/null +++ b/jaccard-coefficient/.mvn/wrapper/MavenWrapperDownloader.java @@ -0,0 +1,117 @@ +/* + * Copyright 2007-present the original author or authors. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * https://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
/**
 * Downloads the {@code maven-wrapper.jar} into {@code .mvn/wrapper/} of a project.
 *
 * <p>Used by the {@code mvnw} script as a fallback when neither {@code wget} nor
 * {@code curl} is available.
 */
public class MavenWrapperDownloader {

    private static final String WRAPPER_VERSION = "0.5.6";

    /**
     * Default URL to download the maven-wrapper.jar from, if no 'wrapperUrl' is provided.
     */
    private static final String DEFAULT_DOWNLOAD_URL = "https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/"
        + WRAPPER_VERSION + "/maven-wrapper-" + WRAPPER_VERSION + ".jar";

    /**
     * Path to the maven-wrapper.properties file, which might contain a wrapperUrl property to
     * use instead of the default one.
     */
    private static final String MAVEN_WRAPPER_PROPERTIES_PATH =
            ".mvn/wrapper/maven-wrapper.properties";

    /**
     * Path where the maven-wrapper.jar will be saved to.
     */
    private static final String MAVEN_WRAPPER_JAR_PATH =
            ".mvn/wrapper/maven-wrapper.jar";

    /**
     * Name of the property which should be used to override the default download url for the wrapper.
     */
    private static final String PROPERTY_NAME_WRAPPER_URL = "wrapperUrl";

    /**
     * Resolves the download URL, downloads the jar and terminates the JVM.
     *
     * <p>Exits with status 0 on success and 1 if the download fails.
     *
     * @param args {@code args[0]} is the project base directory
     */
    public static void main(String args[]) {
        System.out.println("- Downloader started");
        File baseDirectory = new File(args[0]);
        System.out.println("- Using base directory: " + baseDirectory.getAbsolutePath());

        // If the maven-wrapper.properties exists, read it and check if it contains a custom
        // wrapperUrl parameter.
        File mavenWrapperPropertyFile = new File(baseDirectory, MAVEN_WRAPPER_PROPERTIES_PATH);
        String url = DEFAULT_DOWNLOAD_URL;
        if (mavenWrapperPropertyFile.exists()) {
            // try-with-resources closes the stream on every path.
            try (FileInputStream in = new FileInputStream(mavenWrapperPropertyFile)) {
                Properties mavenWrapperProperties = new Properties();
                mavenWrapperProperties.load(in);
                url = mavenWrapperProperties.getProperty(PROPERTY_NAME_WRAPPER_URL, url);
            } catch (IOException e) {
                // Best effort: keep whatever URL is currently selected and continue.
                System.out.println("- ERROR loading '" + MAVEN_WRAPPER_PROPERTIES_PATH + "'");
            }
        }
        System.out.println("- Downloading from: " + url);

        File outputFile = new File(baseDirectory.getAbsolutePath(), MAVEN_WRAPPER_JAR_PATH);
        File outputDirectory = outputFile.getParentFile();
        if (!outputDirectory.exists() && !outputDirectory.mkdirs()) {
            // Only reported; the download below is still attempted.
            System.out.println(
                    "- ERROR creating output directory '" + outputDirectory.getAbsolutePath() + "'");
        }
        System.out.println("- Downloading to: " + outputFile.getAbsolutePath());
        try {
            downloadFileFromURL(url, outputFile);
            System.out.println("Done");
            System.exit(0);
        } catch (Throwable e) {
            System.out.println("- Error downloading");
            e.printStackTrace();
            System.exit(1);
        }
    }

    /**
     * Copies the content behind {@code urlString} into {@code destination}.
     *
     * <p>If both {@code MVNW_USERNAME} and {@code MVNW_PASSWORD} are set in the
     * environment, they are installed as the default {@link Authenticator} credentials
     * before the connection is opened.
     *
     * @param urlString   URL to download from
     * @param destination file to write; an existing file is overwritten
     * @throws Exception if the URL is malformed or reading/writing fails
     */
    private static void downloadFileFromURL(String urlString, File destination) throws Exception {
        if (System.getenv("MVNW_USERNAME") != null && System.getenv("MVNW_PASSWORD") != null) {
            String username = System.getenv("MVNW_USERNAME");
            char[] password = System.getenv("MVNW_PASSWORD").toCharArray();
            Authenticator.setDefault(new Authenticator() {
                @Override
                protected PasswordAuthentication getPasswordAuthentication() {
                    return new PasswordAuthentication(username, password);
                }
            });
        }
        URL website = new URL(urlString);
        // Both resources are released even when the transfer fails midway;
        // they are closed in reverse order (file first, then the network channel).
        try (ReadableByteChannel rbc = Channels.newChannel(website.openStream());
             FileOutputStream fos = new FileOutputStream(destination)) {
            fos.getChannel().transferFrom(rbc, 0, Long.MAX_VALUE);
        }
    }

}
spark-core_2.13 + 3.2.0 + provided + + + scala-parallel-collections_2.13 + org.scala-lang.modules + + + avro + org.apache.avro + + + avro-mapred + org.apache.avro + + + chill_2.13 + com.twitter + + + chill-java + com.twitter + + + xbean-asm9-shaded + org.apache.xbean + + + hadoop-client-api + org.apache.hadoop + + + hadoop-client-runtime + org.apache.hadoop + + + spark-launcher_2.13 + org.apache.spark + + + spark-kvstore_2.13 + org.apache.spark + + + spark-network-common_2.13 + org.apache.spark + + + spark-network-shuffle_2.13 + org.apache.spark + + + spark-unsafe_2.13 + org.apache.spark + + + activation + javax.activation + + + curator-recipes + org.apache.curator + + + zookeeper + org.apache.zookeeper + + + jakarta.servlet-api + jakarta.servlet + + + commons-codec + commons-codec + + + commons-lang3 + org.apache.commons + + + commons-math3 + org.apache.commons + + + commons-text + org.apache.commons + + + commons-io + commons-io + + + commons-collections + commons-collections + + + slf4j-api + org.slf4j + + + jul-to-slf4j + org.slf4j + + + jcl-over-slf4j + org.slf4j + + + log4j + log4j + + + slf4j-log4j12 + org.slf4j + + + compress-lzf + com.ning + + + snappy-java + org.xerial.snappy + + + lz4-java + org.lz4 + + + zstd-jni + com.github.luben + + + RoaringBitmap + org.roaringbitmap + + + commons-net + commons-net + + + scala-xml_2.13 + org.scala-lang.modules + + + scala-library + org.scala-lang + + + scala-reflect + org.scala-lang + + + json4s-jackson_2.13 + org.json4s + + + jersey-client + org.glassfish.jersey.core + + + jersey-common + org.glassfish.jersey.core + + + jersey-server + org.glassfish.jersey.core + + + jersey-container-servlet + org.glassfish.jersey.containers + + + jersey-container-servlet-core + org.glassfish.jersey.containers + + + jersey-hk2 + org.glassfish.jersey.inject + + + netty-all + io.netty + + + stream + com.clearspring.analytics + + + metrics-core + io.dropwizard.metrics + + + metrics-jvm + io.dropwizard.metrics + + + metrics-json + 
io.dropwizard.metrics + + + metrics-graphite + io.dropwizard.metrics + + + metrics-jmx + io.dropwizard.metrics + + + jackson-module-scala_2.13 + com.fasterxml.jackson.module + + + ivy + org.apache.ivy + + + oro + oro + + + pyrolite + net.razorvine + + + py4j + net.sf.py4j + + + spark-tags_2.13 + org.apache.spark + + + commons-crypto + org.apache.commons + + + unused + org.spark-project.spark + + + + + + UTF-8 + + + diff --git a/jaccard-coefficient/mvnw b/jaccard-coefficient/mvnw new file mode 100755 index 0000000..a16b543 --- /dev/null +++ b/jaccard-coefficient/mvnw @@ -0,0 +1,310 @@ +#!/bin/sh +# ---------------------------------------------------------------------------- +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# ---------------------------------------------------------------------------- + +# ---------------------------------------------------------------------------- +# Maven Start Up Batch script +# +# Required ENV vars: +# ------------------ +# JAVA_HOME - location of a JDK home dir +# +# Optional ENV vars +# ----------------- +# M2_HOME - location of maven2's installed home dir +# MAVEN_OPTS - parameters passed to the Java VM when running Maven +# e.g. 
to debug Maven itself, use +# set MAVEN_OPTS=-Xdebug -Xrunjdwp:transport=dt_socket,server=y,suspend=y,address=8000 +# MAVEN_SKIP_RC - flag to disable loading of mavenrc files +# ---------------------------------------------------------------------------- + +if [ -z "$MAVEN_SKIP_RC" ] ; then + + if [ -f /etc/mavenrc ] ; then + . /etc/mavenrc + fi + + if [ -f "$HOME/.mavenrc" ] ; then + . "$HOME/.mavenrc" + fi + +fi + +# OS specific support. $var _must_ be set to either true or false. +cygwin=false; +darwin=false; +mingw=false +case "`uname`" in + CYGWIN*) cygwin=true ;; + MINGW*) mingw=true;; + Darwin*) darwin=true + # Use /usr/libexec/java_home if available, otherwise fall back to /Library/Java/Home + # See https://developer.apple.com/library/mac/qa/qa1170/_index.html + if [ -z "$JAVA_HOME" ]; then + if [ -x "/usr/libexec/java_home" ]; then + export JAVA_HOME="`/usr/libexec/java_home`" + else + export JAVA_HOME="/Library/Java/Home" + fi + fi + ;; +esac + +if [ -z "$JAVA_HOME" ] ; then + if [ -r /etc/gentoo-release ] ; then + JAVA_HOME=`java-config --jre-home` + fi +fi + +if [ -z "$M2_HOME" ] ; then + ## resolve links - $0 may be a link to maven's home + PRG="$0" + + # need this for relative symlinks + while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG="`dirname "$PRG"`/$link" + fi + done + + saveddir=`pwd` + + M2_HOME=`dirname "$PRG"`/.. 
+ + # make it fully qualified + M2_HOME=`cd "$M2_HOME" && pwd` + + cd "$saveddir" + # echo Using m2 at $M2_HOME +fi + +# For Cygwin, ensure paths are in UNIX format before anything is touched +if $cygwin ; then + [ -n "$M2_HOME" ] && + M2_HOME=`cygpath --unix "$M2_HOME"` + [ -n "$JAVA_HOME" ] && + JAVA_HOME=`cygpath --unix "$JAVA_HOME"` + [ -n "$CLASSPATH" ] && + CLASSPATH=`cygpath --path --unix "$CLASSPATH"` +fi + +# For Mingw, ensure paths are in UNIX format before anything is touched +if $mingw ; then + [ -n "$M2_HOME" ] && + M2_HOME="`(cd "$M2_HOME"; pwd)`" + [ -n "$JAVA_HOME" ] && + JAVA_HOME="`(cd "$JAVA_HOME"; pwd)`" +fi + +if [ -z "$JAVA_HOME" ]; then + javaExecutable="`which javac`" + if [ -n "$javaExecutable" ] && ! [ "`expr \"$javaExecutable\" : '\([^ ]*\)'`" = "no" ]; then + # readlink(1) is not available as standard on Solaris 10. + readLink=`which readlink` + if [ ! `expr "$readLink" : '\([^ ]*\)'` = "no" ]; then + if $darwin ; then + javaHome="`dirname \"$javaExecutable\"`" + javaExecutable="`cd \"$javaHome\" && pwd -P`/javac" + else + javaExecutable="`readlink -f \"$javaExecutable\"`" + fi + javaHome="`dirname \"$javaExecutable\"`" + javaHome=`expr "$javaHome" : '\(.*\)/bin'` + JAVA_HOME="$javaHome" + export JAVA_HOME + fi + fi +fi + +if [ -z "$JAVACMD" ] ; then + if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD="$JAVA_HOME/jre/sh/java" + else + JAVACMD="$JAVA_HOME/bin/java" + fi + else + JAVACMD="`which java`" + fi +fi + +if [ ! -x "$JAVACMD" ] ; then + echo "Error: JAVA_HOME is not defined correctly." >&2 + echo " We cannot execute $JAVACMD" >&2 + exit 1 +fi + +if [ -z "$JAVA_HOME" ] ; then + echo "Warning: JAVA_HOME environment variable is not set." 
+fi + +CLASSWORLDS_LAUNCHER=org.codehaus.plexus.classworlds.launcher.Launcher + +# traverses directory structure from process work directory to filesystem root +# first directory with .mvn subdirectory is considered project base directory +find_maven_basedir() { + + if [ -z "$1" ] + then + echo "Path not specified to find_maven_basedir" + return 1 + fi + + basedir="$1" + wdir="$1" + while [ "$wdir" != '/' ] ; do + if [ -d "$wdir"/.mvn ] ; then + basedir=$wdir + break + fi + # workaround for JBEAP-8937 (on Solaris 10/Sparc) + if [ -d "${wdir}" ]; then + wdir=`cd "$wdir/.."; pwd` + fi + # end of workaround + done + echo "${basedir}" +} + +# concatenates all lines of a file +concat_lines() { + if [ -f "$1" ]; then + echo "$(tr -s '\n' ' ' < "$1")" + fi +} + +BASE_DIR=`find_maven_basedir "$(pwd)"` +if [ -z "$BASE_DIR" ]; then + exit 1; +fi + +########################################################################################## +# Extension to allow automatically downloading the maven-wrapper.jar from Maven-central +# This allows using the maven wrapper in projects that prohibit checking in binary data. +########################################################################################## +if [ -r "$BASE_DIR/.mvn/wrapper/maven-wrapper.jar" ]; then + if [ "$MVNW_VERBOSE" = true ]; then + echo "Found .mvn/wrapper/maven-wrapper.jar" + fi +else + if [ "$MVNW_VERBOSE" = true ]; then + echo "Couldn't find .mvn/wrapper/maven-wrapper.jar, downloading it ..." 
+ fi + if [ -n "$MVNW_REPOURL" ]; then + jarUrl="$MVNW_REPOURL/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar" + else + jarUrl="https://repo.maven.apache.org/maven2/io/takari/maven-wrapper/0.5.6/maven-wrapper-0.5.6.jar" + fi + while IFS="=" read key value; do + case "$key" in (wrapperUrl) jarUrl="$value"; break ;; + esac + done < "$BASE_DIR/.mvn/wrapper/maven-wrapper.properties" + if [ "$MVNW_VERBOSE" = true ]; then + echo "Downloading from: $jarUrl" + fi + wrapperJarPath="$BASE_DIR/.mvn/wrapper/maven-wrapper.jar" + if $cygwin; then + wrapperJarPath=`cygpath --path --windows "$wrapperJarPath"` + fi + + if command -v wget > /dev/null; then + if [ "$MVNW_VERBOSE" = true ]; then + echo "Found wget ... using wget" + fi + if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then + wget "$jarUrl" -O "$wrapperJarPath" + else + wget --http-user=$MVNW_USERNAME --http-password=$MVNW_PASSWORD "$jarUrl" -O "$wrapperJarPath" + fi + elif command -v curl > /dev/null; then + if [ "$MVNW_VERBOSE" = true ]; then + echo "Found curl ... using curl" + fi + if [ -z "$MVNW_USERNAME" ] || [ -z "$MVNW_PASSWORD" ]; then + curl -o "$wrapperJarPath" "$jarUrl" -f + else + curl --user $MVNW_USERNAME:$MVNW_PASSWORD -o "$wrapperJarPath" "$jarUrl" -f + fi + + else + if [ "$MVNW_VERBOSE" = true ]; then + echo "Falling back to using Java to download" + fi + javaClass="$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.java" + # For Cygwin, switch paths to Windows format before running javac + if $cygwin; then + javaClass=`cygpath --path --windows "$javaClass"` + fi + if [ -e "$javaClass" ]; then + if [ ! -e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then + if [ "$MVNW_VERBOSE" = true ]; then + echo " - Compiling MavenWrapperDownloader.java ..." 
+ fi + # Compiling the Java class + ("$JAVA_HOME/bin/javac" "$javaClass") + fi + if [ -e "$BASE_DIR/.mvn/wrapper/MavenWrapperDownloader.class" ]; then + # Running the downloader + if [ "$MVNW_VERBOSE" = true ]; then + echo " - Running MavenWrapperDownloader.java ..." + fi + ("$JAVA_HOME/bin/java" -cp .mvn/wrapper MavenWrapperDownloader "$MAVEN_PROJECTBASEDIR") + fi + fi + fi +fi +########################################################################################## +# End of extension +########################################################################################## + +export MAVEN_PROJECTBASEDIR=${MAVEN_BASEDIR:-"$BASE_DIR"} +if [ "$MVNW_VERBOSE" = true ]; then + echo $MAVEN_PROJECTBASEDIR +fi +MAVEN_OPTS="$(concat_lines "$MAVEN_PROJECTBASEDIR/.mvn/jvm.config") $MAVEN_OPTS" + +# For Cygwin, switch paths to Windows format before running java +if $cygwin; then + [ -n "$M2_HOME" ] && + M2_HOME=`cygpath --path --windows "$M2_HOME"` + [ -n "$JAVA_HOME" ] && + JAVA_HOME=`cygpath --path --windows "$JAVA_HOME"` + [ -n "$CLASSPATH" ] && + CLASSPATH=`cygpath --path --windows "$CLASSPATH"` + [ -n "$MAVEN_PROJECTBASEDIR" ] && + MAVEN_PROJECTBASEDIR=`cygpath --path --windows "$MAVEN_PROJECTBASEDIR"` +fi + +# Provide a "standardized" way to retrieve the CLI args that will +# work with both Windows and non-Windows executions. 
+MAVEN_CMD_LINE_ARGS="$MAVEN_CONFIG $@" +export MAVEN_CMD_LINE_ARGS + +WRAPPER_LAUNCHER=org.apache.maven.wrapper.MavenWrapperMain + +exec "$JAVACMD" \ + $MAVEN_OPTS \ + -classpath "$MAVEN_PROJECTBASEDIR/.mvn/wrapper/maven-wrapper.jar" \ + "-Dmaven.home=${M2_HOME}" "-Dmaven.multiModuleProjectDirectory=${MAVEN_PROJECTBASEDIR}" \ + ${WRAPPER_LAUNCHER} $MAVEN_CONFIG "$@" diff --git a/jaccard-coefficient/pom.xml b/jaccard-coefficient/pom.xml new file mode 100644 index 0000000..b4cb97e --- /dev/null +++ b/jaccard-coefficient/pom.xml @@ -0,0 +1,95 @@ + + 4.0.0 + + org.hua + jaccardcoefficient + 0.1 + jar + + JaccardCoefficient + http://maven.apache.org + + + UTF-8 + + + + + junit + junit + 4.8.2 + test + + + org.apache.hadoop + hadoop-client + 3.2.0 + + + javax.servlet + servlet-api + + + provided + + + org.apache.spark + spark-core_2.13 + 3.2.0 + provided + + + com.google.guava + guava + 11.0.2 + + + com.fasterxml.jackson.core + jackson-databind + 2.12.0 + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 2.5.1 + + 8 + 8 + + + + org.apache.maven.plugins + maven-shade-plugin + 2.3 + + + + package + + shade + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + + + + + + diff --git a/jaccard-coefficient/src/main/java/org/spark/JaccardCoefficient.java b/jaccard-coefficient/src/main/java/org/spark/JaccardCoefficient.java new file mode 100644 index 0000000..4955d6b --- /dev/null +++ b/jaccard-coefficient/src/main/java/org/spark/JaccardCoefficient.java @@ -0,0 +1,158 @@
package org.spark;

import java.util.Arrays;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

import scala.Tuple2;

/**
 * Spark job that scores every pair of unconnected nodes sharing at least one
 * common neighbor with the Jaccard coefficient
 * {@code |N(a) ∩ N(b)| / |N(a) ∪ N(b)|} and prints the highest-scoring pairs.
 *
 * <p>Input is a whitespace-separated edge list, one edge per line. Lines
 * containing {@code #} are treated as comments. Node identifiers must be
 * parseable by {@link Integer#parseInt(String)} because pairs are ordered
 * numerically.
 */
public class JaccardCoefficient {

    /** Separator between the two node ids of an edge line (no newline characters). */
    private static final Pattern SPACE = Pattern.compile("[ \\t\\x0B\\f\\r]+");

    /** Separator used when rendering a node pair as a single string key. */
    private static final String EDGE_SEPARATOR = " <-> ";

    /** Message printed when the command-line arguments are missing or invalid. */
    private static final String USAGE =
            "Usage: Arguments must be <input-file> <number-of-scores>";

    /**
     * Counts the common neighbors of every pair of unconnected nodes.
     *
     * @param graphEdges  the edges of the graph as (node, neighbor) pairs; an
     *                    existing edge is only recognised if it is present in the
     *                    (smaller id, larger id) orientation
     * @param joinedEdges the self-join of the edges on the shared node, i.e.
     *                    (common-neighbor, (node1, node2)) pairs
     * @return pairs of ({@code "a <-> b"}, number of common neighbors) with
     *         {@code a < b} numerically, restricted to pairs that are not edges
     * @throws NumberFormatException (at job execution time) if a node id is not an integer
     */
    public static JavaPairRDD<String, Integer> findCommonNeighborsScores(
            JavaPairRDD<String, String> graphEdges,
            JavaPairRDD<String, Tuple2<String, String>> joinedEdges) {

        // Keeping only a < b drops the (a, a) pairs and one of the two mirrored
        // (a, b) / (b, a) pairs in a single pass.
        JavaPairRDD<String, String> orderedPairs = joinedEdges
                .filter(s -> Integer.parseInt(s._2()._1()) < Integer.parseInt(s._2()._2()))
                .mapToPair(s -> s._2());

        // Every remaining instance of a pair stands for one common neighbor, so
        // after removing the pairs that are already edges each instance is mapped
        // to 1 and the instances are summed per pair.
        JavaPairRDD<String, Integer> unconnectedEdgeInstances = orderedPairs
                .subtract(graphEdges)
                .mapToPair(s -> new Tuple2<String, Integer>(s._1() + EDGE_SEPARATOR + s._2(), 1));

        return unconnectedEdgeInstances.reduceByKey(Integer::sum);
    }

    /**
     * Computes the Jaccard coefficient of every unconnected node pair that has
     * at least one common neighbor and returns the highest scores.
     *
     * @param lines                the raw lines of the edge-list file
     * @param numOfDisplayedScores how many of the top scores to return
     * @return up to {@code numOfDisplayedScores} tuples of
     *         (jaccard-coefficient, {@code "a <-> b"}) in descending score order
     * @throws NumberFormatException (at job execution time) if a node id is not an integer
     */
    public static List<Tuple2<Double, String>> jaccardCoefficient(JavaRDD<String> lines, Integer numOfDisplayedScores) {

        // Symmetric, duplicate-free edge set: both (a, b) and (b, a) are emitted
        // for each line and then de-duplicated, so the result is the same whether
        // the input lists an undirected edge once or in both directions.
        JavaPairRDD<String, String> edges = lines
                .flatMapToPair(JaccardCoefficient::parseEdge)
                .distinct();

        edges.cache();

        // Self-join on the shared node: (common-neighbor, (node1, node2)).
        // This still contains (a, a) pairs, both orientations of every pair, and
        // pairs of already connected nodes (due to existing connections between
        // nodes with common neighbors); those are filtered out below.
        JavaPairRDD<String, Tuple2<String, String>> joinedEdges = edges.join(edges);

        // (node, total neighbors)
        JavaPairRDD<String, Integer> nodesWithNeighborsCount = edges
                .mapToPair(s -> new Tuple2<String, Integer>(s._1(), 1))
                .reduceByKey(Integer::sum);

        // Distinct (node1, node2) pairs with node1 != node2, in both orientations.
        JavaPairRDD<String, String> candidatePairs = joinedEdges
                .filter(s -> Integer.parseInt(s._2()._1()) != Integer.parseInt(s._2()._2()))
                .mapToPair(s -> s._2())
                .distinct();

        // Drop pairs that are existing edges, then attach the degree of the first
        // node. Because both orientations are present this yields
        // (node1, (node2, degree-of-node1)) and (node2, (node1, degree-of-node2)).
        JavaPairRDD<String, Tuple2<String, Integer>> pairsWithDegree =
                candidatePairs.subtract(edges).join(nodesWithNeighborsCount);

        // Re-key both orientations onto the same "a <-> b" key and add the two
        // degrees: (a <-> b, degree(a) + degree(b)).
        JavaPairRDD<String, Integer> totalNeighbors = pairsWithDegree
                .mapToPair(s -> new Tuple2<String, Integer>(edgeKey(s._1(), s._2()._1()), s._2()._2()))
                .reduceByKey(Integer::sum);

        // (a <-> b, number of common neighbors)
        JavaPairRDD<String, Integer> commonNeighbors = findCommonNeighborsScores(edges, joinedEdges);

        // (a <-> b, (number of common neighbors, sum of both degrees))
        JavaPairRDD<String, Tuple2<Integer, Integer>> jaccardCoefficientTempResult =
                commonNeighbors.join(totalNeighbors);

        // |A ∪ B| = |A| + |B| - |A ∩ B|, hence the subtraction in the denominator.
        JavaPairRDD<String, Double> jaccardCoefficientScores = jaccardCoefficientTempResult
                .mapValues(v -> (double) v._1() / (double) (v._2() - v._1()));

        // Swap to (score, pair) so that sortByKey orders by score.
        return jaccardCoefficientScores
                .mapToPair(s -> new Tuple2<Double, String>(s._2(), s._1()))
                .sortByKey(false)
                .take(numOfDisplayedScores);
    }

    /**
     * Entry point. Expects the input path and the number of scores to print.
     *
     * <p>Runs on {@code local[*]} unless a master was already configured
     * (for example by {@code spark-submit --master ...}).
     *
     * @param args {@code args[0]} input edge-list path, {@code args[1]} number of scores to print
     * @throws Exception if the Spark job fails
     */
    public static void main(String[] args) throws Exception {

        if (args.length < 2) {
            System.err.println(USAGE);
            System.exit(1);
        }

        final int numOfDisplayedScores;
        try {
            numOfDisplayedScores = Integer.parseInt(args[1]);
        } catch (NumberFormatException e) {
            System.err.println(USAGE);
            System.exit(1);
            return; // not reached at runtime; needed for definite assignment
        }

        // setIfMissing keeps the local default without overriding a master that
        // was supplied externally.
        SparkConf sparkConf = new SparkConf()
                .setAppName("JaccardCoefficient")
                .setIfMissing("spark.master", "local[*]");

        // try-with-resources stops the context even when the job throws.
        try (JavaSparkContext sc = new JavaSparkContext(sparkConf)) {
            JavaRDD<String> lines = sc.textFile(args[0]);

            // top Jaccard scores for the unconnected pairs with at least one common neighbor
            List<Tuple2<Double, String>> jcScores = jaccardCoefficient(lines, numOfDisplayedScores);

            for (Tuple2<Double, String> score : jcScores) {
                System.out.printf("%.5f, %s%n", score._1(), score._2());
            }
        }
    }

    /**
     * Parses one input line into both orientations of the edge it describes.
     * Comment lines (containing {@code #}) and lines with fewer than two tokens
     * yield nothing; tokens after the second are ignored.
     */
    private static Iterator<Tuple2<String, String>> parseEdge(String line) {
        if (line.contains("#")) {
            return Collections.emptyIterator();
        }
        String[] tokens = SPACE.split(line.trim());
        if (tokens.length < 2 || tokens[0].isEmpty()) {
            return Collections.emptyIterator();
        }
        List<Tuple2<String, String>> bothOrientations = Arrays.asList(
                new Tuple2<String, String>(tokens[0], tokens[1]),
                new Tuple2<String, String>(tokens[1], tokens[0]));
        return bothOrientations.iterator();
    }

    /** Renders a node pair as {@code "smaller <-> larger"}, ordered numerically. */
    private static String edgeKey(String first, String second) {
        if (Integer.parseInt(first) < Integer.parseInt(second)) {
            return first + EDGE_SEPARATOR + second;
        }
        return second + EDGE_SEPARATOR + first;
    }

}