diff --git a/.travis.yml b/.travis.yml
index d866439e2..2130674ed 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -1,5 +1,6 @@
language: java
jdk: oraclejdk8
+install: mvn install -DskipTests=true -Dgpg.skip=true -Dmaven.javadoc.skip=true -B -V
notifications:
slack: stanford-futuredata:qmO6Keu8ifOyXHsmSQ97CeLH
after_success:
diff --git a/assembly/pom.xml b/assembly/pom.xml
index f8cd84746..5f48f25b5 100644
--- a/assembly/pom.xml
+++ b/assembly/pom.xml
@@ -2,7 +2,7 @@
4.0.0
- macrobase
+ edu.stanford.futuredata
macrobase
0.1-SNAPSHOT
@@ -12,17 +12,17 @@
- macrobase
- macrobase-core
+ edu.stanford.futuredata
+ macrobase-legacy
0.1-SNAPSHOT
- macrobase
+ edu.stanford.futuredata
macrobase-frontend
0.1-SNAPSHOT
- macrobase
+ edu.stanford.futuredata
macrobase-contrib
0.1-SNAPSHOT
diff --git a/bin/batch.sh b/bin/batch.sh
index 758bbb63f..ad0ca20a6 100755
--- a/bin/batch.sh
+++ b/bin/batch.sh
@@ -2,4 +2,4 @@
conf_file=${1:-"conf/batch.yaml"}
set -x
-java ${JAVA_OPTS} -cp "core/target/classes:frontend/target/classes:frontend/src/main/resources/:contrib/target/classes:assembly/target/*:$CLASSPATH" macrobase.MacroBase pipeline $conf_file
+java ${JAVA_OPTS} -cp "legacy/target/classes:frontend/target/classes:frontend/src/main/resources/:contrib/target/classes:assembly/target/*:$CLASSPATH" macrobase.MacroBase pipeline $conf_file
diff --git a/bin/server.sh b/bin/server.sh
index 11fba3ee1..a663ec786 100755
--- a/bin/server.sh
+++ b/bin/server.sh
@@ -3,4 +3,4 @@ conf_file=${1:-"conf/macrobase.yaml"}
set -x
-java ${JAVA_OPTS} -cp "core/target/classes:frontend/target/classes:frontend/src/main/resources/:contrib/target/classes:assembly/target/*:$CLASSPATH" macrobase.runtime.MacroBaseServer server $conf_file
+java ${JAVA_OPTS} -cp "legacy/target/classes:frontend/target/classes:frontend/src/main/resources/:contrib/target/classes:assembly/target/*:$CLASSPATH" macrobase.runtime.MacroBaseServer server $conf_file
diff --git a/bin/streaming.sh b/bin/streaming.sh
index db2576767..16c89b880 100755
--- a/bin/streaming.sh
+++ b/bin/streaming.sh
@@ -2,4 +2,4 @@
conf_file=${1:-"conf/streaming.yaml"}
set -x
-java ${JAVA_OPTS} -cp "core/target/classes:frontend/target/classes:frontend/src/main/resources/:contrib/target/classes:assembly/target/*:$CLASSPATH" macrobase.MacroBase pipeline $conf_file
+java ${JAVA_OPTS} -cp "legacy/target/classes:frontend/target/classes:frontend/src/main/resources/:contrib/target/classes:assembly/target/*:$CLASSPATH" macrobase.MacroBase pipeline $conf_file
diff --git a/contrib/pom.xml b/contrib/pom.xml
index a87448244..f0b42c04a 100644
--- a/contrib/pom.xml
+++ b/contrib/pom.xml
@@ -1,22 +1,22 @@
4.0.0
- macrobase
+ edu.stanford.futuredata
macrobase-contrib
jar
0.1-SNAPSHOT
macrobase-contrib
http://maven.apache.org
- macrobase
+ edu.stanford.futuredata
macrobase
0.1-SNAPSHOT
- macrobase
- macrobase-core
+ edu.stanford.futuredata
+ macrobase-legacy
0.1-SNAPSHOT
diff --git a/frontend/pom.xml b/frontend/pom.xml
index 1f1f535c6..be2ab8939 100644
--- a/frontend/pom.xml
+++ b/frontend/pom.xml
@@ -1,22 +1,22 @@
4.0.0
- macrobase
+ edu.stanford.futuredata
macrobase-frontend
jar
0.1-SNAPSHOT
macrobase-frontend
http://maven.apache.org
- macrobase
+ edu.stanford.futuredata
macrobase
0.1-SNAPSHOT
- macrobase
- macrobase-core
+ edu.stanford.futuredata
+ macrobase-legacy
0.1-SNAPSHOT
diff --git a/core/pom.xml b/legacy/pom.xml
similarity index 75%
rename from core/pom.xml
rename to legacy/pom.xml
index f498e55b9..ed21d9ec6 100644
--- a/core/pom.xml
+++ b/legacy/pom.xml
@@ -1,14 +1,14 @@
4.0.0
- macrobase
- macrobase-core
+ edu.stanford.futuredata
+ macrobase-legacy
jar
0.1-SNAPSHOT
- macrobase-core
+ macrobase-legacy
http://maven.apache.org
- macrobase
+ edu.stanford.futuredata
macrobase
0.1-SNAPSHOT
@@ -61,6 +61,26 @@
1.1.1
test
+
+ io.dropwizard
+ dropwizard-core
+ ${dropwizard.version}
+
+
+ io.dropwizard
+ dropwizard-assets
+ ${dropwizard.version}
+
+
+ io.dropwizard
+ dropwizard-logging
+ ${dropwizard.version}
+
+
+ io.dropwizard
+ dropwizard-db
+ ${dropwizard.version}
+
diff --git a/core/src/main/java/macrobase/MacroBase.java b/legacy/src/main/java/macrobase/MacroBase.java
similarity index 100%
rename from core/src/main/java/macrobase/MacroBase.java
rename to legacy/src/main/java/macrobase/MacroBase.java
diff --git a/core/src/main/java/macrobase/analysis/classify/BatchingPercentileClassifier.java b/legacy/src/main/java/macrobase/analysis/classify/BatchingPercentileClassifier.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/classify/BatchingPercentileClassifier.java
rename to legacy/src/main/java/macrobase/analysis/classify/BatchingPercentileClassifier.java
diff --git a/core/src/main/java/macrobase/analysis/classify/EWAppxPercentileOutlierClassifier.java b/legacy/src/main/java/macrobase/analysis/classify/EWAppxPercentileOutlierClassifier.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/classify/EWAppxPercentileOutlierClassifier.java
rename to legacy/src/main/java/macrobase/analysis/classify/EWAppxPercentileOutlierClassifier.java
diff --git a/core/src/main/java/macrobase/analysis/classify/OutlierClassifier.java b/legacy/src/main/java/macrobase/analysis/classify/OutlierClassifier.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/classify/OutlierClassifier.java
rename to legacy/src/main/java/macrobase/analysis/classify/OutlierClassifier.java
diff --git a/core/src/main/java/macrobase/analysis/classify/StaticThresholdClassifier.java b/legacy/src/main/java/macrobase/analysis/classify/StaticThresholdClassifier.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/classify/StaticThresholdClassifier.java
rename to legacy/src/main/java/macrobase/analysis/classify/StaticThresholdClassifier.java
diff --git a/core/src/main/java/macrobase/analysis/pipeline/BasePipeline.java b/legacy/src/main/java/macrobase/analysis/pipeline/BasePipeline.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/pipeline/BasePipeline.java
rename to legacy/src/main/java/macrobase/analysis/pipeline/BasePipeline.java
diff --git a/core/src/main/java/macrobase/analysis/pipeline/BasicBatchedPipeline.java b/legacy/src/main/java/macrobase/analysis/pipeline/BasicBatchedPipeline.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/pipeline/BasicBatchedPipeline.java
rename to legacy/src/main/java/macrobase/analysis/pipeline/BasicBatchedPipeline.java
diff --git a/core/src/main/java/macrobase/analysis/pipeline/BasicOneShotEWStreamingPipeline.java b/legacy/src/main/java/macrobase/analysis/pipeline/BasicOneShotEWStreamingPipeline.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/pipeline/BasicOneShotEWStreamingPipeline.java
rename to legacy/src/main/java/macrobase/analysis/pipeline/BasicOneShotEWStreamingPipeline.java
diff --git a/core/src/main/java/macrobase/analysis/pipeline/Pipeline.java b/legacy/src/main/java/macrobase/analysis/pipeline/Pipeline.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/pipeline/Pipeline.java
rename to legacy/src/main/java/macrobase/analysis/pipeline/Pipeline.java
diff --git a/core/src/main/java/macrobase/analysis/pipeline/operator/MBConsumer.java b/legacy/src/main/java/macrobase/analysis/pipeline/operator/MBConsumer.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/pipeline/operator/MBConsumer.java
rename to legacy/src/main/java/macrobase/analysis/pipeline/operator/MBConsumer.java
diff --git a/core/src/main/java/macrobase/analysis/pipeline/operator/MBGroupBy.java b/legacy/src/main/java/macrobase/analysis/pipeline/operator/MBGroupBy.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/pipeline/operator/MBGroupBy.java
rename to legacy/src/main/java/macrobase/analysis/pipeline/operator/MBGroupBy.java
diff --git a/core/src/main/java/macrobase/analysis/pipeline/operator/MBOperator.java b/legacy/src/main/java/macrobase/analysis/pipeline/operator/MBOperator.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/pipeline/operator/MBOperator.java
rename to legacy/src/main/java/macrobase/analysis/pipeline/operator/MBOperator.java
diff --git a/core/src/main/java/macrobase/analysis/pipeline/operator/MBProducer.java b/legacy/src/main/java/macrobase/analysis/pipeline/operator/MBProducer.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/pipeline/operator/MBProducer.java
rename to legacy/src/main/java/macrobase/analysis/pipeline/operator/MBProducer.java
diff --git a/core/src/main/java/macrobase/analysis/pipeline/stream/MBMultiInputStream.java b/legacy/src/main/java/macrobase/analysis/pipeline/stream/MBMultiInputStream.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/pipeline/stream/MBMultiInputStream.java
rename to legacy/src/main/java/macrobase/analysis/pipeline/stream/MBMultiInputStream.java
diff --git a/core/src/main/java/macrobase/analysis/pipeline/stream/MBStream.java b/legacy/src/main/java/macrobase/analysis/pipeline/stream/MBStream.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/pipeline/stream/MBStream.java
rename to legacy/src/main/java/macrobase/analysis/pipeline/stream/MBStream.java
diff --git a/core/src/main/java/macrobase/analysis/result/AnalysisResult.java b/legacy/src/main/java/macrobase/analysis/result/AnalysisResult.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/result/AnalysisResult.java
rename to legacy/src/main/java/macrobase/analysis/result/AnalysisResult.java
diff --git a/core/src/main/java/macrobase/analysis/result/OutlierClassificationResult.java b/legacy/src/main/java/macrobase/analysis/result/OutlierClassificationResult.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/result/OutlierClassificationResult.java
rename to legacy/src/main/java/macrobase/analysis/result/OutlierClassificationResult.java
diff --git a/core/src/main/java/macrobase/analysis/sample/AChao.java b/legacy/src/main/java/macrobase/analysis/sample/AChao.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/sample/AChao.java
rename to legacy/src/main/java/macrobase/analysis/sample/AChao.java
diff --git a/core/src/main/java/macrobase/analysis/sample/FlexibleDampedReservoir.java b/legacy/src/main/java/macrobase/analysis/sample/FlexibleDampedReservoir.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/sample/FlexibleDampedReservoir.java
rename to legacy/src/main/java/macrobase/analysis/sample/FlexibleDampedReservoir.java
diff --git a/core/src/main/java/macrobase/analysis/stats/Autocorrelation.java b/legacy/src/main/java/macrobase/analysis/stats/Autocorrelation.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/stats/Autocorrelation.java
rename to legacy/src/main/java/macrobase/analysis/stats/Autocorrelation.java
diff --git a/core/src/main/java/macrobase/analysis/stats/BatchTrainScore.java b/legacy/src/main/java/macrobase/analysis/stats/BatchTrainScore.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/stats/BatchTrainScore.java
rename to legacy/src/main/java/macrobase/analysis/stats/BatchTrainScore.java
diff --git a/core/src/main/java/macrobase/analysis/stats/Covariance.java b/legacy/src/main/java/macrobase/analysis/stats/Covariance.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/stats/Covariance.java
rename to legacy/src/main/java/macrobase/analysis/stats/Covariance.java
diff --git a/core/src/main/java/macrobase/analysis/stats/FFT.java b/legacy/src/main/java/macrobase/analysis/stats/FFT.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/stats/FFT.java
rename to legacy/src/main/java/macrobase/analysis/stats/FFT.java
diff --git a/core/src/main/java/macrobase/analysis/stats/Gaussian.java b/legacy/src/main/java/macrobase/analysis/stats/Gaussian.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/stats/Gaussian.java
rename to legacy/src/main/java/macrobase/analysis/stats/Gaussian.java
diff --git a/core/src/main/java/macrobase/analysis/stats/MAD.java b/legacy/src/main/java/macrobase/analysis/stats/MAD.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/stats/MAD.java
rename to legacy/src/main/java/macrobase/analysis/stats/MAD.java
diff --git a/core/src/main/java/macrobase/analysis/stats/MinCovDet.java b/legacy/src/main/java/macrobase/analysis/stats/MinCovDet.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/stats/MinCovDet.java
rename to legacy/src/main/java/macrobase/analysis/stats/MinCovDet.java
diff --git a/core/src/main/java/macrobase/analysis/stats/RandomProjection.java b/legacy/src/main/java/macrobase/analysis/stats/RandomProjection.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/stats/RandomProjection.java
rename to legacy/src/main/java/macrobase/analysis/stats/RandomProjection.java
diff --git a/core/src/main/java/macrobase/analysis/stats/RobustEmpiricalCovariance.java b/legacy/src/main/java/macrobase/analysis/stats/RobustEmpiricalCovariance.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/stats/RobustEmpiricalCovariance.java
rename to legacy/src/main/java/macrobase/analysis/stats/RobustEmpiricalCovariance.java
diff --git a/core/src/main/java/macrobase/analysis/stats/Truncate.java b/legacy/src/main/java/macrobase/analysis/stats/Truncate.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/stats/Truncate.java
rename to legacy/src/main/java/macrobase/analysis/stats/Truncate.java
diff --git a/core/src/main/java/macrobase/analysis/stats/Winsorizer.java b/legacy/src/main/java/macrobase/analysis/stats/Winsorizer.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/stats/Winsorizer.java
rename to legacy/src/main/java/macrobase/analysis/stats/Winsorizer.java
diff --git a/core/src/main/java/macrobase/analysis/stats/ZScore.java b/legacy/src/main/java/macrobase/analysis/stats/ZScore.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/stats/ZScore.java
rename to legacy/src/main/java/macrobase/analysis/stats/ZScore.java
diff --git a/core/src/main/java/macrobase/analysis/summary/BatchSummarizer.java b/legacy/src/main/java/macrobase/analysis/summary/BatchSummarizer.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/summary/BatchSummarizer.java
rename to legacy/src/main/java/macrobase/analysis/summary/BatchSummarizer.java
diff --git a/core/src/main/java/macrobase/analysis/summary/EWStreamingSummarizer.java b/legacy/src/main/java/macrobase/analysis/summary/EWStreamingSummarizer.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/summary/EWStreamingSummarizer.java
rename to legacy/src/main/java/macrobase/analysis/summary/EWStreamingSummarizer.java
diff --git a/core/src/main/java/macrobase/analysis/summary/Summarizer.java b/legacy/src/main/java/macrobase/analysis/summary/Summarizer.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/summary/Summarizer.java
rename to legacy/src/main/java/macrobase/analysis/summary/Summarizer.java
diff --git a/core/src/main/java/macrobase/analysis/summary/Summary.java b/legacy/src/main/java/macrobase/analysis/summary/Summary.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/summary/Summary.java
rename to legacy/src/main/java/macrobase/analysis/summary/Summary.java
diff --git a/core/src/main/java/macrobase/analysis/summary/count/AmortizedMaintenanceCounter.java b/legacy/src/main/java/macrobase/analysis/summary/count/AmortizedMaintenanceCounter.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/summary/count/AmortizedMaintenanceCounter.java
rename to legacy/src/main/java/macrobase/analysis/summary/count/AmortizedMaintenanceCounter.java
diff --git a/core/src/main/java/macrobase/analysis/summary/count/ApproximateCount.java b/legacy/src/main/java/macrobase/analysis/summary/count/ApproximateCount.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/summary/count/ApproximateCount.java
rename to legacy/src/main/java/macrobase/analysis/summary/count/ApproximateCount.java
diff --git a/core/src/main/java/macrobase/analysis/summary/count/ExactCount.java b/legacy/src/main/java/macrobase/analysis/summary/count/ExactCount.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/summary/count/ExactCount.java
rename to legacy/src/main/java/macrobase/analysis/summary/count/ExactCount.java
diff --git a/core/src/main/java/macrobase/analysis/summary/count/SpaceSavingList.java b/legacy/src/main/java/macrobase/analysis/summary/count/SpaceSavingList.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/summary/count/SpaceSavingList.java
rename to legacy/src/main/java/macrobase/analysis/summary/count/SpaceSavingList.java
diff --git a/core/src/main/java/macrobase/analysis/summary/itemset/Apriori.java b/legacy/src/main/java/macrobase/analysis/summary/itemset/Apriori.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/summary/itemset/Apriori.java
rename to legacy/src/main/java/macrobase/analysis/summary/itemset/Apriori.java
diff --git a/core/src/main/java/macrobase/analysis/summary/itemset/ExponentiallyDecayingEmergingItemsets.java b/legacy/src/main/java/macrobase/analysis/summary/itemset/ExponentiallyDecayingEmergingItemsets.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/summary/itemset/ExponentiallyDecayingEmergingItemsets.java
rename to legacy/src/main/java/macrobase/analysis/summary/itemset/ExponentiallyDecayingEmergingItemsets.java
diff --git a/core/src/main/java/macrobase/analysis/summary/itemset/FPGrowth.java b/legacy/src/main/java/macrobase/analysis/summary/itemset/FPGrowth.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/summary/itemset/FPGrowth.java
rename to legacy/src/main/java/macrobase/analysis/summary/itemset/FPGrowth.java
diff --git a/core/src/main/java/macrobase/analysis/summary/itemset/FPGrowthEmerging.java b/legacy/src/main/java/macrobase/analysis/summary/itemset/FPGrowthEmerging.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/summary/itemset/FPGrowthEmerging.java
rename to legacy/src/main/java/macrobase/analysis/summary/itemset/FPGrowthEmerging.java
diff --git a/core/src/main/java/macrobase/analysis/summary/itemset/RiskRatio.java b/legacy/src/main/java/macrobase/analysis/summary/itemset/RiskRatio.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/summary/itemset/RiskRatio.java
rename to legacy/src/main/java/macrobase/analysis/summary/itemset/RiskRatio.java
diff --git a/core/src/main/java/macrobase/analysis/summary/itemset/StreamingFPGrowth.java b/legacy/src/main/java/macrobase/analysis/summary/itemset/StreamingFPGrowth.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/summary/itemset/StreamingFPGrowth.java
rename to legacy/src/main/java/macrobase/analysis/summary/itemset/StreamingFPGrowth.java
diff --git a/core/src/main/java/macrobase/analysis/summary/itemset/result/ItemsetResult.java b/legacy/src/main/java/macrobase/analysis/summary/itemset/result/ItemsetResult.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/summary/itemset/result/ItemsetResult.java
rename to legacy/src/main/java/macrobase/analysis/summary/itemset/result/ItemsetResult.java
diff --git a/core/src/main/java/macrobase/analysis/summary/itemset/result/ItemsetWithCount.java b/legacy/src/main/java/macrobase/analysis/summary/itemset/result/ItemsetWithCount.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/summary/itemset/result/ItemsetWithCount.java
rename to legacy/src/main/java/macrobase/analysis/summary/itemset/result/ItemsetWithCount.java
diff --git a/core/src/main/java/macrobase/analysis/transform/BatchScoreFeatureTransform.java b/legacy/src/main/java/macrobase/analysis/transform/BatchScoreFeatureTransform.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/transform/BatchScoreFeatureTransform.java
rename to legacy/src/main/java/macrobase/analysis/transform/BatchScoreFeatureTransform.java
diff --git a/core/src/main/java/macrobase/analysis/transform/EWFeatureTransform.java b/legacy/src/main/java/macrobase/analysis/transform/EWFeatureTransform.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/transform/EWFeatureTransform.java
rename to legacy/src/main/java/macrobase/analysis/transform/EWFeatureTransform.java
diff --git a/core/src/main/java/macrobase/analysis/transform/FeatureTransform.java b/legacy/src/main/java/macrobase/analysis/transform/FeatureTransform.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/transform/FeatureTransform.java
rename to legacy/src/main/java/macrobase/analysis/transform/FeatureTransform.java
diff --git a/core/src/main/java/macrobase/analysis/transform/LinearMetricNormalizer.java b/legacy/src/main/java/macrobase/analysis/transform/LinearMetricNormalizer.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/transform/LinearMetricNormalizer.java
rename to legacy/src/main/java/macrobase/analysis/transform/LinearMetricNormalizer.java
diff --git a/core/src/main/java/macrobase/analysis/transform/LowMetricTransform.java b/legacy/src/main/java/macrobase/analysis/transform/LowMetricTransform.java
similarity index 100%
rename from core/src/main/java/macrobase/analysis/transform/LowMetricTransform.java
rename to legacy/src/main/java/macrobase/analysis/transform/LowMetricTransform.java
diff --git a/core/src/main/java/macrobase/conf/ConfigurationException.java b/legacy/src/main/java/macrobase/conf/ConfigurationException.java
similarity index 100%
rename from core/src/main/java/macrobase/conf/ConfigurationException.java
rename to legacy/src/main/java/macrobase/conf/ConfigurationException.java
diff --git a/core/src/main/java/macrobase/conf/MacroBaseConf.java b/legacy/src/main/java/macrobase/conf/MacroBaseConf.java
similarity index 100%
rename from core/src/main/java/macrobase/conf/MacroBaseConf.java
rename to legacy/src/main/java/macrobase/conf/MacroBaseConf.java
diff --git a/core/src/main/java/macrobase/conf/MacroBaseDefaults.java b/legacy/src/main/java/macrobase/conf/MacroBaseDefaults.java
similarity index 100%
rename from core/src/main/java/macrobase/conf/MacroBaseDefaults.java
rename to legacy/src/main/java/macrobase/conf/MacroBaseDefaults.java
diff --git a/core/src/main/java/macrobase/conf/MissingParameterException.java b/legacy/src/main/java/macrobase/conf/MissingParameterException.java
similarity index 100%
rename from core/src/main/java/macrobase/conf/MissingParameterException.java
rename to legacy/src/main/java/macrobase/conf/MissingParameterException.java
diff --git a/core/src/main/java/macrobase/datamodel/Datum.java b/legacy/src/main/java/macrobase/datamodel/Datum.java
similarity index 100%
rename from core/src/main/java/macrobase/datamodel/Datum.java
rename to legacy/src/main/java/macrobase/datamodel/Datum.java
diff --git a/core/src/main/java/macrobase/ingest/CSVIngester.java b/legacy/src/main/java/macrobase/ingest/CSVIngester.java
similarity index 100%
rename from core/src/main/java/macrobase/ingest/CSVIngester.java
rename to legacy/src/main/java/macrobase/ingest/CSVIngester.java
diff --git a/core/src/main/java/macrobase/ingest/DataIngester.java b/legacy/src/main/java/macrobase/ingest/DataIngester.java
similarity index 100%
rename from core/src/main/java/macrobase/ingest/DataIngester.java
rename to legacy/src/main/java/macrobase/ingest/DataIngester.java
diff --git a/core/src/main/java/macrobase/ingest/DatumEncoder.java b/legacy/src/main/java/macrobase/ingest/DatumEncoder.java
similarity index 100%
rename from core/src/main/java/macrobase/ingest/DatumEncoder.java
rename to legacy/src/main/java/macrobase/ingest/DatumEncoder.java
diff --git a/core/src/main/java/macrobase/ingest/DiskCachingIngester.java b/legacy/src/main/java/macrobase/ingest/DiskCachingIngester.java
similarity index 100%
rename from core/src/main/java/macrobase/ingest/DiskCachingIngester.java
rename to legacy/src/main/java/macrobase/ingest/DiskCachingIngester.java
diff --git a/core/src/main/java/macrobase/ingest/MySQLIngester.java b/legacy/src/main/java/macrobase/ingest/MySQLIngester.java
similarity index 100%
rename from core/src/main/java/macrobase/ingest/MySQLIngester.java
rename to legacy/src/main/java/macrobase/ingest/MySQLIngester.java
diff --git a/core/src/main/java/macrobase/ingest/PostgresIngester.java b/legacy/src/main/java/macrobase/ingest/PostgresIngester.java
similarity index 100%
rename from core/src/main/java/macrobase/ingest/PostgresIngester.java
rename to legacy/src/main/java/macrobase/ingest/PostgresIngester.java
diff --git a/core/src/main/java/macrobase/ingest/SQLIngester.java b/legacy/src/main/java/macrobase/ingest/SQLIngester.java
similarity index 100%
rename from core/src/main/java/macrobase/ingest/SQLIngester.java
rename to legacy/src/main/java/macrobase/ingest/SQLIngester.java
diff --git a/core/src/main/java/macrobase/ingest/result/ColumnValue.java b/legacy/src/main/java/macrobase/ingest/result/ColumnValue.java
similarity index 100%
rename from core/src/main/java/macrobase/ingest/result/ColumnValue.java
rename to legacy/src/main/java/macrobase/ingest/result/ColumnValue.java
diff --git a/core/src/main/java/macrobase/ingest/result/RowSet.java b/legacy/src/main/java/macrobase/ingest/result/RowSet.java
similarity index 100%
rename from core/src/main/java/macrobase/ingest/result/RowSet.java
rename to legacy/src/main/java/macrobase/ingest/result/RowSet.java
diff --git a/core/src/main/java/macrobase/ingest/result/Schema.java b/legacy/src/main/java/macrobase/ingest/result/Schema.java
similarity index 100%
rename from core/src/main/java/macrobase/ingest/result/Schema.java
rename to legacy/src/main/java/macrobase/ingest/result/Schema.java
diff --git a/core/src/main/java/macrobase/runtime/MacroBaseApplication.java b/legacy/src/main/java/macrobase/runtime/MacroBaseApplication.java
similarity index 100%
rename from core/src/main/java/macrobase/runtime/MacroBaseApplication.java
rename to legacy/src/main/java/macrobase/runtime/MacroBaseApplication.java
diff --git a/core/src/main/java/macrobase/runtime/command/MacroBasePipelineCommand.java b/legacy/src/main/java/macrobase/runtime/command/MacroBasePipelineCommand.java
similarity index 100%
rename from core/src/main/java/macrobase/runtime/command/MacroBasePipelineCommand.java
rename to legacy/src/main/java/macrobase/runtime/command/MacroBasePipelineCommand.java
diff --git a/core/src/main/java/macrobase/util/CheckedSupplier.java b/legacy/src/main/java/macrobase/util/CheckedSupplier.java
similarity index 100%
rename from core/src/main/java/macrobase/util/CheckedSupplier.java
rename to legacy/src/main/java/macrobase/util/CheckedSupplier.java
diff --git a/core/src/main/java/macrobase/util/Periodic.java b/legacy/src/main/java/macrobase/util/Periodic.java
similarity index 100%
rename from core/src/main/java/macrobase/util/Periodic.java
rename to legacy/src/main/java/macrobase/util/Periodic.java
diff --git a/core/src/test/java/macrobase/analysis/classify/StaticThresholdClassifierTest.java b/legacy/src/test/java/macrobase/analysis/classify/StaticThresholdClassifierTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/classify/StaticThresholdClassifierTest.java
rename to legacy/src/test/java/macrobase/analysis/classify/StaticThresholdClassifierTest.java
diff --git a/core/src/test/java/macrobase/analysis/pipeline/operator/MBGroupByTest.java b/legacy/src/test/java/macrobase/analysis/pipeline/operator/MBGroupByTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/pipeline/operator/MBGroupByTest.java
rename to legacy/src/test/java/macrobase/analysis/pipeline/operator/MBGroupByTest.java
diff --git a/core/src/test/java/macrobase/analysis/sample/AChaoTest.java b/legacy/src/test/java/macrobase/analysis/sample/AChaoTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/sample/AChaoTest.java
rename to legacy/src/test/java/macrobase/analysis/sample/AChaoTest.java
diff --git a/core/src/test/java/macrobase/analysis/stats/AutocorrelationTest.java b/legacy/src/test/java/macrobase/analysis/stats/AutocorrelationTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/stats/AutocorrelationTest.java
rename to legacy/src/test/java/macrobase/analysis/stats/AutocorrelationTest.java
diff --git a/core/src/test/java/macrobase/analysis/stats/FFTTest.java b/legacy/src/test/java/macrobase/analysis/stats/FFTTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/stats/FFTTest.java
rename to legacy/src/test/java/macrobase/analysis/stats/FFTTest.java
diff --git a/core/src/test/java/macrobase/analysis/stats/GaussianTest.java b/legacy/src/test/java/macrobase/analysis/stats/GaussianTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/stats/GaussianTest.java
rename to legacy/src/test/java/macrobase/analysis/stats/GaussianTest.java
diff --git a/core/src/test/java/macrobase/analysis/stats/MADTest.java b/legacy/src/test/java/macrobase/analysis/stats/MADTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/stats/MADTest.java
rename to legacy/src/test/java/macrobase/analysis/stats/MADTest.java
diff --git a/core/src/test/java/macrobase/analysis/stats/MinCovDetTest.java b/legacy/src/test/java/macrobase/analysis/stats/MinCovDetTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/stats/MinCovDetTest.java
rename to legacy/src/test/java/macrobase/analysis/stats/MinCovDetTest.java
diff --git a/core/src/test/java/macrobase/analysis/stats/RandomProjectionTest.java b/legacy/src/test/java/macrobase/analysis/stats/RandomProjectionTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/stats/RandomProjectionTest.java
rename to legacy/src/test/java/macrobase/analysis/stats/RandomProjectionTest.java
diff --git a/core/src/test/java/macrobase/analysis/stats/RobustEmpiricalCovarianceTest.java b/legacy/src/test/java/macrobase/analysis/stats/RobustEmpiricalCovarianceTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/stats/RobustEmpiricalCovarianceTest.java
rename to legacy/src/test/java/macrobase/analysis/stats/RobustEmpiricalCovarianceTest.java
diff --git a/core/src/test/java/macrobase/analysis/stats/TruncateTest.java b/legacy/src/test/java/macrobase/analysis/stats/TruncateTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/stats/TruncateTest.java
rename to legacy/src/test/java/macrobase/analysis/stats/TruncateTest.java
diff --git a/core/src/test/java/macrobase/analysis/stats/WinsorizerTest.java b/legacy/src/test/java/macrobase/analysis/stats/WinsorizerTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/stats/WinsorizerTest.java
rename to legacy/src/test/java/macrobase/analysis/stats/WinsorizerTest.java
diff --git a/core/src/test/java/macrobase/analysis/stats/ZScoreTest.java b/legacy/src/test/java/macrobase/analysis/stats/ZScoreTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/stats/ZScoreTest.java
rename to legacy/src/test/java/macrobase/analysis/stats/ZScoreTest.java
diff --git a/core/src/test/java/macrobase/analysis/summary/count/AmortizedMaintenanceCounterTest.java b/legacy/src/test/java/macrobase/analysis/summary/count/AmortizedMaintenanceCounterTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/summary/count/AmortizedMaintenanceCounterTest.java
rename to legacy/src/test/java/macrobase/analysis/summary/count/AmortizedMaintenanceCounterTest.java
diff --git a/core/src/test/java/macrobase/analysis/summary/count/ExactCountTest.java b/legacy/src/test/java/macrobase/analysis/summary/count/ExactCountTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/summary/count/ExactCountTest.java
rename to legacy/src/test/java/macrobase/analysis/summary/count/ExactCountTest.java
diff --git a/core/src/test/java/macrobase/analysis/summary/count/SpaceSavingTest.java b/legacy/src/test/java/macrobase/analysis/summary/count/SpaceSavingTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/summary/count/SpaceSavingTest.java
rename to legacy/src/test/java/macrobase/analysis/summary/count/SpaceSavingTest.java
diff --git a/core/src/test/java/macrobase/analysis/summary/itemset/AprioriTest.java b/legacy/src/test/java/macrobase/analysis/summary/itemset/AprioriTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/summary/itemset/AprioriTest.java
rename to legacy/src/test/java/macrobase/analysis/summary/itemset/AprioriTest.java
diff --git a/core/src/test/java/macrobase/analysis/summary/itemset/FPGrowthTest.java b/legacy/src/test/java/macrobase/analysis/summary/itemset/FPGrowthTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/summary/itemset/FPGrowthTest.java
rename to legacy/src/test/java/macrobase/analysis/summary/itemset/FPGrowthTest.java
diff --git a/core/src/test/java/macrobase/analysis/summary/itemset/RiskRatioTest.java b/legacy/src/test/java/macrobase/analysis/summary/itemset/RiskRatioTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/summary/itemset/RiskRatioTest.java
rename to legacy/src/test/java/macrobase/analysis/summary/itemset/RiskRatioTest.java
diff --git a/core/src/test/java/macrobase/analysis/summary/itemset/StreamingFPGrowthTest.java b/legacy/src/test/java/macrobase/analysis/summary/itemset/StreamingFPGrowthTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/summary/itemset/StreamingFPGrowthTest.java
rename to legacy/src/test/java/macrobase/analysis/summary/itemset/StreamingFPGrowthTest.java
diff --git a/core/src/test/java/macrobase/analysis/transform/LinearMetricNormalizerTest.java b/legacy/src/test/java/macrobase/analysis/transform/LinearMetricNormalizerTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/transform/LinearMetricNormalizerTest.java
rename to legacy/src/test/java/macrobase/analysis/transform/LinearMetricNormalizerTest.java
diff --git a/core/src/test/java/macrobase/analysis/transform/LowMetricTransformTest.java b/legacy/src/test/java/macrobase/analysis/transform/LowMetricTransformTest.java
similarity index 100%
rename from core/src/test/java/macrobase/analysis/transform/LowMetricTransformTest.java
rename to legacy/src/test/java/macrobase/analysis/transform/LowMetricTransformTest.java
diff --git a/core/src/test/java/macrobase/conf/MacroBaseConfTest.java b/legacy/src/test/java/macrobase/conf/MacroBaseConfTest.java
similarity index 100%
rename from core/src/test/java/macrobase/conf/MacroBaseConfTest.java
rename to legacy/src/test/java/macrobase/conf/MacroBaseConfTest.java
diff --git a/core/src/test/java/macrobase/conf/MockIngester.java b/legacy/src/test/java/macrobase/conf/MockIngester.java
similarity index 100%
rename from core/src/test/java/macrobase/conf/MockIngester.java
rename to legacy/src/test/java/macrobase/conf/MockIngester.java
diff --git a/core/src/test/java/macrobase/ingest/CSVIngesterTest.java b/legacy/src/test/java/macrobase/ingest/CSVIngesterTest.java
similarity index 100%
rename from core/src/test/java/macrobase/ingest/CSVIngesterTest.java
rename to legacy/src/test/java/macrobase/ingest/CSVIngesterTest.java
diff --git a/core/src/test/java/macrobase/ingest/CachingSQLIngesterTest.java b/legacy/src/test/java/macrobase/ingest/CachingSQLIngesterTest.java
similarity index 100%
rename from core/src/test/java/macrobase/ingest/CachingSQLIngesterTest.java
rename to legacy/src/test/java/macrobase/ingest/CachingSQLIngesterTest.java
diff --git a/core/src/test/java/macrobase/ingest/DatumEncoderTest.java b/legacy/src/test/java/macrobase/ingest/DatumEncoderTest.java
similarity index 100%
rename from core/src/test/java/macrobase/ingest/DatumEncoderTest.java
rename to legacy/src/test/java/macrobase/ingest/DatumEncoderTest.java
diff --git a/core/src/test/java/macrobase/ingest/SQLIngesterTest.java b/legacy/src/test/java/macrobase/ingest/SQLIngesterTest.java
similarity index 100%
rename from core/src/test/java/macrobase/ingest/SQLIngesterTest.java
rename to legacy/src/test/java/macrobase/ingest/SQLIngesterTest.java
diff --git a/core/src/test/java/macrobase/pipeline/BasePipelineTest.java b/legacy/src/test/java/macrobase/pipeline/BasePipelineTest.java
similarity index 100%
rename from core/src/test/java/macrobase/pipeline/BasePipelineTest.java
rename to legacy/src/test/java/macrobase/pipeline/BasePipelineTest.java
diff --git a/core/src/test/java/macrobase/pipeline/BasicBatchedPipelineTest.java b/legacy/src/test/java/macrobase/pipeline/BasicBatchedPipelineTest.java
similarity index 100%
rename from core/src/test/java/macrobase/pipeline/BasicBatchedPipelineTest.java
rename to legacy/src/test/java/macrobase/pipeline/BasicBatchedPipelineTest.java
diff --git a/core/src/test/java/macrobase/pipeline/BasicOneShotEWStreamingPipelineTest.java b/legacy/src/test/java/macrobase/pipeline/BasicOneShotEWStreamingPipelineTest.java
similarity index 100%
rename from core/src/test/java/macrobase/pipeline/BasicOneShotEWStreamingPipelineTest.java
rename to legacy/src/test/java/macrobase/pipeline/BasicOneShotEWStreamingPipelineTest.java
diff --git a/core/src/test/java/macrobase/pipeline/MockTransform.java b/legacy/src/test/java/macrobase/pipeline/MockTransform.java
similarity index 100%
rename from core/src/test/java/macrobase/pipeline/MockTransform.java
rename to legacy/src/test/java/macrobase/pipeline/MockTransform.java
diff --git a/core/src/test/java/macrobase/pipeline/operator/MBOperatorTest.java b/legacy/src/test/java/macrobase/pipeline/operator/MBOperatorTest.java
similarity index 100%
rename from core/src/test/java/macrobase/pipeline/operator/MBOperatorTest.java
rename to legacy/src/test/java/macrobase/pipeline/operator/MBOperatorTest.java
diff --git a/core/src/test/java/macrobase/runtime/MacroBaseApplicationTest.java b/legacy/src/test/java/macrobase/runtime/MacroBaseApplicationTest.java
similarity index 100%
rename from core/src/test/java/macrobase/runtime/MacroBaseApplicationTest.java
rename to legacy/src/test/java/macrobase/runtime/MacroBaseApplicationTest.java
diff --git a/core/src/test/java/macrobase/runtime/command/MacroBaseMockPipeline.java b/legacy/src/test/java/macrobase/runtime/command/MacroBaseMockPipeline.java
similarity index 100%
rename from core/src/test/java/macrobase/runtime/command/MacroBaseMockPipeline.java
rename to legacy/src/test/java/macrobase/runtime/command/MacroBaseMockPipeline.java
diff --git a/core/src/test/java/macrobase/runtime/command/MacroBasePipelineCommandTest.java b/legacy/src/test/java/macrobase/runtime/command/MacroBasePipelineCommandTest.java
similarity index 100%
rename from core/src/test/java/macrobase/runtime/command/MacroBasePipelineCommandTest.java
rename to legacy/src/test/java/macrobase/runtime/command/MacroBasePipelineCommandTest.java
diff --git a/core/src/test/java/macrobase/util/Drainer.java b/legacy/src/test/java/macrobase/util/Drainer.java
similarity index 100%
rename from core/src/test/java/macrobase/util/Drainer.java
rename to legacy/src/test/java/macrobase/util/Drainer.java
diff --git a/core/src/test/resources/conf/simple.yaml b/legacy/src/test/resources/conf/simple.yaml
similarity index 100%
rename from core/src/test/resources/conf/simple.yaml
rename to legacy/src/test/resources/conf/simple.yaml
diff --git a/core/src/test/resources/data/missingdata.csv b/legacy/src/test/resources/data/missingdata.csv
similarity index 100%
rename from core/src/test/resources/data/missingdata.csv
rename to legacy/src/test/resources/data/missingdata.csv
diff --git a/core/src/test/resources/data/sensor10k.csv.gz b/legacy/src/test/resources/data/sensor10k.csv.gz
similarity index 100%
rename from core/src/test/resources/data/sensor10k.csv.gz
rename to legacy/src/test/resources/data/sensor10k.csv.gz
diff --git a/core/src/test/resources/data/simple.csv b/legacy/src/test/resources/data/simple.csv
similarity index 100%
rename from core/src/test/resources/data/simple.csv
rename to legacy/src/test/resources/data/simple.csv
diff --git a/lib/pom.xml b/lib/pom.xml
new file mode 100644
index 000000000..46997ad75
--- /dev/null
+++ b/lib/pom.xml
@@ -0,0 +1,147 @@
+
+
+ 4.0.0
+ edu.stanford.futuredata
+ macrobase-lib
+ 0.1-SNAPSHOT
+ macrobase
+ https://github.com/stanford-futuredata/macrobase
+
+ MacroBase is an anomaly detection engine designed to prioritize human attention
+ in large-scale datasets and data streams.
+ macrobase-lib provides operators for standalone API usage of classifiers and summarizers.
+
+
+
+ Apache License, Version 2.0
+ http://www.apache.org/licenses/LICENSE-2.0.txt
+ repo
+
+
+
+
+ Peter Bailis
+ pbailis@cs.stanford.edu
+ Stanford
+ http://www.bailis.org
+
+
+
+ scm:git:git://github.com/stanford-futuredata/macrobase.git
+ scm:git:ssh://github.com/stanford-futuredata/macrobase.git
+ https://github.com/stanford-futuredata/macrobase
+
+
+
+
+
+
+ junit
+ junit
+ 4.12
+ test
+
+
+ com.google.guava
+ guava
+ 21.0
+
+
+ org.apache.commons
+ commons-math3
+ 3.6
+
+
+ org.apache.commons
+ commons-csv
+ 1.2
+
+
+
+
+
+ ossrh
+ https://oss.sonatype.org/content/repositories/snapshots
+
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-compiler-plugin
+ 3.3
+
+ 1.8
+ 1.8
+
+
+
+
+
+
+
+ release
+
+
+
+ org.apache.maven.plugins
+ maven-source-plugin
+ 3.0.1
+
+
+ attach-sources
+ verify
+
+ jar-no-fork
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-javadoc-plugin
+ 2.10.4
+
+
+ attach-javadocs
+ verify
+
+ jar
+
+
+
+
+
+ org.apache.maven.plugins
+ maven-gpg-plugin
+ 1.5
+
+
+ sign-artifacts
+ deploy
+
+ sign
+
+
+
+
+
+ org.sonatype.plugins
+ nexus-staging-maven-plugin
+ 1.6.7
+ true
+
+ ossrh
+ https://oss.sonatype.org/
+ false
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/classify/PercentileClassifier.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/classify/PercentileClassifier.java
new file mode 100644
index 000000000..ad13f039b
--- /dev/null
+++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/classify/PercentileClassifier.java
@@ -0,0 +1,117 @@
+package edu.stanford.futuredata.macrobase.analysis.classify;
+
+import edu.stanford.futuredata.macrobase.datamodel.DataFrame;
+import edu.stanford.futuredata.macrobase.operator.Transformer;
+import org.apache.commons.math3.stat.descriptive.rank.Percentile;
+
+/**
+ * Classify rows based on high / low values for a single column.
+ * Returns a new dataframe with a column representation the classification status for
+ * each row: 1.0 if outlier, 0.0 otherwise.
+ */
+public class PercentileClassifier implements Transformer {
+ // Parameters
+ private double percentile = 0.5;
+ private boolean includeHigh = true;
+ private boolean includeLow = true;
+ private String columnName;
+ private String outputColumnName = "_OUTLIER";
+
+ // Calculated values
+ private double lowCutoff;
+ private double highCutoff;
+ private DataFrame output;
+
+ public PercentileClassifier(String columnName) {
+ this.columnName = columnName;
+ }
+
+ @Override
+ public void process(DataFrame input) {
+ double[] metrics = input.getDoubleColumnByName(columnName);
+ int len = metrics.length;
+ lowCutoff = new Percentile().evaluate(metrics, percentile);
+ highCutoff = new Percentile().evaluate(metrics, 100.0 - percentile);
+
+ output = input.copy();
+ double[] resultColumn = new double[len];
+ for (int i = 0; i < len; i++) {
+ double curVal = metrics[i];
+ if ((curVal > highCutoff && includeHigh)
+ || (curVal < lowCutoff && includeLow)
+ ) {
+ resultColumn[i] = 1.0;
+ }
+ }
+ output.addDoubleColumn(outputColumnName, resultColumn);
+ }
+
+ @Override
+ public DataFrame getResults() {
+ return output;
+ }
+
+ // Parameter Getters and Setters
+ public double getPercentile() {
+ return percentile;
+ }
+
+ /**
+ * @param percentile Cutoff point for high or low values
+ * @return this
+ */
+ public PercentileClassifier setPercentile(double percentile) {
+ this.percentile = percentile;
+ return this;
+ }
+ public boolean isIncludeHigh() {
+ return includeHigh;
+ }
+
+ /**
+ * @param includeHigh Whether to count high points as outliers.
+ * @return this
+ */
+ public PercentileClassifier setIncludeHigh(boolean includeHigh) {
+ this.includeHigh = includeHigh;
+ return this;
+ }
+ public boolean isIncludeLow() {
+ return includeLow;
+ }
+
+ /**
+ * @param includeLow Whether to count low points as outliers
+ * @return this
+ */
+ public PercentileClassifier setIncludeLow(boolean includeLow) {
+ this.includeLow = includeLow;
+ return this;
+ }
+ public String getColumnName() {
+ return columnName;
+ }
+ public PercentileClassifier setColumnName(String columnName) {
+ this.columnName = columnName;
+ return this;
+ }
+ public String getOutputColumnName() {
+ return outputColumnName;
+ }
+
+ /**
+ * @param outputColumnName Which column to write the classification results.
+ * @return this
+ */
+ public PercentileClassifier setOutputColumnName(String outputColumnName) {
+ this.outputColumnName = outputColumnName;
+ return this;
+ }
+
+ public double getLowCutoff() {
+ return lowCutoff;
+ }
+ public double getHighCutoff() {
+ return highCutoff;
+ }
+}
diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/BatchSummarizer.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/BatchSummarizer.java
new file mode 100644
index 000000000..c9fec3dc3
--- /dev/null
+++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/BatchSummarizer.java
@@ -0,0 +1,133 @@
+package edu.stanford.futuredata.macrobase.analysis.summary;
+
+import edu.stanford.futuredata.macrobase.analysis.summary.itemset.AttributeEncoder;
+import edu.stanford.futuredata.macrobase.analysis.summary.itemset.FPGrowthEmerging;
+import edu.stanford.futuredata.macrobase.analysis.summary.itemset.result.AttributeSet;
+import edu.stanford.futuredata.macrobase.analysis.summary.itemset.result.ItemsetResult;
+import edu.stanford.futuredata.macrobase.datamodel.DataFrame;
+import edu.stanford.futuredata.macrobase.datamodel.Schema;
+import edu.stanford.futuredata.macrobase.operator.Operator;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.Set;
+import java.util.function.DoublePredicate;
+
+/**
+ * Given a batch of rows with an outlier class column, explain the outliers using
+ * string attribute columns. Each batch is considered as an independent unit.
+ */
+public class BatchSummarizer implements Operator {
+ // Parameters
+ private String outlierColumn = "_OUTLIER";
+ private double minOutlierSupport = 0.1;
+ private double minRiskRatio = 3;
+ private boolean useAttributeCombinations = true;
+ private List attributes = new ArrayList<>();
+ private DoublePredicate predicate = d -> d != 0.0;
+
+ // Output
+ private Explanation explanation = null;
+ // Encoder
+ private AttributeEncoder encoder = new AttributeEncoder();
+ private List> inlierItemsets, outlierItemsets;
+ private FPGrowthEmerging fpg = new FPGrowthEmerging();
+
+ public BatchSummarizer() { }
+
+ /**
+ * Adjust this to tune the significance (e.g. number of rows affected) of the results returned.
+ * @param minSupport lowest outlier support of the results returned.
+ * @return this
+ */
+ public BatchSummarizer setMinSupport(double minSupport) {
+ this.minOutlierSupport = minSupport;
+ return this;
+ }
+
+ /**
+ * Adjust this to tune the severity (e.g. strength of correlation) of the results returned.
+ * @param minRiskRatio lowest risk ratio to consider for meaningful explanations.
+ * @return this
+ */
+ public BatchSummarizer setMinRiskRatio(double minRiskRatio) {
+ this.minRiskRatio = minRiskRatio;
+ return this;
+ }
+
+ /**
+ * By default, will check for nonzero entries in a column of doubles.
+ * @param predicate function to signify whether row should be treated as outlier.
+ * @return this
+ */
+ public BatchSummarizer setOutlierPredicate(DoublePredicate predicate) {
+ this.predicate = predicate;
+ return this;
+ }
+ public BatchSummarizer setAttributes(List attributes) {
+ this.attributes = attributes;
+ this.encoder.setColumnNames(attributes);
+ return this;
+ }
+
+ /**
+ * Set the column which indicates outlier status. "_OUTLIER" by default.
+ * @param outlierColumn new outlier indicator column.
+ * @return this
+ */
+ public BatchSummarizer setOutlierColumn(String outlierColumn) {
+ this.outlierColumn = outlierColumn;
+ return this;
+ }
+
+ /**
+ * Whether or not to use combinations of attributes in explanation, or only
+ * use simple single attribute explanations
+ * @param useAttributeCombinations flag
+ * @return this
+ */
+ public BatchSummarizer setUseAttributeCombinations(boolean useAttributeCombinations) {
+ this.useAttributeCombinations = useAttributeCombinations;
+ fpg.setCombinationsEnabled(useAttributeCombinations);
+ return this;
+ }
+
+ @Override
+ public void process(DataFrame df) {
+ // Filter inliers and outliers
+ DataFrame outlierDF = df.filter(outlierColumn, predicate);
+ DataFrame inlierDF = df.filter(outlierColumn, predicate.negate());
+
+ // Encode inlier and outlier attribute columns
+ if (attributes.isEmpty()) {
+ encoder.setColumnNames(df.getSchema().getColumnNamesByType(Schema.ColType.STRING));
+ inlierItemsets = encoder.encodeAttributes(inlierDF.getStringCols());
+ outlierItemsets = encoder.encodeAttributes(outlierDF.getStringCols());
+ } else {
+ encoder.setColumnNames(attributes);
+ inlierItemsets = encoder.encodeAttributes(inlierDF.getStringColsByName(attributes));
+ outlierItemsets = encoder.encodeAttributes(outlierDF.getStringColsByName(attributes));
+ }
+
+ long startTime = System.currentTimeMillis();
+ List itemsetResults = fpg.getEmergingItemsetsWithMinSupport(
+ inlierItemsets,
+ outlierItemsets,
+ minOutlierSupport,
+ minRiskRatio);
+ // Decode results
+ List attributeSets = new ArrayList<>();
+ itemsetResults.forEach(i -> attributeSets.add(new AttributeSet(i, encoder)));
+ long elapsed = System.currentTimeMillis() - startTime;
+
+ explanation = new Explanation(attributeSets,
+ inlierItemsets.size(),
+ outlierItemsets.size(),
+ elapsed);
+ }
+
+ @Override
+ public Explanation getResults() {
+ return explanation;
+ }
+}
diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/Explanation.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/Explanation.java
new file mode 100644
index 000000000..c201e11cb
--- /dev/null
+++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/Explanation.java
@@ -0,0 +1,68 @@
+package edu.stanford.futuredata.macrobase.analysis.summary;
+
+import edu.stanford.futuredata.macrobase.analysis.summary.itemset.result.AttributeSet;
+
+import java.util.List;
+
+/**
+ * Represents a summarization result, which contains a list of attribute values
+ * and other statistics about the underlying process, e.g. num of tuples observed
+ * so far.
+ */
+public class Explanation {
+ private final long numOutliers;
+ private final long numInliers;
+ private List itemsets;
+ private final long creationTimeMs;
+
+ public Explanation(List resultList,
+ long numInliers,
+ long numOutliers,
+ long creationTimeMs) {
+ itemsets = resultList;
+ this.numInliers = numInliers;
+ this.numOutliers = numOutliers;
+ this.creationTimeMs = creationTimeMs;
+ }
+
+ public List getItemsets() {
+ return itemsets;
+ }
+
+ public long getNumOutliers() {
+ return numOutliers;
+ }
+
+ public long getNumInliers() {
+ return numInliers;
+ }
+
+ public long getCreationTimeMs() {
+ return creationTimeMs;
+ }
+
+ public String prettyPrint() {
+ StringBuilder header = new StringBuilder(String.format(
+ "Outlier Explanation:\n"
+ + "numOutliers: %d\n"
+ + "numInliners: %d\n"
+ + "Itemsets: \n"
+ + "--------\n",
+ numOutliers,
+ numInliers,
+ itemsets));
+ for (AttributeSet is : itemsets) {
+ header.append(is.prettyPrint());
+ }
+ return header.toString();
+ }
+
+ @Override
+ public String toString() {
+ return "Explanation{" +
+ "numOutliers=" + numOutliers +
+ ", numInliers=" + numInliers +
+ ", itemsets=" + itemsets +
+ '}';
+ }
+}
diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/count/ExactCount.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/count/ExactCount.java
new file mode 100644
index 000000000..55efbc71c
--- /dev/null
+++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/count/ExactCount.java
@@ -0,0 +1,28 @@
+package edu.stanford.futuredata.macrobase.analysis.summary.count;
+
+import java.util.HashMap;
+import java.util.List;
+import java.util.Set;
+
+
+public class ExactCount {
+ private HashMap counts = new HashMap<>();
+
+ public HashMap getCounts() {
+ return counts;
+ }
+
+ public ExactCount count(List> transactions) {
+ for (Set txn : transactions) {
+ for (int i : txn) {
+ Double curVal = counts.get(i);
+ if (curVal == null) {
+ curVal = 0.;
+ }
+ counts.put(i, curVal + 1);
+ }
+ }
+
+ return this;
+ }
+}
diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/AttributeEncoder.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/AttributeEncoder.java
new file mode 100644
index 000000000..d401fda8c
--- /dev/null
+++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/AttributeEncoder.java
@@ -0,0 +1,70 @@
+package edu.stanford.futuredata.macrobase.analysis.summary.itemset;
+
+import java.util.*;
+
+/**
+ * Encode every combination of attribute names and values into a distinct integer.
+ * This class assumes that attributes are stored in String columns in dataframes
+ * and is mainly used for frequent itemset mining.
+ */
+public class AttributeEncoder {
+ private HashMap> encoder;
+ private int nextKey;
+
+ private HashMap valueDecoder;
+ private HashMap columnDecoder;
+ private List colNames;
+
+ public AttributeEncoder() {
+ encoder = new HashMap<>();
+ nextKey = 0;
+ valueDecoder = new HashMap<>();
+ columnDecoder = new HashMap<>();
+ }
+ public void setColumnNames(List colNames) {
+ this.colNames = colNames;
+ }
+
+ public int decodeColumn(int i) {return columnDecoder.get(i);}
+ public String decodeColumnName(int i) {return colNames.get(columnDecoder.get(i));}
+ public String decodeValue(int i) {return valueDecoder.get(i);}
+
+ public List> encodeAttributes(List columns) {
+ if (columns.isEmpty()) {
+ return new ArrayList<>();
+ }
+
+ int d = columns.size();
+ int numRows = columns.get(0).length;
+
+ for (int i = 0; i < d; i++) {
+ if (!encoder.containsKey(i)) {
+ encoder.put(i, new HashMap<>());
+ }
+ }
+
+ ArrayList> encodedAttributes = new ArrayList<>(numRows);
+ for (int i = 0; i < numRows; i++) {
+ encodedAttributes.add(new HashSet<>());
+ }
+
+ for (int colIdx = 0; colIdx < d; colIdx++) {
+ Map curColEncoder = encoder.get(colIdx);
+ String[] curCol = columns.get(colIdx);
+ for (int rowIdx = 0; rowIdx < numRows; rowIdx++) {
+ String colVal = curCol[rowIdx];
+ if (!curColEncoder.containsKey(colVal)) {
+ curColEncoder.put(colVal, nextKey);
+ valueDecoder.put(nextKey, colVal);
+ columnDecoder.put(nextKey, colIdx);
+ nextKey++;
+ }
+ int curKey = curColEncoder.get(colVal);
+ encodedAttributes.get(rowIdx).add(curKey);
+ }
+ }
+
+ return encodedAttributes;
+ }
+
+}
diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/FPGrowth.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/FPGrowth.java
new file mode 100644
index 000000000..59a311807
--- /dev/null
+++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/FPGrowth.java
@@ -0,0 +1,472 @@
+package edu.stanford.futuredata.macrobase.analysis.summary.itemset;
+
+import com.google.common.collect.Sets;
+import edu.stanford.futuredata.macrobase.analysis.summary.itemset.result.ItemsetWithCount;
+
+import java.util.*;
+import java.util.stream.Collectors;
+
+
+public class FPGrowth {
+ class FPTree {
+ private FPTreeNode root = new FPTreeNode(-1, null, 0);
+ // used to calculate the order
+ private Map frequentItemCounts = new HashMap<>();
+
+ // item order -- need canonical to break ties; 0 is smallest, N is largest
+ private Map frequentItemOrder = new HashMap<>();
+
+ protected Map nodeHeaders = new HashMap<>();
+
+// protected void printTreeDebug() {
+// log.debug("Frequent Item Counts:");
+// frequentItemCounts.entrySet().forEach(e -> log.debug(String.format("%d: %f", e.getKey(), e.getValue())));
+//
+// walkTree(root, 1);
+// }
+
+// private void walkTree(FPTreeNode start, int treeDepth) {
+// log.debug(String.format("%s node: %d, count: %f",
+// new String(new char[treeDepth]).replaceAll("\0", "\t"),
+// start.getItem(), start.getCount()));
+// if (start.getChildren() != null) {
+// for (FPTreeNode child : start.getChildren()) {
+// walkTree(child, treeDepth + 1);
+// }
+// }
+// }
+
+ private class FPTreeNode {
+ private int item;
+ private double count;
+ private FPTreeNode nextLink;
+ private FPTreeNode parent;
+ private List children;
+
+ public FPTreeNode(int item, FPTreeNode parent, int initialCount) {
+ this.item = item;
+ this.parent = parent;
+ this.count = initialCount;
+ }
+
+ public int getItem() {
+ return item;
+ }
+
+ public double getCount() {
+ return count;
+ }
+
+ public void incrementCount(double by) {
+ count += by;
+ }
+
+ public void setNextLink(FPTreeNode nextLink) {
+ this.nextLink = nextLink;
+ }
+
+ public FPTreeNode getNextLink() {
+ return nextLink;
+ }
+
+ public FPTreeNode getParent() {
+ return parent;
+ }
+
+
+ public List getChildren() {
+ return children;
+ }
+
+ // insert the transaction at this node starting with transaction[currentIndex]
+ // then find the child that matches
+ public void insertTransaction(List fullTransaction,
+ int currentIndex,
+ final double transactionCount) {
+ incrementCount(transactionCount);
+
+ if (currentIndex == fullTransaction.size()) {
+ return;
+ }
+
+ int currentItem = fullTransaction.get(currentIndex);
+
+ FPTreeNode matchingChild = null;
+
+ if (children != null) {
+ for (FPTreeNode child : children) {
+ if (child.getItem() == currentItem) {
+ matchingChild = child;
+ break;
+ }
+ }
+ }
+
+ if (matchingChild == null) {
+ matchingChild = new FPTreeNode(currentItem, this, 0);
+
+ FPTreeNode prevHeader = nodeHeaders.get(currentItem);
+ nodeHeaders.put(currentItem, matchingChild);
+
+ if (prevHeader != null) {
+ matchingChild.setNextLink(prevHeader);
+ }
+
+ if (children == null) {
+ children = new ArrayList<>();
+ }
+
+ children.add(matchingChild);
+ }
+
+ matchingChild.insertTransaction(fullTransaction, currentIndex + 1, transactionCount);
+ }
+ }
+
+ public void setFrequentCounts(Map counts) {
+ frequentItemCounts = counts;
+ sortFrequentItems();
+ }
+
+ public void insertFrequentItems(List> transactions,
+ int countRequiredForSupport) {
+
+ Map itemCounts = new HashMap<>();
+ for (Set t : transactions) {
+ for (Integer item : t) {
+ itemCounts.compute(item, (k, v) -> v == null ? 1 : v + 1);
+ }
+ }
+
+ for (Map.Entry e : itemCounts.entrySet()) {
+ if (e.getValue() >= countRequiredForSupport) {
+ frequentItemCounts.put(e.getKey(), e.getValue());
+ }
+ }
+
+ sortFrequentItems();
+ }
+
+ private void sortFrequentItems() {
+ // we have to materialize a canonical order so that items with equal counts
+ // are consistently ordered when they are sorted during transaction insertion
+ List> sortedItemCounts = new ArrayList<>(frequentItemCounts.entrySet());
+ sortedItemCounts.sort((i1, i2) -> frequentItemCounts.get(i1.getKey())
+ .compareTo(frequentItemCounts.get(i2.getKey())));
+ for (int i = 0; i < sortedItemCounts.size(); ++i) {
+ frequentItemOrder.put(sortedItemCounts.get(i).getKey(), i);
+ }
+ }
+
+ public void insertConditionalFrequentItems(List patterns,
+ int countRequiredForSupport) {
+ Map itemCounts = new HashMap<>();
+
+ for (ItemsetWithCount i : patterns) {
+ for (Integer item : i.getItems()) {
+ itemCounts.compute(item, (k, v) -> v == null ? i.getCount() : v + i.getCount());
+ }
+ }
+
+ for (Map.Entry e : itemCounts.entrySet()) {
+ if (e.getValue() >= countRequiredForSupport) {
+ frequentItemCounts.put(e.getKey(), e.getValue());
+ }
+ }
+
+ // we have to materialize a canonical order so that items with equal counts
+ // are consistently ordered when they are sorted during transaction insertion
+ List> sortedItemCounts = new ArrayList<>(frequentItemCounts.entrySet());
+ sortedItemCounts.sort((i1, i2) -> frequentItemCounts.get(i1.getKey())
+ .compareTo(frequentItemCounts.get(i2.getKey())));
+ for (int i = 0; i < sortedItemCounts.size(); ++i) {
+ frequentItemOrder.put(sortedItemCounts.get(i).getKey(), i);
+ }
+ }
+
+ public void insertConditionalFrequentPatterns(List patterns) {
+ for (ItemsetWithCount is : patterns) {
+ List filtered = is.getItems().stream().filter(i -> frequentItemCounts.containsKey(i)).collect(
+ Collectors.toList());
+ filtered.sort((i1, i2) -> frequentItemOrder.get(i2).compareTo(frequentItemOrder.get(i1)));
+ root.insertTransaction(filtered, 0, is.getCount());
+ }
+ }
+
+ public void insertTransactions(List> transactions) {
+ for (Set t : transactions) {
+ List filtered = t.stream().filter(i -> frequentItemCounts.containsKey(i)).collect(
+ Collectors.toList());
+
+ if (!filtered.isEmpty()) {
+ filtered.sort((i1, i2) -> frequentItemOrder.get(i2).compareTo(frequentItemOrder.get(i1)));
+ root.insertTransaction(filtered, 0, 1);
+ }
+ }
+ }
+
+ public int getSupport(Set pattern) {
+ for (Integer i : pattern) {
+ if (!frequentItemCounts.containsKey(i)) {
+ return 0;
+ }
+ }
+
+ List plist = new ArrayList<>(pattern);
+ // traverse bottom to top
+ plist.sort((i1, i2) -> frequentItemOrder.get(i1).compareTo(frequentItemOrder.get(i2)));
+
+ int count = 0;
+ FPTreeNode pathHead = nodeHeaders.get(plist.get(0));
+ while (pathHead != null) {
+ FPTreeNode curNode = pathHead;
+ int itemsToFind = plist.size();
+
+ while (curNode != null) {
+ if (pattern.contains(curNode.getItem())) {
+ itemsToFind -= 1;
+ }
+
+ if (itemsToFind == 0) {
+ count += pathHead.count;
+ break;
+ }
+
+ curNode = curNode.getParent();
+ }
+ pathHead = pathHead.getNextLink();
+ }
+
+ return count;
+ }
+
+
+ List mineItemsets(Integer supportCountRequired) {
+ List singlePathItemsets = new ArrayList<>();
+ List branchingItemsets = new ArrayList<>();
+
+ // mine single-path itemsets first
+ FPTreeNode curNode = root;
+ FPTreeNode nodeOfBranching = null;
+ Set singlePathNodes = new HashSet<>();
+ while (true) {
+ if (curNode.children != null && curNode.children.size() > 1) {
+ nodeOfBranching = curNode;
+ break;
+ }
+
+ if (curNode != root) {
+ singlePathNodes.add(curNode);
+ }
+
+ if (curNode.children == null || curNode.children.size() == 0) {
+ break;
+ } else {
+ curNode = curNode.children.get(0);
+ }
+ }
+
+ for (Set subset : Sets.powerSet(singlePathNodes)) {
+ if (subset.isEmpty()) {
+ continue;
+ }
+
+ double minSupportInSubset = -1;
+ Set items = new HashSet<>();
+ for (FPTreeNode n : subset) {
+ items.add(n.getItem());
+
+ if (minSupportInSubset == -1 || n.getCount() < minSupportInSubset) {
+ minSupportInSubset = n.getCount();
+ }
+ }
+
+ assert (minSupportInSubset >= supportCountRequired);
+ singlePathItemsets.add(new ItemsetWithCount(items, minSupportInSubset));
+ }
+
+ // the entire tree was a single path...
+ if (nodeOfBranching == null) {
+ return singlePathItemsets;
+ }
+
+ // all of the items in the single path will have been mined now
+ // due to the descending frequency count of the FPTree structure, so
+ // we remove them from consideration in the rest
+
+ // instead of destructively removing the nodes from NodeHeader table
+ // which would be valid but would make mining non-idempotent, we
+ // instead store the nodes to skip in a separate set
+
+ Set alreadyMinedItems = new HashSet<>();
+ for (FPTreeNode node : singlePathNodes) {
+ alreadyMinedItems.add(node.getItem());
+ }
+
+ for (Map.Entry header : nodeHeaders.entrySet()) {
+ if (alreadyMinedItems.contains(header.getKey())) {
+ continue;
+ }
+
+ // add the singleton item set
+ branchingItemsets.add(new ItemsetWithCount(Sets.newHashSet(header.getKey()),
+ frequentItemCounts.get(header.getKey())));
+
+ List conditionalPatternBase = new ArrayList<>();
+
+ // walk each "leaf" node
+ FPTreeNode conditionalNode = header.getValue();
+ while (conditionalNode != null) {
+ final double leafSupport = conditionalNode.getCount();
+
+ // walk the tree up to the branch node
+ Set conditionalPattern = new HashSet<>();
+ FPTreeNode walkNode = conditionalNode.getParent();
+ while (walkNode != nodeOfBranching.getParent() && walkNode != root) {
+ conditionalPattern.add(walkNode.getItem());
+ walkNode = walkNode.getParent();
+ }
+
+ if (conditionalPattern.size() > 0) {
+ conditionalPatternBase.add(new ItemsetWithCount(conditionalPattern, leafSupport));
+ }
+
+ conditionalNode = conditionalNode.getNextLink();
+ }
+
+ if (conditionalPatternBase.isEmpty()) {
+ continue;
+ }
+
+ // build and mine the conditional FPTree
+ FPTree conditionalTree = new FPTree();
+ conditionalTree.insertConditionalFrequentItems(conditionalPatternBase, supportCountRequired);
+ conditionalTree.insertConditionalFrequentPatterns(conditionalPatternBase);
+ List conditionalFrequentItemsets = conditionalTree.mineItemsets(supportCountRequired);
+
+ if (!conditionalFrequentItemsets.isEmpty()) {
+ for (ItemsetWithCount is : conditionalFrequentItemsets) {
+ is.getItems().add(header.getKey());
+ }
+
+ branchingItemsets.addAll(conditionalFrequentItemsets);
+ }
+ }
+
+ if (singlePathItemsets.isEmpty()) {
+ return branchingItemsets;
+ }
+
+ // take the cross product of the mined itemsets
+ List ret = new ArrayList<>();
+
+ ret.addAll(singlePathItemsets);
+ ret.addAll(branchingItemsets);
+
+ for (ItemsetWithCount i : singlePathItemsets) {
+ for (ItemsetWithCount j : branchingItemsets) {
+ Set combinedItems = new HashSet<>();
+ combinedItems.addAll(i.getItems());
+ combinedItems.addAll(j.getItems());
+
+ ret.add(new ItemsetWithCount(combinedItems, Math.min(i.getCount(), j.getCount())));
+ }
+ }
+
+ return ret;
+ }
+ }
+
+
+ public List getItemsetsWithSupportRatio(List> transactions,
+ Double supportRatio) {
+ return getItemsetsWithSupportRatio(transactions, null, supportRatio);
+ }
+
+ public List getItemsetsWithSupportRatio(List> transactions,
+ Map initialCounts,
+ Double supportRatio) {
+ return getItemsetsWithSupportCount(transactions, initialCounts, supportRatio * transactions.size());
+ }
+
+ public List getItemsetsWithSupportCount(List> transactions,
+ Double supportCount) {
+ return getItemsetsWithSupportCount(transactions, null, supportCount);
+
+ }
+
+ public List getItemsetsWithSupportCount(List> transactions,
+ Map initialCounts,
+ Double supportCount) {
+ return getItemsetsWithSupportCount(transactions, initialCounts, supportCount, false);
+ }
+
+ protected FPTree constructTree(List> transactions, int supportCount) {
+ FPTree fp = new FPTree();
+ fp.insertFrequentItems(transactions, supportCount);
+ fp.insertTransactions(transactions);
+ return fp;
+ }
+
+ public List getItemsetsWithSupportCount(List> transactions,
+ Map initialCounts,
+ Double supportCount,
+ boolean printTreeDebug) {
+ FPTree fp = new FPTree();
+ int countRequiredForSupport = supportCount.intValue();
+// log.debug("count required: {}", countRequiredForSupport);
+
+ long st = System.currentTimeMillis();
+
+ if (initialCounts == null) {
+ fp.insertFrequentItems(transactions, countRequiredForSupport);
+ } else {
+ fp.setFrequentCounts(initialCounts);
+ }
+
+ fp.insertFrequentItems(transactions, countRequiredForSupport);
+ fp.insertTransactions(transactions);
+ long en = System.currentTimeMillis();
+
+// log.debug("FPTree load: {}", en - st);
+
+ //fp.printTreeDebug();
+
+ st = System.currentTimeMillis();
+ List ret = fp.mineItemsets(countRequiredForSupport);
+ en = System.currentTimeMillis();
+
+// log.debug("FPTree mine: {}", en - st);
+
+ return ret;
+ }
+
+ // ugh, this is a really ugly function sig, but it's efficient
+ public List getCounts(
+ List> transactions,
+ Map initialCounts,
+ Set targetItems,
+ List toCount) {
+ FPTree countTree = new FPTree();
+
+ Map frequentCounts = new HashMap<>();
+
+ for (Integer i : targetItems) {
+ Double initialCount = initialCounts.get(i);
+ if (initialCount == null) {
+ initialCount = 0.;
+ }
+ frequentCounts.put(i, initialCount);
+ }
+
+ countTree.setFrequentCounts(frequentCounts);
+ countTree.insertTransactions(transactions);
+
+ List ret = new ArrayList<>();
+ for (ItemsetWithCount c : toCount) {
+ ret.add(new ItemsetWithCount(c.getItems(), countTree.getSupport(c.getItems())));
+ }
+
+ return ret;
+ }
+}
diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/FPGrowthEmerging.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/FPGrowthEmerging.java
new file mode 100644
index 000000000..02669039c
--- /dev/null
+++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/FPGrowthEmerging.java
@@ -0,0 +1,184 @@
+package edu.stanford.futuredata.macrobase.analysis.summary.itemset;
+
+import com.google.common.collect.Sets;
+import edu.stanford.futuredata.macrobase.analysis.summary.count.ExactCount;
+import edu.stanford.futuredata.macrobase.analysis.summary.itemset.result.ItemsetResult;
+import edu.stanford.futuredata.macrobase.analysis.summary.itemset.result.ItemsetWithCount;
+
+import java.util.*;
+
+
+public class FPGrowthEmerging {
+ private boolean combinationsEnabled = true;
+
+ public FPGrowthEmerging() {};
+ public FPGrowthEmerging setCombinationsEnabled(boolean flag) {
+ this.combinationsEnabled = flag;
+ return this;
+ }
+
+
+ private List getSingletonItemsets(List> inliers,
+ List> outliers,
+ double minSupport,
+ double minRatio) {
+ int supportCountRequired = (int) (outliers.size() * minSupport);
+
+ List ret = new ArrayList<>();
+
+ Map inlierCounts = new ExactCount().count(inliers).getCounts();
+ Map outlierCounts = new ExactCount().count(outliers).getCounts();
+
+ for (Map.Entry attrOutlierCountEntry : outlierCounts.entrySet()) {
+ if (attrOutlierCountEntry.getValue() < supportCountRequired) {
+ continue;
+ }
+
+ int item = attrOutlierCountEntry.getKey();
+ Double attrInlierCount = inlierCounts.get(item);
+
+ double ratio = RiskRatio.compute(attrInlierCount,
+ attrOutlierCountEntry.getValue(),
+ inliers.size(),
+ outliers.size());
+
+ if (ratio > minRatio) {
+ ret.add(new ItemsetResult(
+ attrOutlierCountEntry.getValue() / outliers.size(),
+ attrOutlierCountEntry.getValue(),
+ ratio,
+ Collections.singleton(item)
+ ));
+ }
+ }
+ return ret;
+ }
+
+ public List getEmergingItemsetsWithMinSupport(List> inliers,
+ List> outliers,
+ double minSupport,
+ double minRatio) {
+ if (!combinationsEnabled || (inliers.size() > 0 && inliers.get(0).size() == 1)) {
+ return getSingletonItemsets(inliers, outliers, minSupport, minRatio);
+ }
+
+ ArrayList> outlierTransactions = new ArrayList<>();
+
+ Map inlierCounts = new ExactCount().count(inliers).getCounts();
+ Map outlierCounts = new ExactCount().count(outliers).getCounts();
+
+ Map supportedOutlierCounts = new HashMap<>();
+
+ int supportCountRequired = (int) (outliers.size() * minSupport);
+
+ for (Set o: outliers) {
+ Set txn = null;
+
+ for (int i : o) {
+ double outlierCount = outlierCounts.get(i);
+ if (outlierCount >= supportCountRequired) {
+ Number inlierCount = inlierCounts.get(i);
+
+ double outlierInlierRatio = RiskRatio.compute(inlierCount,
+ outlierCount,
+ inliers.size(),
+ outliers.size());
+
+ if (outlierInlierRatio > minRatio) {
+ if (txn == null) {
+ txn = new HashSet<>();
+ }
+
+ if (!supportedOutlierCounts.containsKey(i)) {
+ supportedOutlierCounts.put(i, outlierCount);
+ }
+
+ txn.add(i);
+ }
+ }
+ }
+
+ if (txn != null) {
+ outlierTransactions.add(txn);
+ }
+ }
+
+ FPGrowth fpg = new FPGrowth();
+ List iwc = fpg.getItemsetsWithSupportCount(
+ outlierTransactions,
+ supportedOutlierCounts,
+ outliers.size() * minSupport);
+
+ iwc.sort((x, y) -> x.getCount() != y.getCount() ?
+ -Double.compare(x.getCount(), y.getCount()) :
+ -Double.compare(x.getItems().size(), y.getItems().size()));
+
+ Set ratioItemsToCheck = new HashSet<>();
+ List ratioSetsToCheck = new ArrayList<>();
+ List ret = new ArrayList<>();
+
+ Set prevSet = null;
+ Double prevCount = -1.;
+ for (ItemsetWithCount i : iwc) {
+ if (i.getCount() == prevCount) {
+ if (prevSet != null && Sets.difference(i.getItems(), prevSet).size() == 0) {
+ continue;
+ }
+ }
+
+
+ prevCount = i.getCount();
+ prevSet = i.getItems();
+
+ if (i.getItems().size() == 1) {
+ Number inlierCount = inlierCounts.get(i.getItems().iterator().next());
+
+ double ratio = RiskRatio.compute(inlierCount,
+ i.getCount(),
+ inliers.size(),
+ outliers.size());
+
+ ret.add(new ItemsetResult(i.getCount() / (double) outliers.size(),
+ i.getCount(),
+ ratio,
+ i.getItems()));
+ } else {
+ ratioItemsToCheck.addAll(i.getItems());
+ ratioSetsToCheck.add(i);
+ }
+ }
+
+ // check the ratios of any itemsets we just marked
+ FPGrowth inlierTree = new FPGrowth();
+ List matchingInlierCounts = inlierTree.getCounts(inliers,
+ inlierCounts,
+ ratioItemsToCheck,
+ ratioSetsToCheck);
+
+ assert (matchingInlierCounts.size() == ratioSetsToCheck.size());
+ for (int i = 0; i < matchingInlierCounts.size(); ++i) {
+ ItemsetWithCount ic = matchingInlierCounts.get(i);
+ ItemsetWithCount oc = ratioSetsToCheck.get(i);
+
+ double ratio = RiskRatio.compute(ic.getCount(),
+ oc.getCount(),
+ inliers.size(),
+ outliers.size());
+
+ if (ratio >= minRatio) {
+ ret.add(new ItemsetResult(oc.getCount() / (double) outliers.size(),
+ oc.getCount(),
+ ratio,
+ oc.getItems()));
+ }
+ }
+
+
+ // finally sort one last time
+ ret.sort((x, y) -> x.getNumRecords() != y.getNumRecords() ?
+ -Double.compare(x.getNumRecords(), y.getNumRecords()) :
+ -Double.compare(x.getItems().size(), y.getItems().size()));
+
+ return ret;
+ }
+}
diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/RiskRatio.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/RiskRatio.java
new file mode 100644
index 000000000..c4c9ff75e
--- /dev/null
+++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/RiskRatio.java
@@ -0,0 +1,56 @@
+package edu.stanford.futuredata.macrobase.analysis.summary.itemset;
+
+public class RiskRatio {
+ private static double computeDouble(double exposedInlierCount,
+ double exposedOutlierCount,
+ double totalInliers,
+ double totalOutliers) {
+ double totalExposedCount = exposedInlierCount + exposedOutlierCount;
+ double unexposedOutlierCount = (totalOutliers - exposedOutlierCount);
+ double totalMinusExposedCount = totalInliers + totalOutliers - totalExposedCount;
+
+ // no exposure occurred
+ if (totalExposedCount == 0) {
+ return 0;
+ }
+
+ // we only exposed this ratio, everything matched!
+ if (totalMinusExposedCount == 0) {
+ return 0;
+ }
+
+ // all outliers had this pattern
+ if (unexposedOutlierCount == 0) {
+ return Double.POSITIVE_INFINITY;
+ }
+
+ return (exposedOutlierCount / totalExposedCount) /
+ (unexposedOutlierCount / totalMinusExposedCount);
+ }
+
+ public static double compute(Number exposedInlierCount,
+ Number exposedOutlierCount,
+ Number totalInliers,
+ Number totalOutliers) {
+ if(exposedInlierCount == null) {
+ exposedInlierCount = 0.;
+ }
+
+ if(exposedOutlierCount == null) {
+ exposedOutlierCount = 0.;
+ }
+
+ if(totalInliers == null) {
+ totalInliers = 0.;
+ }
+
+ if(totalOutliers == null) {
+ totalOutliers = 0.;
+ }
+
+ return computeDouble(exposedInlierCount.doubleValue(),
+ exposedOutlierCount.doubleValue(),
+ totalInliers.doubleValue(),
+ totalOutliers.doubleValue());
+ }
+}
diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/result/AttributeSet.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/result/AttributeSet.java
new file mode 100644
index 000000000..01922f657
--- /dev/null
+++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/result/AttributeSet.java
@@ -0,0 +1,75 @@
+package edu.stanford.futuredata.macrobase.analysis.summary.itemset.result;
+
+import edu.stanford.futuredata.macrobase.analysis.summary.itemset.AttributeEncoder;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.StringJoiner;
+
+public class AttributeSet {
+ private double support;
+ private long numRecords;
+ private double ratioToInliers;
+ private Map items = new HashMap<>();
+
+ public AttributeSet(ItemsetResult its, AttributeEncoder encoder) {
+ this.support = its.getSupport();
+ this.numRecords = (long)its.getNumRecords();
+ this.ratioToInliers = its.getRatioToInliers();
+ its.getItems().forEach(i -> items.put(encoder.decodeColumnName(i), encoder.decodeValue(i)));
+ }
+
+ public AttributeSet(double support,
+ double numRecords,
+ double ratioToInliers,
+ Map items) {
+ this.support = support;
+ this.numRecords = (long)numRecords;
+ this.ratioToInliers = ratioToInliers;
+ this.items = items;
+ }
+
+ public String prettyPrint() {
+ StringJoiner joiner = new StringJoiner("\n");
+ items.forEach((k, v) -> joiner.add(k+"="+v));
+
+ return String.format("support: %f\n" +
+ "records: %d\n" +
+ "ratio: %f\n" +
+ "\nColumns:\n%s\n\n",
+ support,
+ numRecords,
+ ratioToInliers,
+ joiner.toString());
+ }
+
+ public double getSupport() {
+ return support;
+ }
+
+ public double getNumRecords() {
+ return numRecords;
+ }
+
+ public double getRatioToInliers() {
+ return ratioToInliers;
+ }
+
+ public void setRatioToInliers(double ratio) {
+ ratioToInliers = ratio;
+ }
+
+ public Map getItems() {
+ return items;
+ }
+
+ @Override
+ public String toString() {
+ return "AttributeSet{" +
+ "support=" + support +
+ ", numRecords=" + numRecords +
+ ", ratioToInliers=" + ratioToInliers +
+ ", items=" + items +
+ '}';
+ }
+}
\ No newline at end of file
diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/result/ItemsetResult.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/result/ItemsetResult.java
new file mode 100644
index 000000000..6370b0beb
--- /dev/null
+++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/result/ItemsetResult.java
@@ -0,0 +1,46 @@
+package edu.stanford.futuredata.macrobase.analysis.summary.itemset.result;
+
+import java.util.Set;
+
+public class ItemsetResult {
+ private double support;
+ private double numRecords;
+ private double ratioToInliers;
+ private Set items;
+
+ public ItemsetResult(double support,
+ double numRecords,
+ double ratioToInliers,
+ Set items) {
+ this.support = support;
+ this.numRecords = numRecords;
+ this.ratioToInliers = ratioToInliers;
+ this.items = items;
+ }
+
+ public double getSupport() {
+ return support;
+ }
+
+ public double getNumRecords() {
+ return numRecords;
+ }
+
+ public double getRatioToInliers() {
+ return ratioToInliers;
+ }
+
+ public Set getItems() {
+ return items;
+ }
+
+ @Override
+ public String toString() {
+ return "ItemsetResult{" +
+ "support=" + support +
+ ", numRecords=" + numRecords +
+ ", ratioToInliers=" + ratioToInliers +
+ ", items=" + items +
+ '}';
+ }
+}
\ No newline at end of file
diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/result/ItemsetWithCount.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/result/ItemsetWithCount.java
new file mode 100644
index 000000000..a603154b3
--- /dev/null
+++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/result/ItemsetWithCount.java
@@ -0,0 +1,32 @@
+package edu.stanford.futuredata.macrobase.analysis.summary.itemset.result;
+
+import java.util.Set;
+
+public class ItemsetWithCount {
+ private Set items;
+ private double count;
+
+ public ItemsetWithCount(Set items, double count) {
+ this.items = items;
+ this.count = count;
+ }
+
+ public Set getItems() {
+ return items;
+ }
+
+ public double getCount() {
+ return count;
+ }
+
+ @Override
+ public boolean equals(Object o) {
+ if (o == null) {
+ return false;
+ } else if (!(o instanceof ItemsetWithCount)) {
+ return false;
+ }
+ final ItemsetWithCount other = (ItemsetWithCount) o;
+ return (Math.round(other.getCount()) == Math.round(count)) && (other.getItems().equals(items));
+ }
+}
\ No newline at end of file
diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/DataFrame.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/DataFrame.java
new file mode 100644
index 000000000..9d51efba2
--- /dev/null
+++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/DataFrame.java
@@ -0,0 +1,317 @@
+package edu.stanford.futuredata.macrobase.datamodel;
+
+import edu.stanford.futuredata.macrobase.util.MacrobaseInternalError;
+
+import java.util.ArrayList;
+import java.util.List;
+import java.util.function.DoublePredicate;
+import java.util.function.Predicate;
+
+/**
+ * Column-based dataframe object.
+ * loadRows and addColumn methods mutate the dataframe and are the primary
+ * ways of initializing the data in the dataframe.
+ */
+public class DataFrame {
+ private Schema schema;
+
+ private ArrayList stringCols;
+ private ArrayList doubleCols;
+ private ArrayList indexToTypeIndex;
+
+ private int numRows;
+
+ public DataFrame() {
+ this.schema = new Schema();
+ this.stringCols = new ArrayList<>();
+ this.doubleCols = new ArrayList<>();
+ this.indexToTypeIndex = new ArrayList<>();
+ this.numRows = 0;
+ }
+
+ /**
+ * Creates a dataframe from a list of rows
+ * Slower than creating a dataframe column by column using addXColumn methods.
+ * @param schema Schema to use
+ * @param rows Data to load
+ */
+ public DataFrame(Schema schema, List rows) {
+ this();
+ this.schema = schema;
+ this.numRows = rows.size();
+ int d = schema.getNumColumns();
+ for (int c = 0; c < d; c++) {
+ Schema.ColType t = schema.getColumnType(c);
+ if (t == Schema.ColType.STRING) {
+ String[] colValues = new String[numRows];
+ for (int i = 0; i < numRows; i++) {
+ colValues[i] = rows.get(i).getAs(c);
+ }
+ addStringColumnInternal(colValues);
+ } else if (t == Schema.ColType.DOUBLE) {
+ double[] colValues = new double[numRows];
+ for (int i = 0; i < numRows; i++) {
+ colValues[i] = rows.get(i).getAs(c);
+ }
+ addDoubleColumnInternal(colValues);
+ } else {
+ throw new MacrobaseInternalError("Invalid ColType");
+ }
+ }
+ }
+
+ /**
+ * @return shallow copy of dataframe
+ */
+ public DataFrame copy() {
+ DataFrame other = new DataFrame();
+ other.schema = schema.copy();
+ other.stringCols = new ArrayList<>(stringCols);
+ other.doubleCols = new ArrayList<>(doubleCols);
+ other.indexToTypeIndex = new ArrayList<>(indexToTypeIndex);
+ other.numRows = numRows;
+ return other;
+ }
+
+ public Schema getSchema() {return this.schema;}
+ public int getNumRows() {return numRows;}
+ public ArrayList getDoubleCols() { return doubleCols; }
+ public ArrayList getStringCols() { return stringCols; }
+
+ // Fast Column-based methods
+ private void addDoubleColumnInternal(double[] colValues) {
+ doubleCols.add(colValues);
+ indexToTypeIndex.add(doubleCols.size()-1);
+ }
+ public DataFrame addDoubleColumn(String colName, double[] colValues) {
+ if (numRows == 0) {
+ numRows = colValues.length;
+ }
+ schema.addColumn(Schema.ColType.DOUBLE, colName);
+ addDoubleColumnInternal(colValues);
+ return this;
+ }
+ private void addStringColumnInternal(String[] colValues) {
+ stringCols.add(colValues);
+ indexToTypeIndex.add(stringCols.size()-1);
+ }
+ public DataFrame addStringColumn(String colName, String[] colValues) {
+ if (numRows == 0) {
+ numRows = colValues.length;
+ }
+ schema.addColumn(Schema.ColType.STRING, colName);
+ addStringColumnInternal(colValues);
+ return this;
+ }
+
+ protected int[] getSubIndices(List columns) {
+ int d = columns.size();
+ int[] typeSubIndices = new int[d];
+ for (int i = 0; i < d; i++) {
+ typeSubIndices[i] = indexToTypeIndex.get(columns.get(i));
+ }
+ return typeSubIndices;
+ }
+
+ public double[] getDoubleColumn(int columnIdx) {
+ return doubleCols.get(indexToTypeIndex.get(columnIdx));
+ }
+ public double[] getDoubleColumnByName(String columnName) {
+ return doubleCols.get(indexToTypeIndex.get(schema.getColumnIndex(columnName)));
+ }
+ public ArrayList getDoubleCols(List columns) {
+ ArrayList cols = new ArrayList<>();
+ for (int c : columns) {
+ cols.add(getDoubleColumn(c));
+ }
+ return cols;
+ }
+ public ArrayList getDoubleColsByName(List columns) {
+ return getDoubleCols(this.schema.getColumnIndices(columns));
+ }
+ public String[] getStringColumn(int columnIdx) {
+ return stringCols.get(indexToTypeIndex.get(columnIdx));
+ }
+ public String[] getStringColumnByName(String columnName) {
+ return stringCols.get(indexToTypeIndex.get(schema.getColumnIndex(columnName)));
+ }
+ public ArrayList getStringCols(List columns) {
+ ArrayList cols = new ArrayList<>();
+ for (int c : columns) {
+ cols.add(getStringColumn(c));
+ }
+ return cols;
+ }
+ public ArrayList getStringColsByName(List columns) {
+ return getStringCols(this.schema.getColumnIndices(columns));
+ }
+
+ /**
+ * @param columns column indices to project
+ * @return new dataframe with subset of columns
+ */
+ public DataFrame select(List columns) {
+ DataFrame other = new DataFrame();
+ for (int c : columns) {
+ String columnName = schema.getColumnName(c);
+ Schema.ColType t = schema.getColumnType(c);
+ if (t == Schema.ColType.STRING) {
+ other.addStringColumn(columnName, getStringColumn(c));
+ } else if (t == Schema.ColType.DOUBLE) {
+ other.addDoubleColumn(columnName, getDoubleColumn(c));
+ } else {
+ throw new MacrobaseInternalError("Bad Column Type");
+ }
+ }
+ return other;
+ }
+
+ /**
+ * @param columns column names to project
+ * @return new dataframe with subset of columns
+ */
+ public DataFrame selectByName(List columns) {
+ return select(this.schema.getColumnIndices(columns));
+ }
+
+ /**
+ * @param mask rows to select
+ * @return new dataframe with subset of rows
+ */
+ protected DataFrame filter(boolean[] mask) {
+ DataFrame other = new DataFrame();
+
+ int d = schema.getNumColumns();
+ int numTrue = 0;
+ for (int i = 0; i < numRows; i++) {
+ if (mask[i]) {
+ numTrue++;
+ }
+ }
+ for (int c = 0; c < d; c++) {
+ Schema.ColType t = schema.getColumnType(c);
+ String columnName = schema.getColumnName(c);
+ if (t == Schema.ColType.STRING) {
+ String[] oldColumn = getStringColumn(c);
+ String[] newColumn = new String[numTrue];
+ int j = 0;
+ for (int i = 0; i < numRows; i++) {
+ if (mask[i]) {
+ newColumn[j] = oldColumn[i];
+ j++;
+ }
+ }
+ other.addStringColumn(columnName, newColumn);
+ } else if (t == Schema.ColType.DOUBLE) {
+ double[] oldColumn = getDoubleColumn(c);
+ double[] newColumn = new double[numTrue];
+ int j = 0;
+ for (int i = 0; i < numRows; i++) {
+ if (mask[i]) {
+ newColumn[j] = oldColumn[i];
+ j++;
+ }
+ }
+ other.addDoubleColumn(columnName, newColumn);
+ } else {
+ throw new MacrobaseInternalError("Bad Column Type");
+ }
+ }
+ return other;
+ }
+ public DataFrame filter(int columnIdx, Predicate