From fdfd2bb97a206987f7f51dc1cfbf49ec9a0aee9e Mon Sep 17 00:00:00 2001 From: Edward Gan Date: Fri, 24 Mar 2017 17:46:56 -0700 Subject: [PATCH] macrobase-lib 0.1 (#192) * Initial DataFrame code * Change ItemsetEncoder to encode columns * Add get column method for dataframes * Adding select and filter to dataframe * Initial test based on seen usage * Initial refactor of BatchSummarize, FPGrowth, FPGrowthEmerging etc. * dataframe CSV loader * fixing dependencies * dataframe csv loader * updating supervisedtest * Move ItemsetEncoder to summarizer * percentile classifier and unsupervised csv test * support for building source and javadoc jars `mvn verify` will now build two associated jars: one for sources and one for javadocs, which we can upload to maven to make it easier to use the library * Addressed some of Peter's comments - renamed core -> runtime - changed groupId in pom.xml - Rename EncodedItemsetResult->ItemsetResult, ItemsetResult->AttributeSet, ItemsetEncoder->AttributeEncoder - Throw MacrobaseException instead of RuntimeException - Renamed all filter methods in DataFrame to filter() - Removed Apriori and dead code CSVParser and DataFrame not finished * Minor tweaks in filter API * Removing unstable csv parser * Add getRows() in DataFrame; enable attr combinations by default * improving test coverage * changing loadrows to a constructor Trying to make dataframes as immutable as possible * core=>runtime renaming clean up * Rename modules * More javadoc comments * pgp plugin * Changed exception in DataFrame * test coverage, bugfix for summarizer * Add deploy plugins to lib module You can upload the SNAPSHOT module right now by 'mvn clean deploy'! 
* independent maven package * adding full package namespace --- .travis.yml | 1 + assembly/pom.xml | 10 +- bin/batch.sh | 2 +- bin/server.sh | 2 +- bin/streaming.sh | 2 +- contrib/pom.xml | 8 +- frontend/pom.xml | 8 +- {core => legacy}/pom.xml | 28 +- .../src/main/java/macrobase/MacroBase.java | 0 .../BatchingPercentileClassifier.java | 0 .../EWAppxPercentileOutlierClassifier.java | 0 .../analysis/classify/OutlierClassifier.java | 0 .../classify/StaticThresholdClassifier.java | 0 .../analysis/pipeline/BasePipeline.java | 0 .../pipeline/BasicBatchedPipeline.java | 0 .../BasicOneShotEWStreamingPipeline.java | 0 .../macrobase/analysis/pipeline/Pipeline.java | 0 .../pipeline/operator/MBConsumer.java | 0 .../analysis/pipeline/operator/MBGroupBy.java | 0 .../pipeline/operator/MBOperator.java | 0 .../pipeline/operator/MBProducer.java | 0 .../pipeline/stream/MBMultiInputStream.java | 0 .../analysis/pipeline/stream/MBStream.java | 0 .../analysis/result/AnalysisResult.java | 0 .../result/OutlierClassificationResult.java | 0 .../java/macrobase/analysis/sample/AChao.java | 0 .../sample/FlexibleDampedReservoir.java | 0 .../analysis/stats/Autocorrelation.java | 0 .../analysis/stats/BatchTrainScore.java | 0 .../macrobase/analysis/stats/Covariance.java | 0 .../java/macrobase/analysis/stats/FFT.java | 0 .../macrobase/analysis/stats/Gaussian.java | 0 .../java/macrobase/analysis/stats/MAD.java | 0 .../macrobase/analysis/stats/MinCovDet.java | 0 .../analysis/stats/RandomProjection.java | 0 .../stats/RobustEmpiricalCovariance.java | 0 .../macrobase/analysis/stats/Truncate.java | 0 .../macrobase/analysis/stats/Winsorizer.java | 0 .../java/macrobase/analysis/stats/ZScore.java | 0 .../analysis/summary/BatchSummarizer.java | 0 .../summary/EWStreamingSummarizer.java | 0 .../analysis/summary/Summarizer.java | 0 .../macrobase/analysis/summary/Summary.java | 0 .../count/AmortizedMaintenanceCounter.java | 0 .../summary/count/ApproximateCount.java | 0 .../analysis/summary/count/ExactCount.java | 
0 .../summary/count/SpaceSavingList.java | 0 .../analysis/summary/itemset/Apriori.java | 0 ...ExponentiallyDecayingEmergingItemsets.java | 0 .../analysis/summary/itemset/FPGrowth.java | 0 .../summary/itemset/FPGrowthEmerging.java | 0 .../analysis/summary/itemset/RiskRatio.java | 0 .../summary/itemset/StreamingFPGrowth.java | 0 .../summary/itemset/result/ItemsetResult.java | 0 .../itemset/result/ItemsetWithCount.java | 0 .../transform/BatchScoreFeatureTransform.java | 0 .../transform/EWFeatureTransform.java | 0 .../analysis/transform/FeatureTransform.java | 0 .../transform/LinearMetricNormalizer.java | 0 .../transform/LowMetricTransform.java | 0 .../conf/ConfigurationException.java | 0 .../java/macrobase/conf/MacroBaseConf.java | 0 .../macrobase/conf/MacroBaseDefaults.java | 0 .../conf/MissingParameterException.java | 0 .../main/java/macrobase/datamodel/Datum.java | 0 .../java/macrobase/ingest/CSVIngester.java | 0 .../java/macrobase/ingest/DataIngester.java | 0 .../java/macrobase/ingest/DatumEncoder.java | 0 .../macrobase/ingest/DiskCachingIngester.java | 0 .../java/macrobase/ingest/MySQLIngester.java | 0 .../macrobase/ingest/PostgresIngester.java | 0 .../java/macrobase/ingest/SQLIngester.java | 0 .../macrobase/ingest/result/ColumnValue.java | 0 .../java/macrobase/ingest/result/RowSet.java | 0 .../java/macrobase/ingest/result/Schema.java | 0 .../runtime/MacroBaseApplication.java | 0 .../command/MacroBasePipelineCommand.java | 0 .../java/macrobase/util/CheckedSupplier.java | 0 .../main/java/macrobase/util/Periodic.java | 0 .../StaticThresholdClassifierTest.java | 0 .../pipeline/operator/MBGroupByTest.java | 0 .../macrobase/analysis/sample/AChaoTest.java | 0 .../analysis/stats/AutocorrelationTest.java | 0 .../macrobase/analysis/stats/FFTTest.java | 0 .../analysis/stats/GaussianTest.java | 0 .../macrobase/analysis/stats/MADTest.java | 0 .../analysis/stats/MinCovDetTest.java | 0 .../analysis/stats/RandomProjectionTest.java | 0 
.../stats/RobustEmpiricalCovarianceTest.java | 0 .../analysis/stats/TruncateTest.java | 0 .../analysis/stats/WinsorizerTest.java | 0 .../macrobase/analysis/stats/ZScoreTest.java | 0 .../AmortizedMaintenanceCounterTest.java | 0 .../summary/count/ExactCountTest.java | 0 .../summary/count/SpaceSavingTest.java | 0 .../analysis/summary/itemset/AprioriTest.java | 0 .../summary/itemset/FPGrowthTest.java | 0 .../summary/itemset/RiskRatioTest.java | 0 .../itemset/StreamingFPGrowthTest.java | 0 .../transform/LinearMetricNormalizerTest.java | 0 .../transform/LowMetricTransformTest.java | 0 .../macrobase/conf/MacroBaseConfTest.java | 0 .../java/macrobase/conf/MockIngester.java | 0 .../macrobase/ingest/CSVIngesterTest.java | 0 .../ingest/CachingSQLIngesterTest.java | 0 .../macrobase/ingest/DatumEncoderTest.java | 0 .../macrobase/ingest/SQLIngesterTest.java | 0 .../macrobase/pipeline/BasePipelineTest.java | 0 .../pipeline/BasicBatchedPipelineTest.java | 0 .../BasicOneShotEWStreamingPipelineTest.java | 0 .../macrobase/pipeline/MockTransform.java | 0 .../pipeline/operator/MBOperatorTest.java | 0 .../runtime/MacroBaseApplicationTest.java | 0 .../command/MacroBaseMockPipeline.java | 0 .../command/MacroBasePipelineCommandTest.java | 0 .../src/test/java/macrobase/util/Drainer.java | 0 .../src/test/resources/conf/simple.yaml | 0 .../src/test/resources/data/missingdata.csv | 0 .../src/test/resources/data/sensor10k.csv.gz | Bin .../src/test/resources/data/simple.csv | 0 lib/pom.xml | 147 +++ .../classify/PercentileClassifier.java | 117 ++ .../analysis/summary/BatchSummarizer.java | 133 +++ .../analysis/summary/Explanation.java | 68 ++ .../analysis/summary/count/ExactCount.java | 28 + .../summary/itemset/AttributeEncoder.java | 70 ++ .../analysis/summary/itemset/FPGrowth.java | 472 ++++++++ .../summary/itemset/FPGrowthEmerging.java | 184 +++ .../analysis/summary/itemset/RiskRatio.java | 56 + .../summary/itemset/result/AttributeSet.java | 75 ++ .../summary/itemset/result/ItemsetResult.java 
| 46 + .../itemset/result/ItemsetWithCount.java | 32 + .../macrobase/datamodel/DataFrame.java | 317 +++++ .../futuredata/macrobase/datamodel/Row.java | 69 ++ .../macrobase/datamodel/Schema.java | 71 ++ .../macrobase/ingest/CSVDataFrameLoader.java | 86 ++ .../macrobase/ingest/DataFrameLoader.java | 11 + .../macrobase/operator/Operator.java | 6 + .../macrobase/operator/Transformer.java | 6 + .../macrobase/util/MacrobaseException.java | 7 + .../util/MacrobaseInternalError.java | 7 + .../macrobase/SupervisedEventTest.java | 121 ++ .../macrobase/UnsupervisedCSVTest.java | 126 ++ .../classify/PercentileClassifierTest.java | 65 ++ .../summary/count/ExactCountTest.java | 30 + .../analysis/summary/itemset/Apriori.java | 141 +++ .../summary/itemset/AttributeEncoderTest.java | 46 + .../summary/itemset/FPGrowthTest.java | 171 +++ .../summary/itemset/RiskRatioTest.java | 35 + .../macrobase/datamodel/DataFrameTest.java | 46 + .../macrobase/datamodel/RowTest.java | 40 + .../macrobase/datamodel/SchemaTest.java | 19 + .../ingest/DataFrameCSVLoaderTest.java | 31 + lib/src/test/resources/sample.csv | 1021 +++++++++++++++++ lib/src/test/resources/tiny.csv | 4 + pom.xml | 51 +- 156 files changed, 3972 insertions(+), 44 deletions(-) rename {core => legacy}/pom.xml (75%) rename {core => legacy}/src/main/java/macrobase/MacroBase.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/classify/BatchingPercentileClassifier.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/classify/EWAppxPercentileOutlierClassifier.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/classify/OutlierClassifier.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/classify/StaticThresholdClassifier.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/pipeline/BasePipeline.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/pipeline/BasicBatchedPipeline.java (100%) rename {core => 
legacy}/src/main/java/macrobase/analysis/pipeline/BasicOneShotEWStreamingPipeline.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/pipeline/Pipeline.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/pipeline/operator/MBConsumer.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/pipeline/operator/MBGroupBy.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/pipeline/operator/MBOperator.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/pipeline/operator/MBProducer.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/pipeline/stream/MBMultiInputStream.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/pipeline/stream/MBStream.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/result/AnalysisResult.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/result/OutlierClassificationResult.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/sample/AChao.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/sample/FlexibleDampedReservoir.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/stats/Autocorrelation.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/stats/BatchTrainScore.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/stats/Covariance.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/stats/FFT.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/stats/Gaussian.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/stats/MAD.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/stats/MinCovDet.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/stats/RandomProjection.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/stats/RobustEmpiricalCovariance.java (100%) rename {core => 
legacy}/src/main/java/macrobase/analysis/stats/Truncate.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/stats/Winsorizer.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/stats/ZScore.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/summary/BatchSummarizer.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/summary/EWStreamingSummarizer.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/summary/Summarizer.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/summary/Summary.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/summary/count/AmortizedMaintenanceCounter.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/summary/count/ApproximateCount.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/summary/count/ExactCount.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/summary/count/SpaceSavingList.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/summary/itemset/Apriori.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/summary/itemset/ExponentiallyDecayingEmergingItemsets.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/summary/itemset/FPGrowth.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/summary/itemset/FPGrowthEmerging.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/summary/itemset/RiskRatio.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/summary/itemset/StreamingFPGrowth.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/summary/itemset/result/ItemsetResult.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/summary/itemset/result/ItemsetWithCount.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/transform/BatchScoreFeatureTransform.java (100%) rename {core => 
legacy}/src/main/java/macrobase/analysis/transform/EWFeatureTransform.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/transform/FeatureTransform.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/transform/LinearMetricNormalizer.java (100%) rename {core => legacy}/src/main/java/macrobase/analysis/transform/LowMetricTransform.java (100%) rename {core => legacy}/src/main/java/macrobase/conf/ConfigurationException.java (100%) rename {core => legacy}/src/main/java/macrobase/conf/MacroBaseConf.java (100%) rename {core => legacy}/src/main/java/macrobase/conf/MacroBaseDefaults.java (100%) rename {core => legacy}/src/main/java/macrobase/conf/MissingParameterException.java (100%) rename {core => legacy}/src/main/java/macrobase/datamodel/Datum.java (100%) rename {core => legacy}/src/main/java/macrobase/ingest/CSVIngester.java (100%) rename {core => legacy}/src/main/java/macrobase/ingest/DataIngester.java (100%) rename {core => legacy}/src/main/java/macrobase/ingest/DatumEncoder.java (100%) rename {core => legacy}/src/main/java/macrobase/ingest/DiskCachingIngester.java (100%) rename {core => legacy}/src/main/java/macrobase/ingest/MySQLIngester.java (100%) rename {core => legacy}/src/main/java/macrobase/ingest/PostgresIngester.java (100%) rename {core => legacy}/src/main/java/macrobase/ingest/SQLIngester.java (100%) rename {core => legacy}/src/main/java/macrobase/ingest/result/ColumnValue.java (100%) rename {core => legacy}/src/main/java/macrobase/ingest/result/RowSet.java (100%) rename {core => legacy}/src/main/java/macrobase/ingest/result/Schema.java (100%) rename {core => legacy}/src/main/java/macrobase/runtime/MacroBaseApplication.java (100%) rename {core => legacy}/src/main/java/macrobase/runtime/command/MacroBasePipelineCommand.java (100%) rename {core => legacy}/src/main/java/macrobase/util/CheckedSupplier.java (100%) rename {core => legacy}/src/main/java/macrobase/util/Periodic.java (100%) rename {core => 
legacy}/src/test/java/macrobase/analysis/classify/StaticThresholdClassifierTest.java (100%) rename {core => legacy}/src/test/java/macrobase/analysis/pipeline/operator/MBGroupByTest.java (100%) rename {core => legacy}/src/test/java/macrobase/analysis/sample/AChaoTest.java (100%) rename {core => legacy}/src/test/java/macrobase/analysis/stats/AutocorrelationTest.java (100%) rename {core => legacy}/src/test/java/macrobase/analysis/stats/FFTTest.java (100%) rename {core => legacy}/src/test/java/macrobase/analysis/stats/GaussianTest.java (100%) rename {core => legacy}/src/test/java/macrobase/analysis/stats/MADTest.java (100%) rename {core => legacy}/src/test/java/macrobase/analysis/stats/MinCovDetTest.java (100%) rename {core => legacy}/src/test/java/macrobase/analysis/stats/RandomProjectionTest.java (100%) rename {core => legacy}/src/test/java/macrobase/analysis/stats/RobustEmpiricalCovarianceTest.java (100%) rename {core => legacy}/src/test/java/macrobase/analysis/stats/TruncateTest.java (100%) rename {core => legacy}/src/test/java/macrobase/analysis/stats/WinsorizerTest.java (100%) rename {core => legacy}/src/test/java/macrobase/analysis/stats/ZScoreTest.java (100%) rename {core => legacy}/src/test/java/macrobase/analysis/summary/count/AmortizedMaintenanceCounterTest.java (100%) rename {core => legacy}/src/test/java/macrobase/analysis/summary/count/ExactCountTest.java (100%) rename {core => legacy}/src/test/java/macrobase/analysis/summary/count/SpaceSavingTest.java (100%) rename {core => legacy}/src/test/java/macrobase/analysis/summary/itemset/AprioriTest.java (100%) rename {core => legacy}/src/test/java/macrobase/analysis/summary/itemset/FPGrowthTest.java (100%) rename {core => legacy}/src/test/java/macrobase/analysis/summary/itemset/RiskRatioTest.java (100%) rename {core => legacy}/src/test/java/macrobase/analysis/summary/itemset/StreamingFPGrowthTest.java (100%) rename {core => legacy}/src/test/java/macrobase/analysis/transform/LinearMetricNormalizerTest.java 
(100%) rename {core => legacy}/src/test/java/macrobase/analysis/transform/LowMetricTransformTest.java (100%) rename {core => legacy}/src/test/java/macrobase/conf/MacroBaseConfTest.java (100%) rename {core => legacy}/src/test/java/macrobase/conf/MockIngester.java (100%) rename {core => legacy}/src/test/java/macrobase/ingest/CSVIngesterTest.java (100%) rename {core => legacy}/src/test/java/macrobase/ingest/CachingSQLIngesterTest.java (100%) rename {core => legacy}/src/test/java/macrobase/ingest/DatumEncoderTest.java (100%) rename {core => legacy}/src/test/java/macrobase/ingest/SQLIngesterTest.java (100%) rename {core => legacy}/src/test/java/macrobase/pipeline/BasePipelineTest.java (100%) rename {core => legacy}/src/test/java/macrobase/pipeline/BasicBatchedPipelineTest.java (100%) rename {core => legacy}/src/test/java/macrobase/pipeline/BasicOneShotEWStreamingPipelineTest.java (100%) rename {core => legacy}/src/test/java/macrobase/pipeline/MockTransform.java (100%) rename {core => legacy}/src/test/java/macrobase/pipeline/operator/MBOperatorTest.java (100%) rename {core => legacy}/src/test/java/macrobase/runtime/MacroBaseApplicationTest.java (100%) rename {core => legacy}/src/test/java/macrobase/runtime/command/MacroBaseMockPipeline.java (100%) rename {core => legacy}/src/test/java/macrobase/runtime/command/MacroBasePipelineCommandTest.java (100%) rename {core => legacy}/src/test/java/macrobase/util/Drainer.java (100%) rename {core => legacy}/src/test/resources/conf/simple.yaml (100%) rename {core => legacy}/src/test/resources/data/missingdata.csv (100%) rename {core => legacy}/src/test/resources/data/sensor10k.csv.gz (100%) rename {core => legacy}/src/test/resources/data/simple.csv (100%) create mode 100644 lib/pom.xml create mode 100644 lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/classify/PercentileClassifier.java create mode 100644 lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/BatchSummarizer.java create mode 100644 
lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/Explanation.java create mode 100644 lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/count/ExactCount.java create mode 100644 lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/AttributeEncoder.java create mode 100644 lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/FPGrowth.java create mode 100644 lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/FPGrowthEmerging.java create mode 100644 lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/RiskRatio.java create mode 100644 lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/result/AttributeSet.java create mode 100644 lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/result/ItemsetResult.java create mode 100644 lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/result/ItemsetWithCount.java create mode 100644 lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/DataFrame.java create mode 100644 lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/Row.java create mode 100644 lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/Schema.java create mode 100644 lib/src/main/java/edu/stanford/futuredata/macrobase/ingest/CSVDataFrameLoader.java create mode 100644 lib/src/main/java/edu/stanford/futuredata/macrobase/ingest/DataFrameLoader.java create mode 100644 lib/src/main/java/edu/stanford/futuredata/macrobase/operator/Operator.java create mode 100644 lib/src/main/java/edu/stanford/futuredata/macrobase/operator/Transformer.java create mode 100644 lib/src/main/java/edu/stanford/futuredata/macrobase/util/MacrobaseException.java create mode 100644 lib/src/main/java/edu/stanford/futuredata/macrobase/util/MacrobaseInternalError.java create mode 100644 lib/src/test/java/edu/stanford/futuredata/macrobase/SupervisedEventTest.java create 
mode 100644 lib/src/test/java/edu/stanford/futuredata/macrobase/UnsupervisedCSVTest.java create mode 100644 lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/classify/PercentileClassifierTest.java create mode 100644 lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/count/ExactCountTest.java create mode 100644 lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/Apriori.java create mode 100644 lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/AttributeEncoderTest.java create mode 100644 lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/FPGrowthTest.java create mode 100644 lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/RiskRatioTest.java create mode 100644 lib/src/test/java/edu/stanford/futuredata/macrobase/datamodel/DataFrameTest.java create mode 100644 lib/src/test/java/edu/stanford/futuredata/macrobase/datamodel/RowTest.java create mode 100644 lib/src/test/java/edu/stanford/futuredata/macrobase/datamodel/SchemaTest.java create mode 100644 lib/src/test/java/edu/stanford/futuredata/macrobase/ingest/DataFrameCSVLoaderTest.java create mode 100644 lib/src/test/resources/sample.csv create mode 100644 lib/src/test/resources/tiny.csv diff --git a/.travis.yml b/.travis.yml index d866439e2..2130674ed 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,6 @@ language: java jdk: oraclejdk8 +install: mvn install -DskipTests=true -Dgpg.skip=true -Dmaven.javadoc.skip=true -B -V notifications: slack: stanford-futuredata:qmO6Keu8ifOyXHsmSQ97CeLH after_success: diff --git a/assembly/pom.xml b/assembly/pom.xml index f8cd84746..5f48f25b5 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -2,7 +2,7 @@ 4.0.0 - macrobase + edu.stanford.futuredata macrobase 0.1-SNAPSHOT @@ -12,17 +12,17 @@ - macrobase - macrobase-core + edu.stanford.futuredata + macrobase-legacy 0.1-SNAPSHOT - macrobase + edu.stanford.futuredata macrobase-frontend 0.1-SNAPSHOT - 
macrobase + edu.stanford.futuredata macrobase-contrib 0.1-SNAPSHOT diff --git a/bin/batch.sh b/bin/batch.sh index 758bbb63f..ad0ca20a6 100755 --- a/bin/batch.sh +++ b/bin/batch.sh @@ -2,4 +2,4 @@ conf_file=${1:-"conf/batch.yaml"} set -x -java ${JAVA_OPTS} -cp "core/target/classes:frontend/target/classes:frontend/src/main/resources/:contrib/target/classes:assembly/target/*:$CLASSPATH" macrobase.MacroBase pipeline $conf_file +java ${JAVA_OPTS} -cp "legacy/target/classes:frontend/target/classes:frontend/src/main/resources/:contrib/target/classes:assembly/target/*:$CLASSPATH" macrobase.MacroBase pipeline $conf_file diff --git a/bin/server.sh b/bin/server.sh index 11fba3ee1..a663ec786 100755 --- a/bin/server.sh +++ b/bin/server.sh @@ -3,4 +3,4 @@ conf_file=${1:-"conf/macrobase.yaml"} set -x -java ${JAVA_OPTS} -cp "core/target/classes:frontend/target/classes:frontend/src/main/resources/:contrib/target/classes:assembly/target/*:$CLASSPATH" macrobase.runtime.MacroBaseServer server $conf_file +java ${JAVA_OPTS} -cp "legacy/target/classes:frontend/target/classes:frontend/src/main/resources/:contrib/target/classes:assembly/target/*:$CLASSPATH" macrobase.runtime.MacroBaseServer server $conf_file diff --git a/bin/streaming.sh b/bin/streaming.sh index db2576767..16c89b880 100755 --- a/bin/streaming.sh +++ b/bin/streaming.sh @@ -2,4 +2,4 @@ conf_file=${1:-"conf/streaming.yaml"} set -x -java ${JAVA_OPTS} -cp "core/target/classes:frontend/target/classes:frontend/src/main/resources/:contrib/target/classes:assembly/target/*:$CLASSPATH" macrobase.MacroBase pipeline $conf_file +java ${JAVA_OPTS} -cp "legacy/target/classes:frontend/target/classes:frontend/src/main/resources/:contrib/target/classes:assembly/target/*:$CLASSPATH" macrobase.MacroBase pipeline $conf_file diff --git a/contrib/pom.xml b/contrib/pom.xml index a87448244..f0b42c04a 100644 --- a/contrib/pom.xml +++ b/contrib/pom.xml @@ -1,22 +1,22 @@ 4.0.0 - macrobase + edu.stanford.futuredata macrobase-contrib jar 0.1-SNAPSHOT 
macrobase-contrib http://maven.apache.org - macrobase + edu.stanford.futuredata macrobase 0.1-SNAPSHOT - macrobase - macrobase-core + edu.stanford.futuredata + macrobase-legacy 0.1-SNAPSHOT diff --git a/frontend/pom.xml b/frontend/pom.xml index 1f1f535c6..be2ab8939 100644 --- a/frontend/pom.xml +++ b/frontend/pom.xml @@ -1,22 +1,22 @@ 4.0.0 - macrobase + edu.stanford.futuredata macrobase-frontend jar 0.1-SNAPSHOT macrobase-frontend http://maven.apache.org - macrobase + edu.stanford.futuredata macrobase 0.1-SNAPSHOT - macrobase - macrobase-core + edu.stanford.futuredata + macrobase-legacy 0.1-SNAPSHOT diff --git a/core/pom.xml b/legacy/pom.xml similarity index 75% rename from core/pom.xml rename to legacy/pom.xml index f498e55b9..ed21d9ec6 100644 --- a/core/pom.xml +++ b/legacy/pom.xml @@ -1,14 +1,14 @@ 4.0.0 - macrobase - macrobase-core + edu.stanford.futuredata + macrobase-legacy jar 0.1-SNAPSHOT - macrobase-core + macrobase-legacy http://maven.apache.org - macrobase + edu.stanford.futuredata macrobase 0.1-SNAPSHOT @@ -61,6 +61,26 @@ 1.1.1 test + + io.dropwizard + dropwizard-core + ${dropwizard.version} + + + io.dropwizard + dropwizard-assets + ${dropwizard.version} + + + io.dropwizard + dropwizard-logging + ${dropwizard.version} + + + io.dropwizard + dropwizard-db + ${dropwizard.version} + diff --git a/core/src/main/java/macrobase/MacroBase.java b/legacy/src/main/java/macrobase/MacroBase.java similarity index 100% rename from core/src/main/java/macrobase/MacroBase.java rename to legacy/src/main/java/macrobase/MacroBase.java diff --git a/core/src/main/java/macrobase/analysis/classify/BatchingPercentileClassifier.java b/legacy/src/main/java/macrobase/analysis/classify/BatchingPercentileClassifier.java similarity index 100% rename from core/src/main/java/macrobase/analysis/classify/BatchingPercentileClassifier.java rename to legacy/src/main/java/macrobase/analysis/classify/BatchingPercentileClassifier.java diff --git 
a/core/src/main/java/macrobase/analysis/classify/EWAppxPercentileOutlierClassifier.java b/legacy/src/main/java/macrobase/analysis/classify/EWAppxPercentileOutlierClassifier.java similarity index 100% rename from core/src/main/java/macrobase/analysis/classify/EWAppxPercentileOutlierClassifier.java rename to legacy/src/main/java/macrobase/analysis/classify/EWAppxPercentileOutlierClassifier.java diff --git a/core/src/main/java/macrobase/analysis/classify/OutlierClassifier.java b/legacy/src/main/java/macrobase/analysis/classify/OutlierClassifier.java similarity index 100% rename from core/src/main/java/macrobase/analysis/classify/OutlierClassifier.java rename to legacy/src/main/java/macrobase/analysis/classify/OutlierClassifier.java diff --git a/core/src/main/java/macrobase/analysis/classify/StaticThresholdClassifier.java b/legacy/src/main/java/macrobase/analysis/classify/StaticThresholdClassifier.java similarity index 100% rename from core/src/main/java/macrobase/analysis/classify/StaticThresholdClassifier.java rename to legacy/src/main/java/macrobase/analysis/classify/StaticThresholdClassifier.java diff --git a/core/src/main/java/macrobase/analysis/pipeline/BasePipeline.java b/legacy/src/main/java/macrobase/analysis/pipeline/BasePipeline.java similarity index 100% rename from core/src/main/java/macrobase/analysis/pipeline/BasePipeline.java rename to legacy/src/main/java/macrobase/analysis/pipeline/BasePipeline.java diff --git a/core/src/main/java/macrobase/analysis/pipeline/BasicBatchedPipeline.java b/legacy/src/main/java/macrobase/analysis/pipeline/BasicBatchedPipeline.java similarity index 100% rename from core/src/main/java/macrobase/analysis/pipeline/BasicBatchedPipeline.java rename to legacy/src/main/java/macrobase/analysis/pipeline/BasicBatchedPipeline.java diff --git a/core/src/main/java/macrobase/analysis/pipeline/BasicOneShotEWStreamingPipeline.java b/legacy/src/main/java/macrobase/analysis/pipeline/BasicOneShotEWStreamingPipeline.java similarity index 100% 
rename from core/src/main/java/macrobase/analysis/pipeline/BasicOneShotEWStreamingPipeline.java rename to legacy/src/main/java/macrobase/analysis/pipeline/BasicOneShotEWStreamingPipeline.java diff --git a/core/src/main/java/macrobase/analysis/pipeline/Pipeline.java b/legacy/src/main/java/macrobase/analysis/pipeline/Pipeline.java similarity index 100% rename from core/src/main/java/macrobase/analysis/pipeline/Pipeline.java rename to legacy/src/main/java/macrobase/analysis/pipeline/Pipeline.java diff --git a/core/src/main/java/macrobase/analysis/pipeline/operator/MBConsumer.java b/legacy/src/main/java/macrobase/analysis/pipeline/operator/MBConsumer.java similarity index 100% rename from core/src/main/java/macrobase/analysis/pipeline/operator/MBConsumer.java rename to legacy/src/main/java/macrobase/analysis/pipeline/operator/MBConsumer.java diff --git a/core/src/main/java/macrobase/analysis/pipeline/operator/MBGroupBy.java b/legacy/src/main/java/macrobase/analysis/pipeline/operator/MBGroupBy.java similarity index 100% rename from core/src/main/java/macrobase/analysis/pipeline/operator/MBGroupBy.java rename to legacy/src/main/java/macrobase/analysis/pipeline/operator/MBGroupBy.java diff --git a/core/src/main/java/macrobase/analysis/pipeline/operator/MBOperator.java b/legacy/src/main/java/macrobase/analysis/pipeline/operator/MBOperator.java similarity index 100% rename from core/src/main/java/macrobase/analysis/pipeline/operator/MBOperator.java rename to legacy/src/main/java/macrobase/analysis/pipeline/operator/MBOperator.java diff --git a/core/src/main/java/macrobase/analysis/pipeline/operator/MBProducer.java b/legacy/src/main/java/macrobase/analysis/pipeline/operator/MBProducer.java similarity index 100% rename from core/src/main/java/macrobase/analysis/pipeline/operator/MBProducer.java rename to legacy/src/main/java/macrobase/analysis/pipeline/operator/MBProducer.java diff --git a/core/src/main/java/macrobase/analysis/pipeline/stream/MBMultiInputStream.java 
b/legacy/src/main/java/macrobase/analysis/pipeline/stream/MBMultiInputStream.java similarity index 100% rename from core/src/main/java/macrobase/analysis/pipeline/stream/MBMultiInputStream.java rename to legacy/src/main/java/macrobase/analysis/pipeline/stream/MBMultiInputStream.java diff --git a/core/src/main/java/macrobase/analysis/pipeline/stream/MBStream.java b/legacy/src/main/java/macrobase/analysis/pipeline/stream/MBStream.java similarity index 100% rename from core/src/main/java/macrobase/analysis/pipeline/stream/MBStream.java rename to legacy/src/main/java/macrobase/analysis/pipeline/stream/MBStream.java diff --git a/core/src/main/java/macrobase/analysis/result/AnalysisResult.java b/legacy/src/main/java/macrobase/analysis/result/AnalysisResult.java similarity index 100% rename from core/src/main/java/macrobase/analysis/result/AnalysisResult.java rename to legacy/src/main/java/macrobase/analysis/result/AnalysisResult.java diff --git a/core/src/main/java/macrobase/analysis/result/OutlierClassificationResult.java b/legacy/src/main/java/macrobase/analysis/result/OutlierClassificationResult.java similarity index 100% rename from core/src/main/java/macrobase/analysis/result/OutlierClassificationResult.java rename to legacy/src/main/java/macrobase/analysis/result/OutlierClassificationResult.java diff --git a/core/src/main/java/macrobase/analysis/sample/AChao.java b/legacy/src/main/java/macrobase/analysis/sample/AChao.java similarity index 100% rename from core/src/main/java/macrobase/analysis/sample/AChao.java rename to legacy/src/main/java/macrobase/analysis/sample/AChao.java diff --git a/core/src/main/java/macrobase/analysis/sample/FlexibleDampedReservoir.java b/legacy/src/main/java/macrobase/analysis/sample/FlexibleDampedReservoir.java similarity index 100% rename from core/src/main/java/macrobase/analysis/sample/FlexibleDampedReservoir.java rename to legacy/src/main/java/macrobase/analysis/sample/FlexibleDampedReservoir.java diff --git 
a/core/src/main/java/macrobase/analysis/stats/Autocorrelation.java b/legacy/src/main/java/macrobase/analysis/stats/Autocorrelation.java similarity index 100% rename from core/src/main/java/macrobase/analysis/stats/Autocorrelation.java rename to legacy/src/main/java/macrobase/analysis/stats/Autocorrelation.java diff --git a/core/src/main/java/macrobase/analysis/stats/BatchTrainScore.java b/legacy/src/main/java/macrobase/analysis/stats/BatchTrainScore.java similarity index 100% rename from core/src/main/java/macrobase/analysis/stats/BatchTrainScore.java rename to legacy/src/main/java/macrobase/analysis/stats/BatchTrainScore.java diff --git a/core/src/main/java/macrobase/analysis/stats/Covariance.java b/legacy/src/main/java/macrobase/analysis/stats/Covariance.java similarity index 100% rename from core/src/main/java/macrobase/analysis/stats/Covariance.java rename to legacy/src/main/java/macrobase/analysis/stats/Covariance.java diff --git a/core/src/main/java/macrobase/analysis/stats/FFT.java b/legacy/src/main/java/macrobase/analysis/stats/FFT.java similarity index 100% rename from core/src/main/java/macrobase/analysis/stats/FFT.java rename to legacy/src/main/java/macrobase/analysis/stats/FFT.java diff --git a/core/src/main/java/macrobase/analysis/stats/Gaussian.java b/legacy/src/main/java/macrobase/analysis/stats/Gaussian.java similarity index 100% rename from core/src/main/java/macrobase/analysis/stats/Gaussian.java rename to legacy/src/main/java/macrobase/analysis/stats/Gaussian.java diff --git a/core/src/main/java/macrobase/analysis/stats/MAD.java b/legacy/src/main/java/macrobase/analysis/stats/MAD.java similarity index 100% rename from core/src/main/java/macrobase/analysis/stats/MAD.java rename to legacy/src/main/java/macrobase/analysis/stats/MAD.java diff --git a/core/src/main/java/macrobase/analysis/stats/MinCovDet.java b/legacy/src/main/java/macrobase/analysis/stats/MinCovDet.java similarity index 100% rename from 
core/src/main/java/macrobase/analysis/stats/MinCovDet.java rename to legacy/src/main/java/macrobase/analysis/stats/MinCovDet.java diff --git a/core/src/main/java/macrobase/analysis/stats/RandomProjection.java b/legacy/src/main/java/macrobase/analysis/stats/RandomProjection.java similarity index 100% rename from core/src/main/java/macrobase/analysis/stats/RandomProjection.java rename to legacy/src/main/java/macrobase/analysis/stats/RandomProjection.java diff --git a/core/src/main/java/macrobase/analysis/stats/RobustEmpiricalCovariance.java b/legacy/src/main/java/macrobase/analysis/stats/RobustEmpiricalCovariance.java similarity index 100% rename from core/src/main/java/macrobase/analysis/stats/RobustEmpiricalCovariance.java rename to legacy/src/main/java/macrobase/analysis/stats/RobustEmpiricalCovariance.java diff --git a/core/src/main/java/macrobase/analysis/stats/Truncate.java b/legacy/src/main/java/macrobase/analysis/stats/Truncate.java similarity index 100% rename from core/src/main/java/macrobase/analysis/stats/Truncate.java rename to legacy/src/main/java/macrobase/analysis/stats/Truncate.java diff --git a/core/src/main/java/macrobase/analysis/stats/Winsorizer.java b/legacy/src/main/java/macrobase/analysis/stats/Winsorizer.java similarity index 100% rename from core/src/main/java/macrobase/analysis/stats/Winsorizer.java rename to legacy/src/main/java/macrobase/analysis/stats/Winsorizer.java diff --git a/core/src/main/java/macrobase/analysis/stats/ZScore.java b/legacy/src/main/java/macrobase/analysis/stats/ZScore.java similarity index 100% rename from core/src/main/java/macrobase/analysis/stats/ZScore.java rename to legacy/src/main/java/macrobase/analysis/stats/ZScore.java diff --git a/core/src/main/java/macrobase/analysis/summary/BatchSummarizer.java b/legacy/src/main/java/macrobase/analysis/summary/BatchSummarizer.java similarity index 100% rename from core/src/main/java/macrobase/analysis/summary/BatchSummarizer.java rename to 
legacy/src/main/java/macrobase/analysis/summary/BatchSummarizer.java diff --git a/core/src/main/java/macrobase/analysis/summary/EWStreamingSummarizer.java b/legacy/src/main/java/macrobase/analysis/summary/EWStreamingSummarizer.java similarity index 100% rename from core/src/main/java/macrobase/analysis/summary/EWStreamingSummarizer.java rename to legacy/src/main/java/macrobase/analysis/summary/EWStreamingSummarizer.java diff --git a/core/src/main/java/macrobase/analysis/summary/Summarizer.java b/legacy/src/main/java/macrobase/analysis/summary/Summarizer.java similarity index 100% rename from core/src/main/java/macrobase/analysis/summary/Summarizer.java rename to legacy/src/main/java/macrobase/analysis/summary/Summarizer.java diff --git a/core/src/main/java/macrobase/analysis/summary/Summary.java b/legacy/src/main/java/macrobase/analysis/summary/Summary.java similarity index 100% rename from core/src/main/java/macrobase/analysis/summary/Summary.java rename to legacy/src/main/java/macrobase/analysis/summary/Summary.java diff --git a/core/src/main/java/macrobase/analysis/summary/count/AmortizedMaintenanceCounter.java b/legacy/src/main/java/macrobase/analysis/summary/count/AmortizedMaintenanceCounter.java similarity index 100% rename from core/src/main/java/macrobase/analysis/summary/count/AmortizedMaintenanceCounter.java rename to legacy/src/main/java/macrobase/analysis/summary/count/AmortizedMaintenanceCounter.java diff --git a/core/src/main/java/macrobase/analysis/summary/count/ApproximateCount.java b/legacy/src/main/java/macrobase/analysis/summary/count/ApproximateCount.java similarity index 100% rename from core/src/main/java/macrobase/analysis/summary/count/ApproximateCount.java rename to legacy/src/main/java/macrobase/analysis/summary/count/ApproximateCount.java diff --git a/core/src/main/java/macrobase/analysis/summary/count/ExactCount.java b/legacy/src/main/java/macrobase/analysis/summary/count/ExactCount.java similarity index 100% rename from 
core/src/main/java/macrobase/analysis/summary/count/ExactCount.java rename to legacy/src/main/java/macrobase/analysis/summary/count/ExactCount.java diff --git a/core/src/main/java/macrobase/analysis/summary/count/SpaceSavingList.java b/legacy/src/main/java/macrobase/analysis/summary/count/SpaceSavingList.java similarity index 100% rename from core/src/main/java/macrobase/analysis/summary/count/SpaceSavingList.java rename to legacy/src/main/java/macrobase/analysis/summary/count/SpaceSavingList.java diff --git a/core/src/main/java/macrobase/analysis/summary/itemset/Apriori.java b/legacy/src/main/java/macrobase/analysis/summary/itemset/Apriori.java similarity index 100% rename from core/src/main/java/macrobase/analysis/summary/itemset/Apriori.java rename to legacy/src/main/java/macrobase/analysis/summary/itemset/Apriori.java diff --git a/core/src/main/java/macrobase/analysis/summary/itemset/ExponentiallyDecayingEmergingItemsets.java b/legacy/src/main/java/macrobase/analysis/summary/itemset/ExponentiallyDecayingEmergingItemsets.java similarity index 100% rename from core/src/main/java/macrobase/analysis/summary/itemset/ExponentiallyDecayingEmergingItemsets.java rename to legacy/src/main/java/macrobase/analysis/summary/itemset/ExponentiallyDecayingEmergingItemsets.java diff --git a/core/src/main/java/macrobase/analysis/summary/itemset/FPGrowth.java b/legacy/src/main/java/macrobase/analysis/summary/itemset/FPGrowth.java similarity index 100% rename from core/src/main/java/macrobase/analysis/summary/itemset/FPGrowth.java rename to legacy/src/main/java/macrobase/analysis/summary/itemset/FPGrowth.java diff --git a/core/src/main/java/macrobase/analysis/summary/itemset/FPGrowthEmerging.java b/legacy/src/main/java/macrobase/analysis/summary/itemset/FPGrowthEmerging.java similarity index 100% rename from core/src/main/java/macrobase/analysis/summary/itemset/FPGrowthEmerging.java rename to legacy/src/main/java/macrobase/analysis/summary/itemset/FPGrowthEmerging.java diff --git 
a/core/src/main/java/macrobase/analysis/summary/itemset/RiskRatio.java b/legacy/src/main/java/macrobase/analysis/summary/itemset/RiskRatio.java similarity index 100% rename from core/src/main/java/macrobase/analysis/summary/itemset/RiskRatio.java rename to legacy/src/main/java/macrobase/analysis/summary/itemset/RiskRatio.java diff --git a/core/src/main/java/macrobase/analysis/summary/itemset/StreamingFPGrowth.java b/legacy/src/main/java/macrobase/analysis/summary/itemset/StreamingFPGrowth.java similarity index 100% rename from core/src/main/java/macrobase/analysis/summary/itemset/StreamingFPGrowth.java rename to legacy/src/main/java/macrobase/analysis/summary/itemset/StreamingFPGrowth.java diff --git a/core/src/main/java/macrobase/analysis/summary/itemset/result/ItemsetResult.java b/legacy/src/main/java/macrobase/analysis/summary/itemset/result/ItemsetResult.java similarity index 100% rename from core/src/main/java/macrobase/analysis/summary/itemset/result/ItemsetResult.java rename to legacy/src/main/java/macrobase/analysis/summary/itemset/result/ItemsetResult.java diff --git a/core/src/main/java/macrobase/analysis/summary/itemset/result/ItemsetWithCount.java b/legacy/src/main/java/macrobase/analysis/summary/itemset/result/ItemsetWithCount.java similarity index 100% rename from core/src/main/java/macrobase/analysis/summary/itemset/result/ItemsetWithCount.java rename to legacy/src/main/java/macrobase/analysis/summary/itemset/result/ItemsetWithCount.java diff --git a/core/src/main/java/macrobase/analysis/transform/BatchScoreFeatureTransform.java b/legacy/src/main/java/macrobase/analysis/transform/BatchScoreFeatureTransform.java similarity index 100% rename from core/src/main/java/macrobase/analysis/transform/BatchScoreFeatureTransform.java rename to legacy/src/main/java/macrobase/analysis/transform/BatchScoreFeatureTransform.java diff --git a/core/src/main/java/macrobase/analysis/transform/EWFeatureTransform.java 
b/legacy/src/main/java/macrobase/analysis/transform/EWFeatureTransform.java similarity index 100% rename from core/src/main/java/macrobase/analysis/transform/EWFeatureTransform.java rename to legacy/src/main/java/macrobase/analysis/transform/EWFeatureTransform.java diff --git a/core/src/main/java/macrobase/analysis/transform/FeatureTransform.java b/legacy/src/main/java/macrobase/analysis/transform/FeatureTransform.java similarity index 100% rename from core/src/main/java/macrobase/analysis/transform/FeatureTransform.java rename to legacy/src/main/java/macrobase/analysis/transform/FeatureTransform.java diff --git a/core/src/main/java/macrobase/analysis/transform/LinearMetricNormalizer.java b/legacy/src/main/java/macrobase/analysis/transform/LinearMetricNormalizer.java similarity index 100% rename from core/src/main/java/macrobase/analysis/transform/LinearMetricNormalizer.java rename to legacy/src/main/java/macrobase/analysis/transform/LinearMetricNormalizer.java diff --git a/core/src/main/java/macrobase/analysis/transform/LowMetricTransform.java b/legacy/src/main/java/macrobase/analysis/transform/LowMetricTransform.java similarity index 100% rename from core/src/main/java/macrobase/analysis/transform/LowMetricTransform.java rename to legacy/src/main/java/macrobase/analysis/transform/LowMetricTransform.java diff --git a/core/src/main/java/macrobase/conf/ConfigurationException.java b/legacy/src/main/java/macrobase/conf/ConfigurationException.java similarity index 100% rename from core/src/main/java/macrobase/conf/ConfigurationException.java rename to legacy/src/main/java/macrobase/conf/ConfigurationException.java diff --git a/core/src/main/java/macrobase/conf/MacroBaseConf.java b/legacy/src/main/java/macrobase/conf/MacroBaseConf.java similarity index 100% rename from core/src/main/java/macrobase/conf/MacroBaseConf.java rename to legacy/src/main/java/macrobase/conf/MacroBaseConf.java diff --git a/core/src/main/java/macrobase/conf/MacroBaseDefaults.java 
b/legacy/src/main/java/macrobase/conf/MacroBaseDefaults.java similarity index 100% rename from core/src/main/java/macrobase/conf/MacroBaseDefaults.java rename to legacy/src/main/java/macrobase/conf/MacroBaseDefaults.java diff --git a/core/src/main/java/macrobase/conf/MissingParameterException.java b/legacy/src/main/java/macrobase/conf/MissingParameterException.java similarity index 100% rename from core/src/main/java/macrobase/conf/MissingParameterException.java rename to legacy/src/main/java/macrobase/conf/MissingParameterException.java diff --git a/core/src/main/java/macrobase/datamodel/Datum.java b/legacy/src/main/java/macrobase/datamodel/Datum.java similarity index 100% rename from core/src/main/java/macrobase/datamodel/Datum.java rename to legacy/src/main/java/macrobase/datamodel/Datum.java diff --git a/core/src/main/java/macrobase/ingest/CSVIngester.java b/legacy/src/main/java/macrobase/ingest/CSVIngester.java similarity index 100% rename from core/src/main/java/macrobase/ingest/CSVIngester.java rename to legacy/src/main/java/macrobase/ingest/CSVIngester.java diff --git a/core/src/main/java/macrobase/ingest/DataIngester.java b/legacy/src/main/java/macrobase/ingest/DataIngester.java similarity index 100% rename from core/src/main/java/macrobase/ingest/DataIngester.java rename to legacy/src/main/java/macrobase/ingest/DataIngester.java diff --git a/core/src/main/java/macrobase/ingest/DatumEncoder.java b/legacy/src/main/java/macrobase/ingest/DatumEncoder.java similarity index 100% rename from core/src/main/java/macrobase/ingest/DatumEncoder.java rename to legacy/src/main/java/macrobase/ingest/DatumEncoder.java diff --git a/core/src/main/java/macrobase/ingest/DiskCachingIngester.java b/legacy/src/main/java/macrobase/ingest/DiskCachingIngester.java similarity index 100% rename from core/src/main/java/macrobase/ingest/DiskCachingIngester.java rename to legacy/src/main/java/macrobase/ingest/DiskCachingIngester.java diff --git 
a/core/src/main/java/macrobase/ingest/MySQLIngester.java b/legacy/src/main/java/macrobase/ingest/MySQLIngester.java similarity index 100% rename from core/src/main/java/macrobase/ingest/MySQLIngester.java rename to legacy/src/main/java/macrobase/ingest/MySQLIngester.java diff --git a/core/src/main/java/macrobase/ingest/PostgresIngester.java b/legacy/src/main/java/macrobase/ingest/PostgresIngester.java similarity index 100% rename from core/src/main/java/macrobase/ingest/PostgresIngester.java rename to legacy/src/main/java/macrobase/ingest/PostgresIngester.java diff --git a/core/src/main/java/macrobase/ingest/SQLIngester.java b/legacy/src/main/java/macrobase/ingest/SQLIngester.java similarity index 100% rename from core/src/main/java/macrobase/ingest/SQLIngester.java rename to legacy/src/main/java/macrobase/ingest/SQLIngester.java diff --git a/core/src/main/java/macrobase/ingest/result/ColumnValue.java b/legacy/src/main/java/macrobase/ingest/result/ColumnValue.java similarity index 100% rename from core/src/main/java/macrobase/ingest/result/ColumnValue.java rename to legacy/src/main/java/macrobase/ingest/result/ColumnValue.java diff --git a/core/src/main/java/macrobase/ingest/result/RowSet.java b/legacy/src/main/java/macrobase/ingest/result/RowSet.java similarity index 100% rename from core/src/main/java/macrobase/ingest/result/RowSet.java rename to legacy/src/main/java/macrobase/ingest/result/RowSet.java diff --git a/core/src/main/java/macrobase/ingest/result/Schema.java b/legacy/src/main/java/macrobase/ingest/result/Schema.java similarity index 100% rename from core/src/main/java/macrobase/ingest/result/Schema.java rename to legacy/src/main/java/macrobase/ingest/result/Schema.java diff --git a/core/src/main/java/macrobase/runtime/MacroBaseApplication.java b/legacy/src/main/java/macrobase/runtime/MacroBaseApplication.java similarity index 100% rename from core/src/main/java/macrobase/runtime/MacroBaseApplication.java rename to 
legacy/src/main/java/macrobase/runtime/MacroBaseApplication.java diff --git a/core/src/main/java/macrobase/runtime/command/MacroBasePipelineCommand.java b/legacy/src/main/java/macrobase/runtime/command/MacroBasePipelineCommand.java similarity index 100% rename from core/src/main/java/macrobase/runtime/command/MacroBasePipelineCommand.java rename to legacy/src/main/java/macrobase/runtime/command/MacroBasePipelineCommand.java diff --git a/core/src/main/java/macrobase/util/CheckedSupplier.java b/legacy/src/main/java/macrobase/util/CheckedSupplier.java similarity index 100% rename from core/src/main/java/macrobase/util/CheckedSupplier.java rename to legacy/src/main/java/macrobase/util/CheckedSupplier.java diff --git a/core/src/main/java/macrobase/util/Periodic.java b/legacy/src/main/java/macrobase/util/Periodic.java similarity index 100% rename from core/src/main/java/macrobase/util/Periodic.java rename to legacy/src/main/java/macrobase/util/Periodic.java diff --git a/core/src/test/java/macrobase/analysis/classify/StaticThresholdClassifierTest.java b/legacy/src/test/java/macrobase/analysis/classify/StaticThresholdClassifierTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/classify/StaticThresholdClassifierTest.java rename to legacy/src/test/java/macrobase/analysis/classify/StaticThresholdClassifierTest.java diff --git a/core/src/test/java/macrobase/analysis/pipeline/operator/MBGroupByTest.java b/legacy/src/test/java/macrobase/analysis/pipeline/operator/MBGroupByTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/pipeline/operator/MBGroupByTest.java rename to legacy/src/test/java/macrobase/analysis/pipeline/operator/MBGroupByTest.java diff --git a/core/src/test/java/macrobase/analysis/sample/AChaoTest.java b/legacy/src/test/java/macrobase/analysis/sample/AChaoTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/sample/AChaoTest.java rename to 
legacy/src/test/java/macrobase/analysis/sample/AChaoTest.java diff --git a/core/src/test/java/macrobase/analysis/stats/AutocorrelationTest.java b/legacy/src/test/java/macrobase/analysis/stats/AutocorrelationTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/stats/AutocorrelationTest.java rename to legacy/src/test/java/macrobase/analysis/stats/AutocorrelationTest.java diff --git a/core/src/test/java/macrobase/analysis/stats/FFTTest.java b/legacy/src/test/java/macrobase/analysis/stats/FFTTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/stats/FFTTest.java rename to legacy/src/test/java/macrobase/analysis/stats/FFTTest.java diff --git a/core/src/test/java/macrobase/analysis/stats/GaussianTest.java b/legacy/src/test/java/macrobase/analysis/stats/GaussianTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/stats/GaussianTest.java rename to legacy/src/test/java/macrobase/analysis/stats/GaussianTest.java diff --git a/core/src/test/java/macrobase/analysis/stats/MADTest.java b/legacy/src/test/java/macrobase/analysis/stats/MADTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/stats/MADTest.java rename to legacy/src/test/java/macrobase/analysis/stats/MADTest.java diff --git a/core/src/test/java/macrobase/analysis/stats/MinCovDetTest.java b/legacy/src/test/java/macrobase/analysis/stats/MinCovDetTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/stats/MinCovDetTest.java rename to legacy/src/test/java/macrobase/analysis/stats/MinCovDetTest.java diff --git a/core/src/test/java/macrobase/analysis/stats/RandomProjectionTest.java b/legacy/src/test/java/macrobase/analysis/stats/RandomProjectionTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/stats/RandomProjectionTest.java rename to legacy/src/test/java/macrobase/analysis/stats/RandomProjectionTest.java diff --git 
a/core/src/test/java/macrobase/analysis/stats/RobustEmpiricalCovarianceTest.java b/legacy/src/test/java/macrobase/analysis/stats/RobustEmpiricalCovarianceTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/stats/RobustEmpiricalCovarianceTest.java rename to legacy/src/test/java/macrobase/analysis/stats/RobustEmpiricalCovarianceTest.java diff --git a/core/src/test/java/macrobase/analysis/stats/TruncateTest.java b/legacy/src/test/java/macrobase/analysis/stats/TruncateTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/stats/TruncateTest.java rename to legacy/src/test/java/macrobase/analysis/stats/TruncateTest.java diff --git a/core/src/test/java/macrobase/analysis/stats/WinsorizerTest.java b/legacy/src/test/java/macrobase/analysis/stats/WinsorizerTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/stats/WinsorizerTest.java rename to legacy/src/test/java/macrobase/analysis/stats/WinsorizerTest.java diff --git a/core/src/test/java/macrobase/analysis/stats/ZScoreTest.java b/legacy/src/test/java/macrobase/analysis/stats/ZScoreTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/stats/ZScoreTest.java rename to legacy/src/test/java/macrobase/analysis/stats/ZScoreTest.java diff --git a/core/src/test/java/macrobase/analysis/summary/count/AmortizedMaintenanceCounterTest.java b/legacy/src/test/java/macrobase/analysis/summary/count/AmortizedMaintenanceCounterTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/summary/count/AmortizedMaintenanceCounterTest.java rename to legacy/src/test/java/macrobase/analysis/summary/count/AmortizedMaintenanceCounterTest.java diff --git a/core/src/test/java/macrobase/analysis/summary/count/ExactCountTest.java b/legacy/src/test/java/macrobase/analysis/summary/count/ExactCountTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/summary/count/ExactCountTest.java rename to 
legacy/src/test/java/macrobase/analysis/summary/count/ExactCountTest.java diff --git a/core/src/test/java/macrobase/analysis/summary/count/SpaceSavingTest.java b/legacy/src/test/java/macrobase/analysis/summary/count/SpaceSavingTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/summary/count/SpaceSavingTest.java rename to legacy/src/test/java/macrobase/analysis/summary/count/SpaceSavingTest.java diff --git a/core/src/test/java/macrobase/analysis/summary/itemset/AprioriTest.java b/legacy/src/test/java/macrobase/analysis/summary/itemset/AprioriTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/summary/itemset/AprioriTest.java rename to legacy/src/test/java/macrobase/analysis/summary/itemset/AprioriTest.java diff --git a/core/src/test/java/macrobase/analysis/summary/itemset/FPGrowthTest.java b/legacy/src/test/java/macrobase/analysis/summary/itemset/FPGrowthTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/summary/itemset/FPGrowthTest.java rename to legacy/src/test/java/macrobase/analysis/summary/itemset/FPGrowthTest.java diff --git a/core/src/test/java/macrobase/analysis/summary/itemset/RiskRatioTest.java b/legacy/src/test/java/macrobase/analysis/summary/itemset/RiskRatioTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/summary/itemset/RiskRatioTest.java rename to legacy/src/test/java/macrobase/analysis/summary/itemset/RiskRatioTest.java diff --git a/core/src/test/java/macrobase/analysis/summary/itemset/StreamingFPGrowthTest.java b/legacy/src/test/java/macrobase/analysis/summary/itemset/StreamingFPGrowthTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/summary/itemset/StreamingFPGrowthTest.java rename to legacy/src/test/java/macrobase/analysis/summary/itemset/StreamingFPGrowthTest.java diff --git a/core/src/test/java/macrobase/analysis/transform/LinearMetricNormalizerTest.java 
b/legacy/src/test/java/macrobase/analysis/transform/LinearMetricNormalizerTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/transform/LinearMetricNormalizerTest.java rename to legacy/src/test/java/macrobase/analysis/transform/LinearMetricNormalizerTest.java diff --git a/core/src/test/java/macrobase/analysis/transform/LowMetricTransformTest.java b/legacy/src/test/java/macrobase/analysis/transform/LowMetricTransformTest.java similarity index 100% rename from core/src/test/java/macrobase/analysis/transform/LowMetricTransformTest.java rename to legacy/src/test/java/macrobase/analysis/transform/LowMetricTransformTest.java diff --git a/core/src/test/java/macrobase/conf/MacroBaseConfTest.java b/legacy/src/test/java/macrobase/conf/MacroBaseConfTest.java similarity index 100% rename from core/src/test/java/macrobase/conf/MacroBaseConfTest.java rename to legacy/src/test/java/macrobase/conf/MacroBaseConfTest.java diff --git a/core/src/test/java/macrobase/conf/MockIngester.java b/legacy/src/test/java/macrobase/conf/MockIngester.java similarity index 100% rename from core/src/test/java/macrobase/conf/MockIngester.java rename to legacy/src/test/java/macrobase/conf/MockIngester.java diff --git a/core/src/test/java/macrobase/ingest/CSVIngesterTest.java b/legacy/src/test/java/macrobase/ingest/CSVIngesterTest.java similarity index 100% rename from core/src/test/java/macrobase/ingest/CSVIngesterTest.java rename to legacy/src/test/java/macrobase/ingest/CSVIngesterTest.java diff --git a/core/src/test/java/macrobase/ingest/CachingSQLIngesterTest.java b/legacy/src/test/java/macrobase/ingest/CachingSQLIngesterTest.java similarity index 100% rename from core/src/test/java/macrobase/ingest/CachingSQLIngesterTest.java rename to legacy/src/test/java/macrobase/ingest/CachingSQLIngesterTest.java diff --git a/core/src/test/java/macrobase/ingest/DatumEncoderTest.java b/legacy/src/test/java/macrobase/ingest/DatumEncoderTest.java similarity index 100% rename from 
core/src/test/java/macrobase/ingest/DatumEncoderTest.java rename to legacy/src/test/java/macrobase/ingest/DatumEncoderTest.java diff --git a/core/src/test/java/macrobase/ingest/SQLIngesterTest.java b/legacy/src/test/java/macrobase/ingest/SQLIngesterTest.java similarity index 100% rename from core/src/test/java/macrobase/ingest/SQLIngesterTest.java rename to legacy/src/test/java/macrobase/ingest/SQLIngesterTest.java diff --git a/core/src/test/java/macrobase/pipeline/BasePipelineTest.java b/legacy/src/test/java/macrobase/pipeline/BasePipelineTest.java similarity index 100% rename from core/src/test/java/macrobase/pipeline/BasePipelineTest.java rename to legacy/src/test/java/macrobase/pipeline/BasePipelineTest.java diff --git a/core/src/test/java/macrobase/pipeline/BasicBatchedPipelineTest.java b/legacy/src/test/java/macrobase/pipeline/BasicBatchedPipelineTest.java similarity index 100% rename from core/src/test/java/macrobase/pipeline/BasicBatchedPipelineTest.java rename to legacy/src/test/java/macrobase/pipeline/BasicBatchedPipelineTest.java diff --git a/core/src/test/java/macrobase/pipeline/BasicOneShotEWStreamingPipelineTest.java b/legacy/src/test/java/macrobase/pipeline/BasicOneShotEWStreamingPipelineTest.java similarity index 100% rename from core/src/test/java/macrobase/pipeline/BasicOneShotEWStreamingPipelineTest.java rename to legacy/src/test/java/macrobase/pipeline/BasicOneShotEWStreamingPipelineTest.java diff --git a/core/src/test/java/macrobase/pipeline/MockTransform.java b/legacy/src/test/java/macrobase/pipeline/MockTransform.java similarity index 100% rename from core/src/test/java/macrobase/pipeline/MockTransform.java rename to legacy/src/test/java/macrobase/pipeline/MockTransform.java diff --git a/core/src/test/java/macrobase/pipeline/operator/MBOperatorTest.java b/legacy/src/test/java/macrobase/pipeline/operator/MBOperatorTest.java similarity index 100% rename from core/src/test/java/macrobase/pipeline/operator/MBOperatorTest.java rename to 
legacy/src/test/java/macrobase/pipeline/operator/MBOperatorTest.java diff --git a/core/src/test/java/macrobase/runtime/MacroBaseApplicationTest.java b/legacy/src/test/java/macrobase/runtime/MacroBaseApplicationTest.java similarity index 100% rename from core/src/test/java/macrobase/runtime/MacroBaseApplicationTest.java rename to legacy/src/test/java/macrobase/runtime/MacroBaseApplicationTest.java diff --git a/core/src/test/java/macrobase/runtime/command/MacroBaseMockPipeline.java b/legacy/src/test/java/macrobase/runtime/command/MacroBaseMockPipeline.java similarity index 100% rename from core/src/test/java/macrobase/runtime/command/MacroBaseMockPipeline.java rename to legacy/src/test/java/macrobase/runtime/command/MacroBaseMockPipeline.java diff --git a/core/src/test/java/macrobase/runtime/command/MacroBasePipelineCommandTest.java b/legacy/src/test/java/macrobase/runtime/command/MacroBasePipelineCommandTest.java similarity index 100% rename from core/src/test/java/macrobase/runtime/command/MacroBasePipelineCommandTest.java rename to legacy/src/test/java/macrobase/runtime/command/MacroBasePipelineCommandTest.java diff --git a/core/src/test/java/macrobase/util/Drainer.java b/legacy/src/test/java/macrobase/util/Drainer.java similarity index 100% rename from core/src/test/java/macrobase/util/Drainer.java rename to legacy/src/test/java/macrobase/util/Drainer.java diff --git a/core/src/test/resources/conf/simple.yaml b/legacy/src/test/resources/conf/simple.yaml similarity index 100% rename from core/src/test/resources/conf/simple.yaml rename to legacy/src/test/resources/conf/simple.yaml diff --git a/core/src/test/resources/data/missingdata.csv b/legacy/src/test/resources/data/missingdata.csv similarity index 100% rename from core/src/test/resources/data/missingdata.csv rename to legacy/src/test/resources/data/missingdata.csv diff --git a/core/src/test/resources/data/sensor10k.csv.gz b/legacy/src/test/resources/data/sensor10k.csv.gz similarity index 100% rename from 
core/src/test/resources/data/sensor10k.csv.gz rename to legacy/src/test/resources/data/sensor10k.csv.gz diff --git a/core/src/test/resources/data/simple.csv b/legacy/src/test/resources/data/simple.csv similarity index 100% rename from core/src/test/resources/data/simple.csv rename to legacy/src/test/resources/data/simple.csv diff --git a/lib/pom.xml b/lib/pom.xml new file mode 100644 index 000000000..46997ad75 --- /dev/null +++ b/lib/pom.xml @@ -0,0 +1,147 @@ + + + 4.0.0 + edu.stanford.futuredata + macrobase-lib + 0.1-SNAPSHOT + macrobase + https://github.com/stanford-futuredata/macrobase + + MacroBase is an anomaly detection engine designed to prioritize human attention + in large-scale datasets and data streams. + macrobase-lib provides operators for standalone API usage of classifiers and summarizers. + + + + Apache License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + Peter Bailis + pbailis@cs.stanford.edu + Stanford + http://www.bailis.org + + + + scm:git:git://github.com/stanford-futuredata/macrobase.git + scm:git:ssh://github.com/stanford-futuredata/macrobase.git + https://github.com/stanford-futuredata/macrobase + + + + + + + junit + junit + 4.12 + test + + + com.google.guava + guava + 21.0 + + + org.apache.commons + commons-math3 + 3.6 + + + org.apache.commons + commons-csv + 1.2 + + + + + + ossrh + https://oss.sonatype.org/content/repositories/snapshots + + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.3 + + 1.8 + 1.8 + + + + + + + + release + + + + org.apache.maven.plugins + maven-source-plugin + 3.0.1 + + + attach-sources + verify + + jar-no-fork + + + + + + org.apache.maven.plugins + maven-javadoc-plugin + 2.10.4 + + + attach-javadocs + verify + + jar + + + + + + org.apache.maven.plugins + maven-gpg-plugin + 1.5 + + + sign-artifacts + deploy + + sign + + + + + + org.sonatype.plugins + nexus-staging-maven-plugin + 1.6.7 + true + + ossrh + https://oss.sonatype.org/ + false + + + + + + + + \ No newline at 
end of file diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/classify/PercentileClassifier.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/classify/PercentileClassifier.java new file mode 100644 index 000000000..ad13f039b --- /dev/null +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/classify/PercentileClassifier.java @@ -0,0 +1,117 @@ +package edu.stanford.futuredata.macrobase.analysis.classify; + +import edu.stanford.futuredata.macrobase.datamodel.DataFrame; +import edu.stanford.futuredata.macrobase.operator.Transformer; +import org.apache.commons.math3.stat.descriptive.rank.Percentile; + +/** + * Classify rows based on high / low values for a single column. + * Returns a new dataframe with a column representing the classification status for + * each row: 1.0 if outlier, 0.0 otherwise. + */ +public class PercentileClassifier implements Transformer { + // Parameters + private double percentile = 0.5; + private boolean includeHigh = true; + private boolean includeLow = true; + private String columnName; + private String outputColumnName = "_OUTLIER"; + + // Calculated values + private double lowCutoff; + private double highCutoff; + private DataFrame output; + + public PercentileClassifier(String columnName) { + this.columnName = columnName; + } + + @Override + public void process(DataFrame input) { + double[] metrics = input.getDoubleColumnByName(columnName); + int len = metrics.length; + lowCutoff = new Percentile().evaluate(metrics, percentile); + highCutoff = new Percentile().evaluate(metrics, 100.0 - percentile); + + output = input.copy(); + double[] resultColumn = new double[len]; + for (int i = 0; i < len; i++) { + double curVal = metrics[i]; + if ((curVal > highCutoff && includeHigh) + || (curVal < lowCutoff && includeLow) + ) { + resultColumn[i] = 1.0; + } + } + output.addDoubleColumn(outputColumnName, resultColumn); + } + + @Override + public DataFrame getResults() { + return output; + } + + 
// Parameter Getters and Setters + public double getPercentile() { + return percentile; + } + + /** + * @param percentile Cutoff percentile, on a 0-100 scale, for high or low values + * @return this + */ + public PercentileClassifier setPercentile(double percentile) { + this.percentile = percentile; + return this; + } + public boolean isIncludeHigh() { + return includeHigh; + } + + /** + * @param includeHigh Whether to count high points as outliers. + * @return this + */ + public PercentileClassifier setIncludeHigh(boolean includeHigh) { + this.includeHigh = includeHigh; + return this; + } + public boolean isIncludeLow() { + return includeLow; + } + + /** + * @param includeLow Whether to count low points as outliers + * @return this + */ + public PercentileClassifier setIncludeLow(boolean includeLow) { + this.includeLow = includeLow; + return this; + } + public String getColumnName() { + return columnName; + } + public PercentileClassifier setColumnName(String columnName) { + this.columnName = columnName; + return this; + } + public String getOutputColumnName() { + return outputColumnName; + } + + /** + * @param outputColumnName Name of the column the classification results are written to.
+ * @return this + */ + public PercentileClassifier setOutputColumnName(String outputColumnName) { + this.outputColumnName = outputColumnName; + return this; + } + + public double getLowCutoff() { + return lowCutoff; + } + public double getHighCutoff() { + return highCutoff; + } +} diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/BatchSummarizer.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/BatchSummarizer.java new file mode 100644 index 000000000..c9fec3dc3 --- /dev/null +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/BatchSummarizer.java @@ -0,0 +1,133 @@ +package edu.stanford.futuredata.macrobase.analysis.summary; + +import edu.stanford.futuredata.macrobase.analysis.summary.itemset.AttributeEncoder; +import edu.stanford.futuredata.macrobase.analysis.summary.itemset.FPGrowthEmerging; +import edu.stanford.futuredata.macrobase.analysis.summary.itemset.result.AttributeSet; +import edu.stanford.futuredata.macrobase.analysis.summary.itemset.result.ItemsetResult; +import edu.stanford.futuredata.macrobase.datamodel.DataFrame; +import edu.stanford.futuredata.macrobase.datamodel.Schema; +import edu.stanford.futuredata.macrobase.operator.Operator; + +import java.util.ArrayList; +import java.util.List; +import java.util.Set; +import java.util.function.DoublePredicate; + +/** + * Given a batch of rows with an outlier class column, explain the outliers using + * string attribute columns. Each batch is considered as an independent unit. 
+ */ +public class BatchSummarizer implements Operator { + // Parameters + private String outlierColumn = "_OUTLIER"; + private double minOutlierSupport = 0.1; + private double minRiskRatio = 3; + private boolean useAttributeCombinations = true; + private List attributes = new ArrayList<>(); + private DoublePredicate predicate = d -> d != 0.0; + + // Output + private Explanation explanation = null; + // Encoder + private AttributeEncoder encoder = new AttributeEncoder(); + private List> inlierItemsets, outlierItemsets; + private FPGrowthEmerging fpg = new FPGrowthEmerging(); + + public BatchSummarizer() { } + + /** + * Adjust this to tune the significance (e.g. number of rows affected) of the results returned. + * @param minSupport lowest outlier support of the results returned. + * @return this + */ + public BatchSummarizer setMinSupport(double minSupport) { + this.minOutlierSupport = minSupport; + return this; + } + + /** + * Adjust this to tune the severity (e.g. strength of correlation) of the results returned. + * @param minRiskRatio lowest risk ratio to consider for meaningful explanations. + * @return this + */ + public BatchSummarizer setMinRiskRatio(double minRiskRatio) { + this.minRiskRatio = minRiskRatio; + return this; + } + + /** + * By default, will check for nonzero entries in a column of doubles. + * @param predicate function to signify whether row should be treated as outlier. + * @return this + */ + public BatchSummarizer setOutlierPredicate(DoublePredicate predicate) { + this.predicate = predicate; + return this; + } + public BatchSummarizer setAttributes(List attributes) { + this.attributes = attributes; + this.encoder.setColumnNames(attributes); + return this; + } + + /** + * Set the column which indicates outlier status. "_OUTLIER" by default. + * @param outlierColumn new outlier indicator column. 
+ * @return this + */ + public BatchSummarizer setOutlierColumn(String outlierColumn) { + this.outlierColumn = outlierColumn; + return this; + } + + /** + * Whether or not to use combinations of attributes in explanation, or only + * use simple single attribute explanations + * @param useAttributeCombinations flag + * @return this + */ + public BatchSummarizer setUseAttributeCombinations(boolean useAttributeCombinations) { + this.useAttributeCombinations = useAttributeCombinations; + fpg.setCombinationsEnabled(useAttributeCombinations); + return this; + } + + @Override + public void process(DataFrame df) { + // Filter inliers and outliers + DataFrame outlierDF = df.filter(outlierColumn, predicate); + DataFrame inlierDF = df.filter(outlierColumn, predicate.negate()); + + // Encode inlier and outlier attribute columns + if (attributes.isEmpty()) { + encoder.setColumnNames(df.getSchema().getColumnNamesByType(Schema.ColType.STRING)); + inlierItemsets = encoder.encodeAttributes(inlierDF.getStringCols()); + outlierItemsets = encoder.encodeAttributes(outlierDF.getStringCols()); + } else { + encoder.setColumnNames(attributes); + inlierItemsets = encoder.encodeAttributes(inlierDF.getStringColsByName(attributes)); + outlierItemsets = encoder.encodeAttributes(outlierDF.getStringColsByName(attributes)); + } + + long startTime = System.currentTimeMillis(); + List itemsetResults = fpg.getEmergingItemsetsWithMinSupport( + inlierItemsets, + outlierItemsets, + minOutlierSupport, + minRiskRatio); + // Decode results + List attributeSets = new ArrayList<>(); + itemsetResults.forEach(i -> attributeSets.add(new AttributeSet(i, encoder))); + long elapsed = System.currentTimeMillis() - startTime; + + explanation = new Explanation(attributeSets, + inlierItemsets.size(), + outlierItemsets.size(), + elapsed); + } + + @Override + public Explanation getResults() { + return explanation; + } +} diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/Explanation.java 
b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/Explanation.java new file mode 100644 index 000000000..c201e11cb --- /dev/null +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/Explanation.java @@ -0,0 +1,68 @@ +package edu.stanford.futuredata.macrobase.analysis.summary; + +import edu.stanford.futuredata.macrobase.analysis.summary.itemset.result.AttributeSet; + +import java.util.List; + +/** + * Represents a summarization result: the list of attribute sets that explain the + * outliers, the number of inliers and outliers, and the creation time in + * milliseconds. + */ +public class Explanation { + private final long numOutliers; + private final long numInliers; + private List itemsets; + private final long creationTimeMs; + + public Explanation(List resultList, + long numInliers, + long numOutliers, + long creationTimeMs) { + itemsets = resultList; + this.numInliers = numInliers; + this.numOutliers = numOutliers; + this.creationTimeMs = creationTimeMs; + } + + public List getItemsets() { + return itemsets; + } + + public long getNumOutliers() { + return numOutliers; + } + + public long getNumInliers() { + return numInliers; + } + + public long getCreationTimeMs() { + return creationTimeMs; + } + + public String prettyPrint() { + StringBuilder header = new StringBuilder(String.format( + "Outlier Explanation:\n" + + "numOutliers: %d\n" + + "numInliers: %d\n" + + "Itemsets: \n" + + "--------\n", + numOutliers, + numInliers, + itemsets)); + for (AttributeSet is : itemsets) { + header.append(is.prettyPrint()); + } + return header.toString(); + } + + @Override + public String toString() { + return "Explanation{" + + "numOutliers=" + numOutliers + + ", numInliers=" + numInliers + + ", itemsets=" + itemsets + + '}'; + } +} diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/count/ExactCount.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/count/ExactCount.java new
file mode 100644 index 000000000..55efbc71c --- /dev/null +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/count/ExactCount.java @@ -0,0 +1,28 @@ +package edu.stanford.futuredata.macrobase.analysis.summary.count; + +import java.util.HashMap; +import java.util.List; +import java.util.Set; + + +public class ExactCount { + private HashMap counts = new HashMap<>(); + + public HashMap getCounts() { + return counts; + } + + public ExactCount count(List> transactions) { + for (Set txn : transactions) { + for (int i : txn) { + Double curVal = counts.get(i); + if (curVal == null) { + curVal = 0.; + } + counts.put(i, curVal + 1); + } + } + + return this; + } +} diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/AttributeEncoder.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/AttributeEncoder.java new file mode 100644 index 000000000..d401fda8c --- /dev/null +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/AttributeEncoder.java @@ -0,0 +1,70 @@ +package edu.stanford.futuredata.macrobase.analysis.summary.itemset; + +import java.util.*; + +/** + * Encode every combination of attribute names and values into a distinct integer. + * This class assumes that attributes are stored in String columns in dataframes + * and is mainly used for frequent itemset mining. 
+ */ +public class AttributeEncoder { + private HashMap> encoder; + private int nextKey; + + private HashMap valueDecoder; + private HashMap columnDecoder; + private List colNames; + + public AttributeEncoder() { + encoder = new HashMap<>(); + nextKey = 0; + valueDecoder = new HashMap<>(); + columnDecoder = new HashMap<>(); + } + public void setColumnNames(List colNames) { + this.colNames = colNames; + } + + public int decodeColumn(int i) {return columnDecoder.get(i);} + public String decodeColumnName(int i) {return colNames.get(columnDecoder.get(i));} + public String decodeValue(int i) {return valueDecoder.get(i);} + + public List> encodeAttributes(List columns) { + if (columns.isEmpty()) { + return new ArrayList<>(); + } + + int d = columns.size(); + int numRows = columns.get(0).length; + + for (int i = 0; i < d; i++) { + if (!encoder.containsKey(i)) { + encoder.put(i, new HashMap<>()); + } + } + + ArrayList> encodedAttributes = new ArrayList<>(numRows); + for (int i = 0; i < numRows; i++) { + encodedAttributes.add(new HashSet<>()); + } + + for (int colIdx = 0; colIdx < d; colIdx++) { + Map curColEncoder = encoder.get(colIdx); + String[] curCol = columns.get(colIdx); + for (int rowIdx = 0; rowIdx < numRows; rowIdx++) { + String colVal = curCol[rowIdx]; + if (!curColEncoder.containsKey(colVal)) { + curColEncoder.put(colVal, nextKey); + valueDecoder.put(nextKey, colVal); + columnDecoder.put(nextKey, colIdx); + nextKey++; + } + int curKey = curColEncoder.get(colVal); + encodedAttributes.get(rowIdx).add(curKey); + } + } + + return encodedAttributes; + } + +} diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/FPGrowth.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/FPGrowth.java new file mode 100644 index 000000000..59a311807 --- /dev/null +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/FPGrowth.java @@ -0,0 +1,472 @@ +package 
edu.stanford.futuredata.macrobase.analysis.summary.itemset; + +import com.google.common.collect.Sets; +import edu.stanford.futuredata.macrobase.analysis.summary.itemset.result.ItemsetWithCount; + +import java.util.*; +import java.util.stream.Collectors; + + +public class FPGrowth { + class FPTree { + private FPTreeNode root = new FPTreeNode(-1, null, 0); + // used to calculate the order + private Map frequentItemCounts = new HashMap<>(); + + // item order -- need canonical to break ties; 0 is smallest, N is largest + private Map frequentItemOrder = new HashMap<>(); + + protected Map nodeHeaders = new HashMap<>(); + +// protected void printTreeDebug() { +// log.debug("Frequent Item Counts:"); +// frequentItemCounts.entrySet().forEach(e -> log.debug(String.format("%d: %f", e.getKey(), e.getValue()))); +// +// walkTree(root, 1); +// } + +// private void walkTree(FPTreeNode start, int treeDepth) { +// log.debug(String.format("%s node: %d, count: %f", +// new String(new char[treeDepth]).replaceAll("\0", "\t"), +// start.getItem(), start.getCount())); +// if (start.getChildren() != null) { +// for (FPTreeNode child : start.getChildren()) { +// walkTree(child, treeDepth + 1); +// } +// } +// } + + private class FPTreeNode { + private int item; + private double count; + private FPTreeNode nextLink; + private FPTreeNode parent; + private List children; + + public FPTreeNode(int item, FPTreeNode parent, int initialCount) { + this.item = item; + this.parent = parent; + this.count = initialCount; + } + + public int getItem() { + return item; + } + + public double getCount() { + return count; + } + + public void incrementCount(double by) { + count += by; + } + + public void setNextLink(FPTreeNode nextLink) { + this.nextLink = nextLink; + } + + public FPTreeNode getNextLink() { + return nextLink; + } + + public FPTreeNode getParent() { + return parent; + } + + + public List getChildren() { + return children; + } + + // insert the transaction at this node starting with 
transaction[currentIndex] + // then find the child that matches + public void insertTransaction(List fullTransaction, + int currentIndex, + final double transactionCount) { + incrementCount(transactionCount); + + if (currentIndex == fullTransaction.size()) { + return; + } + + int currentItem = fullTransaction.get(currentIndex); + + FPTreeNode matchingChild = null; + + if (children != null) { + for (FPTreeNode child : children) { + if (child.getItem() == currentItem) { + matchingChild = child; + break; + } + } + } + + if (matchingChild == null) { + matchingChild = new FPTreeNode(currentItem, this, 0); + + FPTreeNode prevHeader = nodeHeaders.get(currentItem); + nodeHeaders.put(currentItem, matchingChild); + + if (prevHeader != null) { + matchingChild.setNextLink(prevHeader); + } + + if (children == null) { + children = new ArrayList<>(); + } + + children.add(matchingChild); + } + + matchingChild.insertTransaction(fullTransaction, currentIndex + 1, transactionCount); + } + } + + public void setFrequentCounts(Map counts) { + frequentItemCounts = counts; + sortFrequentItems(); + } + + public void insertFrequentItems(List> transactions, + int countRequiredForSupport) { + + Map itemCounts = new HashMap<>(); + for (Set t : transactions) { + for (Integer item : t) { + itemCounts.compute(item, (k, v) -> v == null ? 
1 : v + 1); + } + } + + for (Map.Entry e : itemCounts.entrySet()) { + if (e.getValue() >= countRequiredForSupport) { + frequentItemCounts.put(e.getKey(), e.getValue()); + } + } + + sortFrequentItems(); + } + + private void sortFrequentItems() { + // we have to materialize a canonical order so that items with equal counts + // are consistently ordered when they are sorted during transaction insertion + List> sortedItemCounts = new ArrayList<>(frequentItemCounts.entrySet()); + sortedItemCounts.sort((i1, i2) -> frequentItemCounts.get(i1.getKey()) + .compareTo(frequentItemCounts.get(i2.getKey()))); + for (int i = 0; i < sortedItemCounts.size(); ++i) { + frequentItemOrder.put(sortedItemCounts.get(i).getKey(), i); + } + } + + public void insertConditionalFrequentItems(List patterns, + int countRequiredForSupport) { + Map itemCounts = new HashMap<>(); + + for (ItemsetWithCount i : patterns) { + for (Integer item : i.getItems()) { + itemCounts.compute(item, (k, v) -> v == null ? i.getCount() : v + i.getCount()); + } + } + + for (Map.Entry e : itemCounts.entrySet()) { + if (e.getValue() >= countRequiredForSupport) { + frequentItemCounts.put(e.getKey(), e.getValue()); + } + } + + // we have to materialize a canonical order so that items with equal counts + // are consistently ordered when they are sorted during transaction insertion + List> sortedItemCounts = new ArrayList<>(frequentItemCounts.entrySet()); + sortedItemCounts.sort((i1, i2) -> frequentItemCounts.get(i1.getKey()) + .compareTo(frequentItemCounts.get(i2.getKey()))); + for (int i = 0; i < sortedItemCounts.size(); ++i) { + frequentItemOrder.put(sortedItemCounts.get(i).getKey(), i); + } + } + + public void insertConditionalFrequentPatterns(List patterns) { + for (ItemsetWithCount is : patterns) { + List filtered = is.getItems().stream().filter(i -> frequentItemCounts.containsKey(i)).collect( + Collectors.toList()); + filtered.sort((i1, i2) -> frequentItemOrder.get(i2).compareTo(frequentItemOrder.get(i1))); + 
root.insertTransaction(filtered, 0, is.getCount()); + } + } + + public void insertTransactions(List> transactions) { + for (Set t : transactions) { + List filtered = t.stream().filter(i -> frequentItemCounts.containsKey(i)).collect( + Collectors.toList()); + + if (!filtered.isEmpty()) { + filtered.sort((i1, i2) -> frequentItemOrder.get(i2).compareTo(frequentItemOrder.get(i1))); + root.insertTransaction(filtered, 0, 1); + } + } + } + + public int getSupport(Set pattern) { + for (Integer i : pattern) { + if (!frequentItemCounts.containsKey(i)) { + return 0; + } + } + + List plist = new ArrayList<>(pattern); + // traverse bottom to top + plist.sort((i1, i2) -> frequentItemOrder.get(i1).compareTo(frequentItemOrder.get(i2))); + + int count = 0; + FPTreeNode pathHead = nodeHeaders.get(plist.get(0)); + while (pathHead != null) { + FPTreeNode curNode = pathHead; + int itemsToFind = plist.size(); + + while (curNode != null) { + if (pattern.contains(curNode.getItem())) { + itemsToFind -= 1; + } + + if (itemsToFind == 0) { + count += pathHead.count; + break; + } + + curNode = curNode.getParent(); + } + pathHead = pathHead.getNextLink(); + } + + return count; + } + + + List mineItemsets(Integer supportCountRequired) { + List singlePathItemsets = new ArrayList<>(); + List branchingItemsets = new ArrayList<>(); + + // mine single-path itemsets first + FPTreeNode curNode = root; + FPTreeNode nodeOfBranching = null; + Set singlePathNodes = new HashSet<>(); + while (true) { + if (curNode.children != null && curNode.children.size() > 1) { + nodeOfBranching = curNode; + break; + } + + if (curNode != root) { + singlePathNodes.add(curNode); + } + + if (curNode.children == null || curNode.children.size() == 0) { + break; + } else { + curNode = curNode.children.get(0); + } + } + + for (Set subset : Sets.powerSet(singlePathNodes)) { + if (subset.isEmpty()) { + continue; + } + + double minSupportInSubset = -1; + Set items = new HashSet<>(); + for (FPTreeNode n : subset) { + 
items.add(n.getItem()); + + if (minSupportInSubset == -1 || n.getCount() < minSupportInSubset) { + minSupportInSubset = n.getCount(); + } + } + + assert (minSupportInSubset >= supportCountRequired); + singlePathItemsets.add(new ItemsetWithCount(items, minSupportInSubset)); + } + + // the entire tree was a single path... + if (nodeOfBranching == null) { + return singlePathItemsets; + } + + // all of the items in the single path will have been mined now + // due to the descending frequency count of the FPTree structure, so + // we remove them from consideration in the rest + + // instead of destructively removing the nodes from NodeHeader table + // which would be valid but would make mining non-idempotent, we + // instead store the nodes to skip in a separate set + + Set alreadyMinedItems = new HashSet<>(); + for (FPTreeNode node : singlePathNodes) { + alreadyMinedItems.add(node.getItem()); + } + + for (Map.Entry header : nodeHeaders.entrySet()) { + if (alreadyMinedItems.contains(header.getKey())) { + continue; + } + + // add the singleton item set + branchingItemsets.add(new ItemsetWithCount(Sets.newHashSet(header.getKey()), + frequentItemCounts.get(header.getKey()))); + + List conditionalPatternBase = new ArrayList<>(); + + // walk each "leaf" node + FPTreeNode conditionalNode = header.getValue(); + while (conditionalNode != null) { + final double leafSupport = conditionalNode.getCount(); + + // walk the tree up to the branch node + Set conditionalPattern = new HashSet<>(); + FPTreeNode walkNode = conditionalNode.getParent(); + while (walkNode != nodeOfBranching.getParent() && walkNode != root) { + conditionalPattern.add(walkNode.getItem()); + walkNode = walkNode.getParent(); + } + + if (conditionalPattern.size() > 0) { + conditionalPatternBase.add(new ItemsetWithCount(conditionalPattern, leafSupport)); + } + + conditionalNode = conditionalNode.getNextLink(); + } + + if (conditionalPatternBase.isEmpty()) { + continue; + } + + // build and mine the conditional 
FPTree + FPTree conditionalTree = new FPTree(); + conditionalTree.insertConditionalFrequentItems(conditionalPatternBase, supportCountRequired); + conditionalTree.insertConditionalFrequentPatterns(conditionalPatternBase); + List conditionalFrequentItemsets = conditionalTree.mineItemsets(supportCountRequired); + + if (!conditionalFrequentItemsets.isEmpty()) { + for (ItemsetWithCount is : conditionalFrequentItemsets) { + is.getItems().add(header.getKey()); + } + + branchingItemsets.addAll(conditionalFrequentItemsets); + } + } + + if (singlePathItemsets.isEmpty()) { + return branchingItemsets; + } + + // take the cross product of the mined itemsets + List ret = new ArrayList<>(); + + ret.addAll(singlePathItemsets); + ret.addAll(branchingItemsets); + + for (ItemsetWithCount i : singlePathItemsets) { + for (ItemsetWithCount j : branchingItemsets) { + Set combinedItems = new HashSet<>(); + combinedItems.addAll(i.getItems()); + combinedItems.addAll(j.getItems()); + + ret.add(new ItemsetWithCount(combinedItems, Math.min(i.getCount(), j.getCount()))); + } + } + + return ret; + } + } + + + public List getItemsetsWithSupportRatio(List> transactions, + Double supportRatio) { + return getItemsetsWithSupportRatio(transactions, null, supportRatio); + } + + public List getItemsetsWithSupportRatio(List> transactions, + Map initialCounts, + Double supportRatio) { + return getItemsetsWithSupportCount(transactions, initialCounts, supportRatio * transactions.size()); + } + + public List getItemsetsWithSupportCount(List> transactions, + Double supportCount) { + return getItemsetsWithSupportCount(transactions, null, supportCount); + + } + + public List getItemsetsWithSupportCount(List> transactions, + Map initialCounts, + Double supportCount) { + return getItemsetsWithSupportCount(transactions, initialCounts, supportCount, false); + } + + protected FPTree constructTree(List> transactions, int supportCount) { + FPTree fp = new FPTree(); + fp.insertFrequentItems(transactions, supportCount); 
+ fp.insertTransactions(transactions); + return fp; + } + + public List getItemsetsWithSupportCount(List> transactions, + Map initialCounts, + Double supportCount, + boolean printTreeDebug) { + FPTree fp = new FPTree(); + int countRequiredForSupport = supportCount.intValue(); +// log.debug("count required: {}", countRequiredForSupport); + + long st = System.currentTimeMillis(); + + if (initialCounts == null) { + fp.insertFrequentItems(transactions, countRequiredForSupport); + } else { + fp.setFrequentCounts(initialCounts); + } + + fp.insertFrequentItems(transactions, countRequiredForSupport); + fp.insertTransactions(transactions); + long en = System.currentTimeMillis(); + +// log.debug("FPTree load: {}", en - st); + + //fp.printTreeDebug(); + + st = System.currentTimeMillis(); + List ret = fp.mineItemsets(countRequiredForSupport); + en = System.currentTimeMillis(); + +// log.debug("FPTree mine: {}", en - st); + + return ret; + } + + // ugh, this is a really ugly function sig, but it's efficient + public List getCounts( + List> transactions, + Map initialCounts, + Set targetItems, + List toCount) { + FPTree countTree = new FPTree(); + + Map frequentCounts = new HashMap<>(); + + for (Integer i : targetItems) { + Double initialCount = initialCounts.get(i); + if (initialCount == null) { + initialCount = 0.; + } + frequentCounts.put(i, initialCount); + } + + countTree.setFrequentCounts(frequentCounts); + countTree.insertTransactions(transactions); + + List ret = new ArrayList<>(); + for (ItemsetWithCount c : toCount) { + ret.add(new ItemsetWithCount(c.getItems(), countTree.getSupport(c.getItems()))); + } + + return ret; + } +} diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/FPGrowthEmerging.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/FPGrowthEmerging.java new file mode 100644 index 000000000..02669039c --- /dev/null +++ 
b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/FPGrowthEmerging.java @@ -0,0 +1,184 @@ +package edu.stanford.futuredata.macrobase.analysis.summary.itemset; + +import com.google.common.collect.Sets; +import edu.stanford.futuredata.macrobase.analysis.summary.count.ExactCount; +import edu.stanford.futuredata.macrobase.analysis.summary.itemset.result.ItemsetResult; +import edu.stanford.futuredata.macrobase.analysis.summary.itemset.result.ItemsetWithCount; + +import java.util.*; + + +public class FPGrowthEmerging { + private boolean combinationsEnabled = true; + + public FPGrowthEmerging() {}; + public FPGrowthEmerging setCombinationsEnabled(boolean flag) { + this.combinationsEnabled = flag; + return this; + } + + + private List getSingletonItemsets(List> inliers, + List> outliers, + double minSupport, + double minRatio) { + int supportCountRequired = (int) (outliers.size() * minSupport); + + List ret = new ArrayList<>(); + + Map inlierCounts = new ExactCount().count(inliers).getCounts(); + Map outlierCounts = new ExactCount().count(outliers).getCounts(); + + for (Map.Entry attrOutlierCountEntry : outlierCounts.entrySet()) { + if (attrOutlierCountEntry.getValue() < supportCountRequired) { + continue; + } + + int item = attrOutlierCountEntry.getKey(); + Double attrInlierCount = inlierCounts.get(item); + + double ratio = RiskRatio.compute(attrInlierCount, + attrOutlierCountEntry.getValue(), + inliers.size(), + outliers.size()); + + if (ratio > minRatio) { + ret.add(new ItemsetResult( + attrOutlierCountEntry.getValue() / outliers.size(), + attrOutlierCountEntry.getValue(), + ratio, + Collections.singleton(item) + )); + } + } + return ret; + } + + public List getEmergingItemsetsWithMinSupport(List> inliers, + List> outliers, + double minSupport, + double minRatio) { + if (!combinationsEnabled || (inliers.size() > 0 && inliers.get(0).size() == 1)) { + return getSingletonItemsets(inliers, outliers, minSupport, minRatio); + } + + ArrayList> 
outlierTransactions = new ArrayList<>(); + + Map inlierCounts = new ExactCount().count(inliers).getCounts(); + Map outlierCounts = new ExactCount().count(outliers).getCounts(); + + Map supportedOutlierCounts = new HashMap<>(); + + int supportCountRequired = (int) (outliers.size() * minSupport); + + for (Set o: outliers) { + Set txn = null; + + for (int i : o) { + double outlierCount = outlierCounts.get(i); + if (outlierCount >= supportCountRequired) { + Number inlierCount = inlierCounts.get(i); + + double outlierInlierRatio = RiskRatio.compute(inlierCount, + outlierCount, + inliers.size(), + outliers.size()); + + if (outlierInlierRatio > minRatio) { + if (txn == null) { + txn = new HashSet<>(); + } + + if (!supportedOutlierCounts.containsKey(i)) { + supportedOutlierCounts.put(i, outlierCount); + } + + txn.add(i); + } + } + } + + if (txn != null) { + outlierTransactions.add(txn); + } + } + + FPGrowth fpg = new FPGrowth(); + List iwc = fpg.getItemsetsWithSupportCount( + outlierTransactions, + supportedOutlierCounts, + outliers.size() * minSupport); + + iwc.sort((x, y) -> x.getCount() != y.getCount() ? 
+ -Double.compare(x.getCount(), y.getCount()) : + -Double.compare(x.getItems().size(), y.getItems().size())); + + Set ratioItemsToCheck = new HashSet<>(); + List ratioSetsToCheck = new ArrayList<>(); + List ret = new ArrayList<>(); + + Set prevSet = null; + Double prevCount = -1.; + for (ItemsetWithCount i : iwc) { + if (i.getCount() == prevCount) { + if (prevSet != null && Sets.difference(i.getItems(), prevSet).size() == 0) { + continue; + } + } + + + prevCount = i.getCount(); + prevSet = i.getItems(); + + if (i.getItems().size() == 1) { + Number inlierCount = inlierCounts.get(i.getItems().iterator().next()); + + double ratio = RiskRatio.compute(inlierCount, + i.getCount(), + inliers.size(), + outliers.size()); + + ret.add(new ItemsetResult(i.getCount() / (double) outliers.size(), + i.getCount(), + ratio, + i.getItems())); + } else { + ratioItemsToCheck.addAll(i.getItems()); + ratioSetsToCheck.add(i); + } + } + + // check the ratios of any itemsets we just marked + FPGrowth inlierTree = new FPGrowth(); + List matchingInlierCounts = inlierTree.getCounts(inliers, + inlierCounts, + ratioItemsToCheck, + ratioSetsToCheck); + + assert (matchingInlierCounts.size() == ratioSetsToCheck.size()); + for (int i = 0; i < matchingInlierCounts.size(); ++i) { + ItemsetWithCount ic = matchingInlierCounts.get(i); + ItemsetWithCount oc = ratioSetsToCheck.get(i); + + double ratio = RiskRatio.compute(ic.getCount(), + oc.getCount(), + inliers.size(), + outliers.size()); + + if (ratio >= minRatio) { + ret.add(new ItemsetResult(oc.getCount() / (double) outliers.size(), + oc.getCount(), + ratio, + oc.getItems())); + } + } + + + // finally sort one last time + ret.sort((x, y) -> x.getNumRecords() != y.getNumRecords() ? 
+ -Double.compare(x.getNumRecords(), y.getNumRecords()) : + -Double.compare(x.getItems().size(), y.getItems().size())); + + return ret; + } +} diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/RiskRatio.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/RiskRatio.java new file mode 100644 index 000000000..c4c9ff75e --- /dev/null +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/RiskRatio.java @@ -0,0 +1,56 @@ +package edu.stanford.futuredata.macrobase.analysis.summary.itemset; + +public class RiskRatio { + private static double computeDouble(double exposedInlierCount, + double exposedOutlierCount, + double totalInliers, + double totalOutliers) { + double totalExposedCount = exposedInlierCount + exposedOutlierCount; + double unexposedOutlierCount = (totalOutliers - exposedOutlierCount); + double totalMinusExposedCount = totalInliers + totalOutliers - totalExposedCount; + + // no exposure occurred + if (totalExposedCount == 0) { + return 0; + } + + // we only exposed this ratio, everything matched! 
+ if (totalMinusExposedCount == 0) { + return 0; + } + + // all outliers had this pattern + if (unexposedOutlierCount == 0) { + return Double.POSITIVE_INFINITY; + } + + return (exposedOutlierCount / totalExposedCount) / + (unexposedOutlierCount / totalMinusExposedCount); + } + + public static double compute(Number exposedInlierCount, + Number exposedOutlierCount, + Number totalInliers, + Number totalOutliers) { + if(exposedInlierCount == null) { + exposedInlierCount = 0.; + } + + if(exposedOutlierCount == null) { + exposedOutlierCount = 0.; + } + + if(totalInliers == null) { + totalInliers = 0.; + } + + if(totalOutliers == null) { + totalOutliers = 0.; + } + + return computeDouble(exposedInlierCount.doubleValue(), + exposedOutlierCount.doubleValue(), + totalInliers.doubleValue(), + totalOutliers.doubleValue()); + } +} diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/result/AttributeSet.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/result/AttributeSet.java new file mode 100644 index 000000000..01922f657 --- /dev/null +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/result/AttributeSet.java @@ -0,0 +1,75 @@ +package edu.stanford.futuredata.macrobase.analysis.summary.itemset.result; + +import edu.stanford.futuredata.macrobase.analysis.summary.itemset.AttributeEncoder; + +import java.util.HashMap; +import java.util.Map; +import java.util.StringJoiner; + +public class AttributeSet { + private double support; + private long numRecords; + private double ratioToInliers; + private Map items = new HashMap<>(); + + public AttributeSet(ItemsetResult its, AttributeEncoder encoder) { + this.support = its.getSupport(); + this.numRecords = (long)its.getNumRecords(); + this.ratioToInliers = its.getRatioToInliers(); + its.getItems().forEach(i -> items.put(encoder.decodeColumnName(i), encoder.decodeValue(i))); + } + + public AttributeSet(double support, + double 
numRecords, + double ratioToInliers, + Map items) { + this.support = support; + this.numRecords = (long)numRecords; + this.ratioToInliers = ratioToInliers; + this.items = items; + } + + public String prettyPrint() { + StringJoiner joiner = new StringJoiner("\n"); + items.forEach((k, v) -> joiner.add(k+"="+v)); + + return String.format("support: %f\n" + + "records: %d\n" + + "ratio: %f\n" + + "\nColumns:\n%s\n\n", + support, + numRecords, + ratioToInliers, + joiner.toString()); + } + + public double getSupport() { + return support; + } + + public double getNumRecords() { + return numRecords; + } + + public double getRatioToInliers() { + return ratioToInliers; + } + + public void setRatioToInliers(double ratio) { + ratioToInliers = ratio; + } + + public Map getItems() { + return items; + } + + @Override + public String toString() { + return "AttributeSet{" + + "support=" + support + + ", numRecords=" + numRecords + + ", ratioToInliers=" + ratioToInliers + + ", items=" + items + + '}'; + } +} \ No newline at end of file diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/result/ItemsetResult.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/result/ItemsetResult.java new file mode 100644 index 000000000..6370b0beb --- /dev/null +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/result/ItemsetResult.java @@ -0,0 +1,46 @@ +package edu.stanford.futuredata.macrobase.analysis.summary.itemset.result; + +import java.util.Set; + +public class ItemsetResult { + private double support; + private double numRecords; + private double ratioToInliers; + private Set items; + + public ItemsetResult(double support, + double numRecords, + double ratioToInliers, + Set items) { + this.support = support; + this.numRecords = numRecords; + this.ratioToInliers = ratioToInliers; + this.items = items; + } + + public double getSupport() { + return support; + } + + public double getNumRecords() 
{
+        return numRecords;
+    }
+
+    public double getRatioToInliers() {
+        return ratioToInliers;
+    }
+
+    public Set getItems() {
+        return items;
+    }
+
+    @Override
+    public String toString() {
+        return "ItemsetResult{" +
+                "support=" + support +
+                ", numRecords=" + numRecords +
+                ", ratioToInliers=" + ratioToInliers +
+                ", items=" + items +
+                '}';
+    }
+}
\ No newline at end of file
diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/result/ItemsetWithCount.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/result/ItemsetWithCount.java
new file mode 100644
index 000000000..a603154b3
--- /dev/null
+++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/result/ItemsetWithCount.java
@@ -0,0 +1,55 @@
+package edu.stanford.futuredata.macrobase.analysis.summary.itemset.result;
+
+import java.util.Set;
+
+/**
+ * An itemset paired with its occurrence count.
+ * Two instances are equal when their item sets are equal and their counts
+ * round to the same whole number; hashCode() follows the same rule.
+ */
+public class ItemsetWithCount {
+    private Set items;
+    private double count;
+
+    /**
+     * @param items the items in the set
+     * @param count the count recorded for that itemset
+     */
+    public ItemsetWithCount(Set items, double count) {
+        this.items = items;
+        this.count = count;
+    }
+
+    public Set getItems() {
+        return items;
+    }
+
+    public double getCount() {
+        return count;
+    }
+
+    /**
+     * Compares by item set and by count rounded to the nearest long.
+     */
+    @Override
+    public boolean equals(Object o) {
+        if (o == null) {
+            return false;
+        } else if (!(o instanceof ItemsetWithCount)) {
+            return false;
+        }
+        final ItemsetWithCount other = (ItemsetWithCount) o;
+        return (Math.round(other.getCount()) == Math.round(count)) && (other.getItems().equals(items));
+    }
+
+    /**
+     * Hashes exactly the fields equals() compares (rounded count and items),
+     * so equal instances land in the same bucket of hash-based collections.
+     */
+    @Override
+    public int hashCode() {
+        int result = Long.hashCode(Math.round(count));
+        result = 31 * result + (items != null ? items.hashCode() : 0);
+        return result;
+    }
+}
\ No newline at end of file
diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/DataFrame.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/DataFrame.java
new file mode 100644
index 000000000..9d51efba2
--- /dev/null
+++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/DataFrame.java
@@ -0,0 +1,317 @@
+package edu.stanford.futuredata.macrobase.datamodel;
+
+import edu.stanford.futuredata.macrobase.util.MacrobaseInternalError;
+
+import
java.util.ArrayList;
+import java.util.List;
+import java.util.function.DoublePredicate;
+import java.util.function.Predicate;
+
+/**
+ * Column-based dataframe object.
+ * addDoubleColumn and addStringColumn mutate the dataframe in place; they and
+ * the (Schema, rows) constructor are the primary ways of loading data into it.
+ */
+public class DataFrame {
+    private Schema schema;
+
+    private ArrayList stringCols;
+    private ArrayList doubleCols;
+    private ArrayList indexToTypeIndex;
+
+    private int numRows;
+
+    public DataFrame() {
+        this.schema = new Schema();
+        this.stringCols = new ArrayList<>();
+        this.doubleCols = new ArrayList<>();
+        this.indexToTypeIndex = new ArrayList<>();
+        this.numRows = 0;
+    }
+
+    /**
+     * Creates a dataframe from a list of rows
+     * Slower than creating a dataframe column by column using addXColumn methods.
+     * @param schema Schema to use
+     * @param rows Data to load
+     */
+    public DataFrame(Schema schema, List rows) {
+        this();
+        this.schema = schema;
+        this.numRows = rows.size();
+        int d = schema.getNumColumns();
+        for (int c = 0; c < d; c++) {
+            Schema.ColType t = schema.getColumnType(c);
+            if (t == Schema.ColType.STRING) {
+                String[] colValues = new String[numRows];
+                for (int i = 0; i < numRows; i++) {
+                    colValues[i] = rows.get(i).getAs(c);
+                }
+                addStringColumnInternal(colValues);
+            } else if (t == Schema.ColType.DOUBLE) {
+                double[] colValues = new double[numRows];
+                for (int i = 0; i < numRows; i++) {
+                    colValues[i] = rows.get(i).getAs(c);
+                }
+                addDoubleColumnInternal(colValues);
+            } else {
+                throw new MacrobaseInternalError("Invalid ColType");
+            }
+        }
+    }
+
+    /**
+     * @return shallow copy of dataframe
+     */
+    public DataFrame copy() {
+        DataFrame other = new DataFrame();
+        other.schema = schema.copy();
+        other.stringCols = new ArrayList<>(stringCols);
+        other.doubleCols = new ArrayList<>(doubleCols);
+        other.indexToTypeIndex = new ArrayList<>(indexToTypeIndex);
+        other.numRows = numRows;
+        return other;
+    }
+
+    public Schema getSchema() {return this.schema;}
+
public int getNumRows() {return numRows;} + public ArrayList getDoubleCols() { return doubleCols; } + public ArrayList getStringCols() { return stringCols; } + + // Fast Column-based methods + private void addDoubleColumnInternal(double[] colValues) { + doubleCols.add(colValues); + indexToTypeIndex.add(doubleCols.size()-1); + } + public DataFrame addDoubleColumn(String colName, double[] colValues) { + if (numRows == 0) { + numRows = colValues.length; + } + schema.addColumn(Schema.ColType.DOUBLE, colName); + addDoubleColumnInternal(colValues); + return this; + } + private void addStringColumnInternal(String[] colValues) { + stringCols.add(colValues); + indexToTypeIndex.add(stringCols.size()-1); + } + public DataFrame addStringColumn(String colName, String[] colValues) { + if (numRows == 0) { + numRows = colValues.length; + } + schema.addColumn(Schema.ColType.STRING, colName); + addStringColumnInternal(colValues); + return this; + } + + protected int[] getSubIndices(List columns) { + int d = columns.size(); + int[] typeSubIndices = new int[d]; + for (int i = 0; i < d; i++) { + typeSubIndices[i] = indexToTypeIndex.get(columns.get(i)); + } + return typeSubIndices; + } + + public double[] getDoubleColumn(int columnIdx) { + return doubleCols.get(indexToTypeIndex.get(columnIdx)); + } + public double[] getDoubleColumnByName(String columnName) { + return doubleCols.get(indexToTypeIndex.get(schema.getColumnIndex(columnName))); + } + public ArrayList getDoubleCols(List columns) { + ArrayList cols = new ArrayList<>(); + for (int c : columns) { + cols.add(getDoubleColumn(c)); + } + return cols; + } + public ArrayList getDoubleColsByName(List columns) { + return getDoubleCols(this.schema.getColumnIndices(columns)); + } + public String[] getStringColumn(int columnIdx) { + return stringCols.get(indexToTypeIndex.get(columnIdx)); + } + public String[] getStringColumnByName(String columnName) { + return stringCols.get(indexToTypeIndex.get(schema.getColumnIndex(columnName))); + } + 
public ArrayList getStringCols(List columns) { + ArrayList cols = new ArrayList<>(); + for (int c : columns) { + cols.add(getStringColumn(c)); + } + return cols; + } + public ArrayList getStringColsByName(List columns) { + return getStringCols(this.schema.getColumnIndices(columns)); + } + + /** + * @param columns column indices to project + * @return new dataframe with subset of columns + */ + public DataFrame select(List columns) { + DataFrame other = new DataFrame(); + for (int c : columns) { + String columnName = schema.getColumnName(c); + Schema.ColType t = schema.getColumnType(c); + if (t == Schema.ColType.STRING) { + other.addStringColumn(columnName, getStringColumn(c)); + } else if (t == Schema.ColType.DOUBLE) { + other.addDoubleColumn(columnName, getDoubleColumn(c)); + } else { + throw new MacrobaseInternalError("Bad Column Type"); + } + } + return other; + } + + /** + * @param columns column names to project + * @return new dataframe with subset of columns + */ + public DataFrame selectByName(List columns) { + return select(this.schema.getColumnIndices(columns)); + } + + /** + * @param mask rows to select + * @return new dataframe with subset of rows + */ + protected DataFrame filter(boolean[] mask) { + DataFrame other = new DataFrame(); + + int d = schema.getNumColumns(); + int numTrue = 0; + for (int i = 0; i < numRows; i++) { + if (mask[i]) { + numTrue++; + } + } + for (int c = 0; c < d; c++) { + Schema.ColType t = schema.getColumnType(c); + String columnName = schema.getColumnName(c); + if (t == Schema.ColType.STRING) { + String[] oldColumn = getStringColumn(c); + String[] newColumn = new String[numTrue]; + int j = 0; + for (int i = 0; i < numRows; i++) { + if (mask[i]) { + newColumn[j] = oldColumn[i]; + j++; + } + } + other.addStringColumn(columnName, newColumn); + } else if (t == Schema.ColType.DOUBLE) { + double[] oldColumn = getDoubleColumn(c); + double[] newColumn = new double[numTrue]; + int j = 0; + for (int i = 0; i < numRows; i++) { + if 
(mask[i]) { + newColumn[j] = oldColumn[i]; + j++; + } + } + other.addDoubleColumn(columnName, newColumn); + } else { + throw new MacrobaseInternalError("Bad Column Type"); + } + } + return other; + } + public DataFrame filter(int columnIdx, Predicate filter) { + String[] filterColumn = getStringColumn(columnIdx); + boolean[] mask = new boolean[numRows]; + for (int i = 0; i < numRows; i++) { + mask[i] = filter.test(filterColumn[i]); + } + return filter(mask); + } + public DataFrame filter(String columnName, Predicate filter) { + return filter(schema.getColumnIndex(columnName), filter); + } + + /** + * @param columnIdx column index to filter by + * @param filter predicate to test each column value + * @return new dataframe with subset of rows + */ + public DataFrame filter(int columnIdx, DoublePredicate filter) { + double[] filterColumn = getDoubleColumn(columnIdx); + boolean[] mask = new boolean[numRows]; + for (int i = 0; i < numRows; i++) { + mask[i] = filter.test(filterColumn[i]); + } + return filter(mask); + } + + /** + * @param columnName column name to filter by + * @param filter predicate to test each column value + * @return new dataframe with subset of rows + */ + public DataFrame filter(String columnName, DoublePredicate filter) { + return filter(schema.getColumnIndex(columnName), filter); + } + + public Row getRow(int rowIdx) { + int d = schema.getNumColumns(); + ArrayList rowValues = new ArrayList<>(d); + for (int c = 0; c < d; c++) { + Schema.ColType t = schema.getColumnType(c); + int typeSubIndex = indexToTypeIndex.get(c); + if (t == Schema.ColType.STRING) { + rowValues.add(stringCols.get(typeSubIndex)[rowIdx]); + } else if (t == Schema.ColType.DOUBLE) { + rowValues.add(doubleCols.get(typeSubIndex)[rowIdx]); + } else { + throw new MacrobaseInternalError("Bad ColType"); + } + } + Row r = new Row(schema, rowValues); + return r; + } + public List getRows() { + List rows = new ArrayList<>(); + for (int rowIdx = 0; rowIdx < numRows; rowIdx++) { + 
rows.add(getRow(rowIdx));
+        }
+        return rows;
+    }
+    public ArrayList getDoubleRows(List columns) {
+        ArrayList rows = new ArrayList<>(this.numRows);
+        int d = columns.size();
+        int[] typeSubIndices = getSubIndices(columns);
+
+        for (int i = 0; i < this.numRows; i++) {
+            double[] curRow = new double[d];
+            for (int j = 0; j < d; j++) {
+                int colSubIndex = typeSubIndices[j];
+                curRow[j] = doubleCols.get(colSubIndex)[i];
+            }
+            rows.add(curRow);
+        }
+        return rows;
+    }
+    public ArrayList getStringRows(List columns) {
+        ArrayList rows = new ArrayList<>(this.numRows);
+        int d = columns.size();
+        int[] typeSubIndices = getSubIndices(columns);
+
+        for (int i = 0; i < this.numRows; i++) {
+            String[] curRow = new String[d];
+            for (int j = 0; j < d; j++) {
+                int colSubIndex = typeSubIndices[j];
+                curRow[j] = stringCols.get(colSubIndex)[i];
+            }
+            rows.add(curRow);
+        }
+        return rows;
+    }
+    public ArrayList getDoubleRowsByName(List columns) {
+        return getDoubleRows(this.schema.getColumnIndices(columns));
+    }
+    public ArrayList getStringRowsByName(List columns) {
+        return getStringRows(this.schema.getColumnIndices(columns));
+    }
+}
diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/Row.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/Row.java
new file mode 100644
index 000000000..349d8894b
--- /dev/null
+++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/Row.java
@@ -0,0 +1,90 @@
+package edu.stanford.futuredata.macrobase.datamodel;
+
+import java.util.ArrayList;
+import java.util.List;
+
+/**
+ * Format for import / export small batches.
+ * A row is an ordered list of cell values, optionally paired with the Schema
+ * that names its columns. Equality and hashing depend on the values only.
+ */
+public class Row {
+    private Schema schema; // not set by user
+    private List vals;
+
+    public Row(Schema schema, List vals) {
+        this.schema = schema;
+        this.vals = vals;
+    }
+    public Row(List vals) {
+        this.schema = null;
+        this.vals = vals;
+    }
+
+    public List getVals() {
+        return this.vals;
+    }
+
+    /**
+     * @param i position of the value within the row
+     * @return the value at position i, cast unchecked to the caller's type
+     */
+    @SuppressWarnings("unchecked")
+    public T getAs(int i) {
+        return (T)vals.get(i);
+    }
+
+    /**
+     * @param colName column to look up through the row's schema
+     * @return the value in that column, cast unchecked to the caller's type
+     * @throws RuntimeException if the row was built without a schema
+     */
+    @SuppressWarnings("unchecked")
+    public T getAs(String colName) {
+        if (schema == null) {
+            throw new RuntimeException("No Schema");
+        } else {
+            return (T)vals.get(schema.getColumnIndex(colName));
+        }
+    }
+
+    @Override
+    public boolean equals(Object o) {
+        if (this == o) return true;
+        if (o == null || getClass() != o.getClass()) return false;
+
+        Row row = (Row) o;
+        return vals != null ? vals.equals(row.vals) : row.vals == null;
+    }
+
+    @Override
+    public int hashCode() {
+        return (vals != null ? vals.hashCode() : 0);
+    }
+
+    /**
+     * Renders the values as a bracketed list. Doubles whose default text is
+     * longer than 7 characters are shortened with the "%.6g" format.
+     * Null values (and a null value list) print as "null" rather than throwing,
+     * matching the null tolerance of equals() and hashCode().
+     */
+    @Override
+    public String toString() {
+        if (vals == null) {
+            return "null";
+        }
+        ArrayList strs = new ArrayList<>(vals.size());
+        for (Object o : vals) {
+            // String.valueOf tolerates null cells; o.toString() would throw
+            String s = String.valueOf(o);
+            if (o instanceof Double) {
+                if (s.length() > 7) {
+                    double v = ((Double) o).doubleValue();
+                    s = String.format("%.6g", v);
+                }
+            }
+            strs.add(s);
+        }
+        return strs.toString();
+    }
+}
diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/Schema.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/Schema.java
new file mode 100644
index 000000000..7ed7733d9
--- /dev/null
+++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/Schema.java
@@ -0,0 +1,71 @@
+package edu.stanford.futuredata.macrobase.datamodel;
+
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+
+/**
+ * Provides column names, types, and order
+ */
+public class Schema {
+    public enum ColType {
+        STRING,
+        DOUBLE
+    }
+    private ArrayList columnNames;
+    private ArrayList columnTypes;
+    private HashMap columnIndices;
+
+    public Schema() {
+        this.columnNames = new ArrayList<>();
+        this.columnTypes = new ArrayList<>();
+        this.columnIndices = new HashMap<>();
+    }
+    public Schema copy() {
+        Schema other = new Schema();
+        other.columnNames = new ArrayList<>(columnNames);
+        other.columnTypes = new ArrayList<>(columnTypes);
+        other.columnIndices = new HashMap<>(columnIndices);
+        return other;
+    }
+
+    public int getNumColumns() {
+        return columnNames.size();
+    }
+
public int getColumnIndex(String s) {
+        return columnIndices.get(s);
+    }
+    public ArrayList getColumnIndices(List columns) {
+        ArrayList indices = new ArrayList<>(columns.size());
+        for (String colName: columns) {
+            indices.add(getColumnIndex(colName));
+        }
+        return indices;
+    }
+    public String getColumnName(int i) {
+        return columnNames.get(i);
+    }
+    public List getColumnNamesByType(ColType type) {
+        List names = new ArrayList<>();
+        for (int i = 0; i < columnNames.size(); i ++) {
+            if (getColumnType(i).equals(type)) {
+                names.add(getColumnName(i));
+            }
+        }
+        return names;
+    }
+    public ColType getColumnType(int i) {
+        return columnTypes.get(i);
+    }
+    public ColType getColumnTypeByName(String s) {
+        return getColumnType(getColumnIndex(s));
+    }
+
+    public Schema addColumn(ColType t, String colName) {
+        int nextIdx = columnNames.size();
+        this.columnNames.add(colName);
+        this.columnTypes.add(t);
+        this.columnIndices.put(colName, nextIdx);
+        return this;
+    }
+}
diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/ingest/CSVDataFrameLoader.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/ingest/CSVDataFrameLoader.java
new file mode 100644
index 000000000..3b091b338
--- /dev/null
+++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/ingest/CSVDataFrameLoader.java
@@ -0,0 +1,100 @@
+package edu.stanford.futuredata.macrobase.ingest;
+
+import edu.stanford.futuredata.macrobase.datamodel.DataFrame;
+import edu.stanford.futuredata.macrobase.datamodel.Row;
+import edu.stanford.futuredata.macrobase.datamodel.Schema;
+import org.apache.commons.csv.CSVFormat;
+import org.apache.commons.csv.CSVParser;
+import org.apache.commons.csv.CSVRecord;
+
+import java.io.File;
+import java.nio.charset.Charset;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.Map;
+
+public class CSVDataFrameLoader implements DataFrameLoader {
+    private String fileName;
+    private Map columnTypes;
+    private int badRecords;
+
+    public CSVDataFrameLoader(String fileName){
+        this.fileName = fileName;
+        this.columnTypes = new HashMap<>();
+    }
+    @Override
+    public DataFrameLoader setColumnTypes(Map types) {
+        this.columnTypes = types;
+        return this;
+    }
+
+    /**
+     * Parses the CSV file named at construction into a DataFrame.
+     * The header row supplies the column names; a column with no entry in the
+     * configured type map is read as STRING. A record containing a DOUBLE field
+     * that fails Double.parseDouble is dropped and counted in badRecords.
+     * The parser is closed before this method returns, whether it returns
+     * normally or throws.
+     *
+     * @return dataframe built from the records that parsed cleanly
+     * @throws Exception whatever the CSV parser raises while opening or reading
+     */
+    @Override
+    public DataFrame load() throws Exception {
+        File csvFile = new File(fileName);
+        // CSVParser is Closeable; try-with-resources releases the file handle
+        // even when parsing fails part-way through.
+        try (CSVParser csvParser = CSVParser.parse(
+                csvFile,
+                Charset.defaultCharset(),
+                CSVFormat.DEFAULT.withHeader()
+        )) {
+            Map headerMap = csvParser.getHeaderMap();
+            int numColumns = headerMap.size();
+
+            String[] columnNameList = new String[numColumns];
+            Schema.ColType[] columnTypeList = new Schema.ColType[numColumns];
+            for (String columnName: headerMap.keySet()) {
+                int columnIndex = headerMap.get(columnName);
+                Schema.ColType t = columnTypes.getOrDefault(columnName, Schema.ColType.STRING);
+                columnNameList[columnIndex] = columnName;
+                columnTypeList[columnIndex] = t;
+            }
+            // Make sure to generate the schema in the right order
+            Schema schema = new Schema();
+            for (int c = 0; c < numColumns; c++) {
+                schema.addColumn(columnTypeList[c], columnNameList[c]);
+            }
+
+            this.badRecords = 0;
+            ArrayList rows = new ArrayList<>();
+            for (CSVRecord record : csvParser) {
+                try {
+                    ArrayList rowFields = new ArrayList<>(numColumns);
+                    for (int c = 0; c < numColumns; c++) {
+                        Schema.ColType t = columnTypeList[c];
+                        String rowValue = record.get(c);
+                        if (t == Schema.ColType.STRING) {
+                            rowFields.add(rowValue);
+                        } else if (t == Schema.ColType.DOUBLE) {
+                            rowFields.add(Double.parseDouble(rowValue));
+                        } else {
+                            throw new RuntimeException("Bad ColType");
+                        }
+                    }
+                    rows.add(new Row(rowFields));
+                } catch (NumberFormatException e) {
+                    // malformed numeric field: skip the record, keep the tally
+                    this.badRecords++;
+                }
+            }
+
+            DataFrame df = new DataFrame(schema, rows);
+            return df;
+        }
+    }
+
+    public int getBadRecords() {
+        return badRecords;
+    }
+}
diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/ingest/DataFrameLoader.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/ingest/DataFrameLoader.java
new file mode 100644
index
000000000..66fa187c4 --- /dev/null +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/ingest/DataFrameLoader.java @@ -0,0 +1,11 @@ +package edu.stanford.futuredata.macrobase.ingest; + +import edu.stanford.futuredata.macrobase.datamodel.DataFrame; +import edu.stanford.futuredata.macrobase.datamodel.Schema; + +import java.util.Map; + +public interface DataFrameLoader { + DataFrameLoader setColumnTypes(Map types); + DataFrame load() throws Exception; +} diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/operator/Operator.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/operator/Operator.java new file mode 100644 index 000000000..d7e058d16 --- /dev/null +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/operator/Operator.java @@ -0,0 +1,6 @@ +package edu.stanford.futuredata.macrobase.operator; + +public interface Operator { + void process(I input) throws Exception; + O getResults(); +} diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/operator/Transformer.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/operator/Transformer.java new file mode 100644 index 000000000..6b06285ea --- /dev/null +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/operator/Transformer.java @@ -0,0 +1,6 @@ +package edu.stanford.futuredata.macrobase.operator; + +import edu.stanford.futuredata.macrobase.datamodel.DataFrame; + +public interface Transformer extends Operator { +} diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/util/MacrobaseException.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/util/MacrobaseException.java new file mode 100644 index 000000000..213b4baa3 --- /dev/null +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/util/MacrobaseException.java @@ -0,0 +1,7 @@ +package edu.stanford.futuredata.macrobase.util; + +public class MacrobaseException extends Exception { + public MacrobaseException(String message) { + super(message); + } +} diff --git 
a/lib/src/main/java/edu/stanford/futuredata/macrobase/util/MacrobaseInternalError.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/util/MacrobaseInternalError.java new file mode 100644 index 000000000..48ae3f5e1 --- /dev/null +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/util/MacrobaseInternalError.java @@ -0,0 +1,7 @@ +package edu.stanford.futuredata.macrobase.util; + +public class MacrobaseInternalError extends RuntimeException { + public MacrobaseInternalError(String message) { + super(message); + } +} diff --git a/lib/src/test/java/edu/stanford/futuredata/macrobase/SupervisedEventTest.java b/lib/src/test/java/edu/stanford/futuredata/macrobase/SupervisedEventTest.java new file mode 100644 index 000000000..4a7f17caa --- /dev/null +++ b/lib/src/test/java/edu/stanford/futuredata/macrobase/SupervisedEventTest.java @@ -0,0 +1,121 @@ +package edu.stanford.futuredata.macrobase; + +import edu.stanford.futuredata.macrobase.analysis.summary.BatchSummarizer; +import edu.stanford.futuredata.macrobase.analysis.summary.Explanation; +import edu.stanford.futuredata.macrobase.analysis.summary.itemset.result.AttributeSet; +import edu.stanford.futuredata.macrobase.datamodel.DataFrame; +import edu.stanford.futuredata.macrobase.datamodel.Row; +import edu.stanford.futuredata.macrobase.datamodel.Schema; +import org.junit.Before; +import org.junit.Test; + +import java.util.*; +import java.util.function.Predicate; + +import static org.junit.Assert.assertEquals; + +public class SupervisedEventTest { + private List> events; + + @Before + public void setUp() { + events = new ArrayList<>(); + for (int i = 0; i < 900; i++) { + Map event = new HashMap<>(); + event.put("serverID", "s"+(i%20)); + event.put("region", "r"+(i%7)); + event.put("sev", "debug"); + events.add(event); + } + for (int i = 0; i < 100; i++) { + Map event = new HashMap<>(); + event.put("serverID", "s"+(i%2)); + event.put("region", "r3"); + event.put("sev", "error"); + events.add(event); + } + 
Collections.shuffle(events); + } + + class Explainer { + public List attributes; + public Predicate> isOutlier; + public static final String outlierColumn = "_OUTLIER"; + + public boolean useAttributeCombinations = true; + public double minSupport = 0.05; + public double minIORatio = 3.0; + + public Explainer(List attributes, Predicate> isOutlier) { + this.attributes = attributes; + this.isOutlier = isOutlier; + } + public Explainer setUseAttributeCombinations(boolean flag) { + this.useAttributeCombinations = flag; + return this; + } + public Explainer setMinSupport(double minSupport) { + this.minSupport = minSupport; + return this; + } + public Explainer setMinIORatio(double minIORatio) { + this.minIORatio = minIORatio; + return this; + } + + public DataFrame prepareBatch(List> events) throws Exception { + int n = events.size(); + Schema schema = new Schema(); + schema.addColumn(Schema.ColType.DOUBLE, Explainer.outlierColumn); + for (String attr: attributes) { + schema.addColumn(Schema.ColType.STRING, attr); + } + + List rows = new ArrayList<>(n); + for (Map event : events) { + List fields = new ArrayList<>(); + fields.add(isOutlier.test(event) ? 
1.0 : 0.0); + for (String attr: attributes) { + fields.add(event.getOrDefault(attr, "MISSING").toString()); + } + rows.add(new Row(fields)); + } + + DataFrame df = new DataFrame(schema, rows); + return df; + } + + public Explanation predictBatch(DataFrame batch) throws Exception { + BatchSummarizer summ = new BatchSummarizer(); + summ.setAttributes(attributes); + summ.process(batch); + + return summ.getResults(); + } + public Explanation getResults(List> events) throws Exception { + return predictBatch(prepareBatch(events)); + } + } + + @Test + public void testGetSummaries() throws Exception { + List attributes = Arrays.asList("serverID", "region"); + Explainer e = new Explainer(attributes, event -> "error".equals(event.get("sev"))); + DataFrame df = e.prepareBatch(events); + assertEquals(1000,df.getNumRows()); + + Explanation s = e.predictBatch(df); + assertEquals(100, s.getNumOutliers()); + assertEquals(900, s.getNumInliers()); + List is = s.getItemsets(); + assertEquals(5, is.size()); + int numSingleton = 0; + for (AttributeSet itemResult : is) { + Map curItems = itemResult.getItems(); + if (curItems.size() == 1) { + numSingleton++; + } + } + assertEquals(3, numSingleton); + } +} diff --git a/lib/src/test/java/edu/stanford/futuredata/macrobase/UnsupervisedCSVTest.java b/lib/src/test/java/edu/stanford/futuredata/macrobase/UnsupervisedCSVTest.java new file mode 100644 index 000000000..56625fa43 --- /dev/null +++ b/lib/src/test/java/edu/stanford/futuredata/macrobase/UnsupervisedCSVTest.java @@ -0,0 +1,126 @@ +package edu.stanford.futuredata.macrobase; + +import edu.stanford.futuredata.macrobase.analysis.classify.PercentileClassifier; +import edu.stanford.futuredata.macrobase.analysis.summary.BatchSummarizer; +import edu.stanford.futuredata.macrobase.analysis.summary.Explanation; +import edu.stanford.futuredata.macrobase.datamodel.DataFrame; +import edu.stanford.futuredata.macrobase.datamodel.Schema; +import 
edu.stanford.futuredata.macrobase.ingest.CSVDataFrameLoader; +import edu.stanford.futuredata.macrobase.ingest.DataFrameLoader; +import org.junit.Before; +import org.junit.Test; + +import java.util.Arrays; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +/** + * This test looks at data with 1000 inliers and 20 outliers. + * The outliers have lower usage and all have + * location=CAN, version=v3 + */ +public class UnsupervisedCSVTest { + private DataFrame df; + + @Before + public void setUp() throws Exception { + Map schema = new HashMap<>(); + schema.put("usage", Schema.ColType.DOUBLE); + schema.put("latency", Schema.ColType.DOUBLE); + schema.put("location", Schema.ColType.STRING); + schema.put("version", Schema.ColType.STRING); + DataFrameLoader loader = new CSVDataFrameLoader( + "src/test/resources/sample.csv" + ).setColumnTypes(schema); + df = loader.load(); + } + + @Test + public void testGetSummaries() throws Exception { + PercentileClassifier pc = new PercentileClassifier("usage") + .setPercentile(1.0); + pc.process(df); + DataFrame df_classified = pc.getResults(); + + List explanationAttributes = Arrays.asList( + "location", + "version" + ); + BatchSummarizer summ = new BatchSummarizer() + .setAttributes(explanationAttributes); + summ.process(df_classified); + Explanation results = summ.getResults(); + assertEquals(3, results.getItemsets().size()); + } + + @Test + public void testCustomizedSummaries() throws Exception { + PercentileClassifier pc = new PercentileClassifier("usage") + .setPercentile(1.0); + pc.process(df); + DataFrame df_classified = pc.getResults(); + + List explanationAttributes = Arrays.asList( + "location", + "version" + ); + // Increase risk ratio + BatchSummarizer summ = new BatchSummarizer() + .setAttributes(explanationAttributes) + .setMinRiskRatio(5.0); + summ.process(df_classified); + Explanation results = 
summ.getResults(); + assertEquals(1, results.getItemsets().size()); + + // Increase support requirement + summ = new BatchSummarizer() + .setAttributes(explanationAttributes) + .setMinSupport(0.55); + summ.process(df_classified); + results = summ.getResults(); + assertEquals(2, results.getItemsets().size()); + + // Restrict to only simple explanations + summ = new BatchSummarizer() + .setAttributes(explanationAttributes) + .setUseAttributeCombinations(false); + summ.process(df_classified); + results = summ.getResults(); + assertEquals(2, results.getItemsets().size()); + + // Invert outlier classes + summ = new BatchSummarizer() + .setAttributes(explanationAttributes) + .setOutlierPredicate(d -> d == 0.0); + summ.process(df_classified); + results = summ.getResults(); + assertEquals(1000, results.getNumOutliers()); + assertEquals(0, results.getItemsets().size()); + } + + @Test + public void testCustomizedClassifier() throws Exception { + PercentileClassifier pc = new PercentileClassifier("usage") + .setPercentile(2.0) + .setIncludeHigh(false) + .setIncludeLow(true); + pc.process(df); + DataFrame df_classified = pc.getResults(); + + List explanationAttributes = Arrays.asList( + "location", + "version" + ); + BatchSummarizer summ = new BatchSummarizer() + .setAttributes(explanationAttributes) + .setUseAttributeCombinations(false); + summ.process(df_classified); + Explanation results = summ.getResults(); + assertEquals(2, results.getItemsets().size()); + assertTrue(results.getItemsets().get(0).getSupport() > .9); + } +} diff --git a/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/classify/PercentileClassifierTest.java b/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/classify/PercentileClassifierTest.java new file mode 100644 index 000000000..5fd07c9ee --- /dev/null +++ b/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/classify/PercentileClassifierTest.java @@ -0,0 +1,65 @@ +package 
edu.stanford.futuredata.macrobase.analysis.classify; + +import edu.stanford.futuredata.macrobase.datamodel.DataFrame; +import org.junit.Before; +import org.junit.Test; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class PercentileClassifierTest { + private DataFrame df; + + @Before + public void setUp() { + df = new DataFrame(); + double[] vals = new double[1000]; + for (int i = 0; i < vals.length; i++) { + vals[i] = i; + } + df.addDoubleColumn("val", vals); + } + + @Test + public void testClassify() throws Exception { + assertEquals(1000, df.getNumRows()); + PercentileClassifier pc = new PercentileClassifier("val"); + pc.process(df); + DataFrame output = pc.getResults(); + assertEquals(df.getNumRows(), output.getNumRows()); + assertEquals(1, df.getSchema().getNumColumns()); + assertEquals(2, output.getSchema().getNumColumns()); + + DataFrame outliers = output.filter( + pc.getOutputColumnName(), (double d) -> d != 0.0 + ); + int numOutliers = outliers.getNumRows(); + assertTrue(numOutliers >= 8 && numOutliers <= 12); + double[] vals = outliers.getDoubleColumnByName("val"); + for (double val : vals) { + assertTrue(val < 10 || val > 990); + } + } + + @Test + public void testConfigure() throws Exception { + PercentileClassifier pc = new PercentileClassifier("notcolumn"); + pc.setColumnName("val") + .setIncludeHigh(false) + .setIncludeLow(true) + .setOutputColumnName("_OUT") + .setPercentile(10); + + pc.process(df); + DataFrame output = pc.getResults(); + double lowCutoff = pc.getLowCutoff(); + assertTrue(lowCutoff > 90.0 && lowCutoff < 110.0); + assertEquals(df.getNumRows(), output.getNumRows()); + + DataFrame outliers = output.filter( + "_OUT", (double d) -> d != 0.0 + ); + int numOutliers = outliers.getNumRows(); + assertTrue(numOutliers >= 90 && numOutliers <= 110); + } +} \ No newline at end of file diff --git a/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/count/ExactCountTest.java 
b/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/count/ExactCountTest.java new file mode 100644 index 000000000..c481f22fc --- /dev/null +++ b/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/count/ExactCountTest.java @@ -0,0 +1,30 @@ +package edu.stanford.futuredata.macrobase.analysis.summary.count; + +import com.google.common.collect.Sets; +import org.junit.Test; + +import java.util.*; + +import static org.junit.Assert.assertEquals; + +public class ExactCountTest { + @Test + public void testCount() { + ExactCount ec = new ExactCount(); + HashMap truth = new HashMap<>(); + + List> its = new ArrayList<>(); + for (int i = 0; i < 100; ++i) { + for (int j = 0; j < i; ++j) { + its.add(Sets.newHashSet(i)); + truth.compute(i, (k, v) -> v == null ? 1 : v + 1); + } + } + + ec.count(its); + + for (Map.Entry cnt : ec.getCounts().entrySet()) { + assertEquals(truth.get(cnt.getKey()), cnt.getValue(), 1e-10); + } + } +} diff --git a/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/Apriori.java b/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/Apriori.java new file mode 100644 index 000000000..7ce3b50f5 --- /dev/null +++ b/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/Apriori.java @@ -0,0 +1,141 @@ +package edu.stanford.futuredata.macrobase.analysis.summary.itemset; + +import edu.stanford.futuredata.macrobase.analysis.summary.itemset.result.ItemsetWithCount; + +import java.util.*; + +class Apriori { + private Set> genCandidates(List prevRound, + int desiredSize) { + Set> ret = new HashSet<>(); + for (int i = 0; i < prevRound.size(); ++i) { + for (int j = i + 1; j < prevRound.size(); ++j) { + Set combined = new HashSet<>(); + combined.addAll(prevRound.get(i).getItems()); + combined.addAll(prevRound.get(j).getItems()); + if (combined.size() == desiredSize) { + ret.add(combined); + } + } + } + + return ret; + } + + private List filterItems(List> 
transactions, + Set> candidates, + Set infrequentIndex, + int minSupportCount) { + List ret = new ArrayList<>(); + + HashMap, Integer> candidateCounts = new HashMap<>(); + + for (int i = 0; i < transactions.size(); ++i) { + if (infrequentIndex.contains(i)) { + continue; + } + + Set txn = transactions.get(i); + boolean foundSupportInTxn = false; + for (Set candidate : candidates) { + boolean allFound = true; + for (Integer candidateItem : candidate) { + if (!txn.contains(candidateItem)) { + allFound = false; + break; + } + } + + if (allFound) { + candidateCounts.compute(candidate, (k, v) -> v == null ? 1 : v + 1); + foundSupportInTxn = true; + } + } + + if (!foundSupportInTxn) { + infrequentIndex.add(i); + } + } + + for (Map.Entry, Integer> e : candidateCounts.entrySet()) { + if (e.getValue() >= minSupportCount) { + ret.add(new ItemsetWithCount(e.getKey(), e.getValue())); + } + } + return ret; + } + + public Set getItemsets(List> transactions, + Double support) { + Set ret = new HashSet<>(); + + int minSupportCount = (int) (support * transactions.size()); + + // first round candidates are all items; just count them + HashMap itemCounts = new HashMap<>(); + for (Set t : transactions) { + for (int i : t) { + itemCounts.compute(i, (k, v) -> v == null ? 1 : v + 1); + } + } + + for (Map.Entry e : itemCounts.entrySet()) { + if (e.getValue() >= minSupportCount) { + HashSet singletonSet = new HashSet<>(); + singletonSet.add(e.getKey()); + ret.add(new ItemsetWithCount(singletonSet, e.getValue())); + } + } + + if (ret.size() == 0) { + return ret; + } + + // second round, don't explicitly construct pairs + HashMap, Integer> pairCandidateCounts = new HashMap<>(); + + for (Set t : transactions) { + List txList = new ArrayList<>(t); + for (int i = 0; i < t.size(); ++i) { + for (int j = i + 1; j < t.size(); ++j) { + HashSet pairSet = new HashSet<>(); + pairSet.add(txList.get(i)); + pairSet.add(txList.get(j)); + pairCandidateCounts.compute(pairSet, (k, v) -> v == null ? 
1 : v + 1); + } + } + } + + List pairItemsets = new ArrayList<>(); + + for (Map.Entry, Integer> e : pairCandidateCounts.entrySet()) { + if (e.getValue() >= minSupportCount) { + ItemsetWithCount ic = new ItemsetWithCount(e.getKey(), e.getValue()); + ret.add(ic); + pairItemsets.add(ic); + } + } + + if (pairItemsets.isEmpty()) { + return ret; + } + + List prevRoundItemsets = pairItemsets; + Set infrequentIndex = new HashSet<>(); + + int newSize = 3; + while (true) { + Set> candidates = genCandidates(prevRoundItemsets, newSize); + prevRoundItemsets = filterItems(transactions, + candidates, + infrequentIndex, + minSupportCount); + if (prevRoundItemsets.isEmpty()) { + return ret; + } else { + ret.addAll(prevRoundItemsets); + newSize += 1; + } + } + } +} diff --git a/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/AttributeEncoderTest.java b/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/AttributeEncoderTest.java new file mode 100644 index 000000000..1fc1f39d5 --- /dev/null +++ b/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/AttributeEncoderTest.java @@ -0,0 +1,46 @@ +package edu.stanford.futuredata.macrobase.analysis.summary.itemset; + +import org.junit.Test; + +import java.util.ArrayList; +import java.util.HashSet; +import java.util.List; +import java.util.Set; + +import static org.junit.Assert.assertEquals; + +public class AttributeEncoderTest { + private AttributeEncoder e = new AttributeEncoder(); + + @SuppressWarnings("unused") + private void printItemsets(List> results) { + for (Set itemset : results) { + System.out.println(itemset); + for (int i : itemset) { + System.out.println(e.decodeColumn(i) + ":" + e.decodeValue(i)); + } + } + } + + @Test + public void encodeColumns() throws Exception { + List columns = new ArrayList<>(); + for (int j = 0; j < 2; j ++) { + String[] curCol = new String[15]; + for (int i = 0; i < 15; i++) { + curCol[i] = String.valueOf(i % (j * 2 + 3)); + 
} + columns.add(curCol); + } + + List> results = e.encodeAttributes(columns); + assertEquals(results.size(), columns.get(0).length); + + Set totalItems = new HashSet<>(); + for (Set itemset : results) { + totalItems.addAll(itemset); + } + assertEquals(totalItems.size(), 5 + 3); + // printItemsets(results); + } +} \ No newline at end of file diff --git a/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/FPGrowthTest.java b/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/FPGrowthTest.java new file mode 100644 index 000000000..26639ce9f --- /dev/null +++ b/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/FPGrowthTest.java @@ -0,0 +1,171 @@ +package edu.stanford.futuredata.macrobase.analysis.summary.itemset; + +import edu.stanford.futuredata.macrobase.analysis.summary.itemset.result.ItemsetWithCount; +import org.junit.Test; + +import java.util.*; +import java.util.stream.Collectors; + +import static org.junit.Assert.assertEquals; + +public class FPGrowthTest { + private Set intIfy(String txnStr) { + return Arrays.stream(txnStr.split(", ")).map(s -> (int) s.charAt(0)).collect(Collectors.toSet()); + } + + @SuppressWarnings("unused") + private void printItemsets(List itemsets) { + itemsets.sort((a, b) -> b.getItems().size() - a.getItems().size()); + for (ItemsetWithCount i : itemsets) { + System.out.format("\ncount %f, size %d\n", i.getCount(), i.getItems().size()); + for (int item : i.getItems()) { + System.out.println((char) item); + } + } + } + + private boolean compareResults(Set ap_itemsets, List itemsets) { + for (int i = 0; i < itemsets.size(); i ++) { + boolean foundEquals = false; + for (ItemsetWithCount iwc : ap_itemsets) { + foundEquals |= iwc.equals(itemsets.get(i)); + } + if (!foundEquals) { return false; }; + } + return true; + } + + @Test + public void testFPFromPaper() { + List> txns = new ArrayList<>(); + txns.add(intIfy("f, a, c, d, g, i, m, p")); + txns.add(intIfy("a, b, 
c, f, l, m, o")); + txns.add(intIfy("b, f, h, j, o")); + txns.add(intIfy("b, c, k, s, p")); + txns.add(intIfy("a, f, c, e, l, p, m, n")); + + FPGrowth fp = new FPGrowth(); + Apriori ap = new Apriori(); + + Set ap_itemsets = ap.getItemsets(txns, .6); + List itemsets = fp.getItemsetsWithSupportRatio(txns, .6); + + assertEquals(18, itemsets.size()); + assert(compareResults(ap_itemsets, itemsets)); + } + + @Test + public void testFPLonger() { + + List> txns = new ArrayList<>(); + txns.add(intIfy("f, a, c, d, g, i, m, p")); + txns.add(intIfy("a, b, c, f, l, m, o")); + + FPGrowth fp = new FPGrowth(); + + List itemsets = fp.getItemsetsWithSupportRatio(txns, .2); + + Apriori ap = new Apriori(); + Set api = ap.getItemsets(txns, .2); + +// printItemsets(itemsets); + + List> apil = api.stream().map(i -> i.getItems()).collect(Collectors.toList()); + Set> dupdetector = new HashSet<>(); + + int numDup = 0; + for (Set s : apil) { + if (!dupdetector.add(s)) { + numDup++; + } + } + assertEquals(0, numDup); + + Set> iss = itemsets.stream().map(i -> i.getItems()).collect(Collectors.toSet()); + +// log.debug("DIFF: {}", Sets.difference(dupdetector, iss)); + + assertEquals(api.size(), itemsets.size()); + } + + @Test + public void simpleTest() { + List> txns = new ArrayList<>(); + txns.add(intIfy("a, b, c")); + txns.add(intIfy("a, b")); + txns.add(intIfy("a")); + + FPGrowth fp = new FPGrowth(); + Apriori ap = new Apriori(); + + Set ap_itemsets = ap.getItemsets(txns, .7); + List itemsets = fp.getItemsetsWithSupportRatio(txns, .7); + + //printItemsets(itemsets); + assertEquals(3, itemsets.size()); + assert(compareResults(ap_itemsets, itemsets)); + } + + @Test + public void dupTest() { + List> txns = new ArrayList<>(); + txns.add(intIfy("a, c, d")); + txns.add(intIfy("a, c, d, e")); + txns.add(intIfy("c")); + txns.add(intIfy("a")); + txns.add(intIfy("a")); + txns.add(intIfy("a")); + txns.add(intIfy("a")); + txns.add(intIfy("a")); + txns.add(intIfy("b")); + txns.add(intIfy("b")); + 
txns.add(intIfy("b")); + txns.add(intIfy("a, b, d")); + txns.add(intIfy("a, b, e, c")); + + FPGrowth fp = new FPGrowth(); + + List itemsets = fp.getItemsetsWithSupportCount(txns, null, .01 * txns.size(), true); + + List> apil = itemsets.stream().map(i -> i.getItems()).collect(Collectors.toList()); + Set> dupdetector = new HashSet<>(); + + int numDup = 0; + for (Set s : apil) { + if (!dupdetector.add(s)) { + numDup++; + } + } + assertEquals(0, numDup); + + //printItemsets(itemsets); + + assertEquals(dupdetector.size(), itemsets.size()); + } + + @Test + public void testGetSupport() { + List> txns = new ArrayList<>(); + txns.add(intIfy("a, c, d")); + txns.add(intIfy("a, c, d, e")); + txns.add(intIfy("c")); + txns.add(intIfy("a")); + txns.add(intIfy("a")); + txns.add(intIfy("a")); + txns.add(intIfy("a")); + txns.add(intIfy("a")); + txns.add(intIfy("b")); + txns.add(intIfy("b")); + txns.add(intIfy("b")); + txns.add(intIfy("a, b, d")); + txns.add(intIfy("a, b, e, c")); + + FPGrowth.FPTree fpt = new FPGrowth().constructTree(txns, 0); +// fpt.printTreeDebug(); + + assertEquals(2, fpt.getSupport(intIfy("a, b"))); + assertEquals(0, fpt.getSupport(intIfy("a, b, c, d"))); + + } +} + diff --git a/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/RiskRatioTest.java b/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/RiskRatioTest.java new file mode 100644 index 000000000..33f311d40 --- /dev/null +++ b/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/itemset/RiskRatioTest.java @@ -0,0 +1,35 @@ +package edu.stanford.futuredata.macrobase.analysis.summary.itemset; + +import org.junit.Test; + +import static org.junit.Assert.assertEquals; + +public class RiskRatioTest { + + @Test + public void testRatio() { + assertEquals(1.0, RiskRatio.compute(10, 10, 100, 100), 0.01); + assertEquals(6., RiskRatio.compute(10, 10, 1000, 100), 0.01); + assertEquals(900.082, RiskRatio.compute(10, 99, 1000, 100), 0.01); + } + + 
@Test + public void testRatioBoundaryConditions() { + // no exposure + assertEquals(0, RiskRatio.compute(0, 0, 100, 100), 0); + + // all exposed + assertEquals(0, RiskRatio.compute(100, 100, 100, 100), 0); + + // event only found in exposed + assertEquals(Double.POSITIVE_INFINITY, RiskRatio.compute(0, 100, 100, 100), 0); + assertEquals(Double.POSITIVE_INFINITY, RiskRatio.compute(null, 100, 100, 100), 0); + + // event never found in exposed + assertEquals(0, RiskRatio.compute(100, 0, 1000, 100), 0); + assertEquals(0, RiskRatio.compute(100, null, 1000, 100), 0); + + // handling nulls, all zeroes + assertEquals(0, RiskRatio.compute(null, null, null, null), 0); + } +} diff --git a/lib/src/test/java/edu/stanford/futuredata/macrobase/datamodel/DataFrameTest.java b/lib/src/test/java/edu/stanford/futuredata/macrobase/datamodel/DataFrameTest.java new file mode 100644 index 000000000..985d78868 --- /dev/null +++ b/lib/src/test/java/edu/stanford/futuredata/macrobase/datamodel/DataFrameTest.java @@ -0,0 +1,46 @@ +package edu.stanford.futuredata.macrobase.datamodel; + +import org.junit.Before; +import org.junit.Test; + +import java.util.Arrays; + +import static org.junit.Assert.assertEquals; + +public class DataFrameTest { + private DataFrame tinyDF; + + @Before + public void setUp() { + tinyDF = new DataFrame(); + double[] metric = {1.0, 2.0, 3.0}; + String[] attribute = {"a", "a", "b"}; + tinyDF.addDoubleColumn("metric", metric); + tinyDF.addStringColumn("attribute", attribute); + } + + @Test + public void testCreate() { + assertEquals(3, tinyDF.getNumRows()); + String[] attrColumn = tinyDF.getStringColumnByName("attribute"); + assertEquals(3, attrColumn.length); + assertEquals("a", attrColumn[0]); + Row curRow = tinyDF.getRow(0); + assertEquals(1.0, curRow.getAs("metric"), 1e-10); + } + + + @Test + public void testBulkOperations() { + DataFrame selected = tinyDF.selectByName(Arrays.asList("attribute")); + assertEquals(1, selected.getSchema().getNumColumns()); + DataFrame 
filtered = selected.filter( + "attribute", + (Object a) -> a.equals("a") + ); + assertEquals(2, filtered.getNumRows()); + + filtered = tinyDF.filter(1, (double d) -> d > 2.1); + assertEquals(1, filtered.getNumRows()); + } +} \ No newline at end of file diff --git a/lib/src/test/java/edu/stanford/futuredata/macrobase/datamodel/RowTest.java b/lib/src/test/java/edu/stanford/futuredata/macrobase/datamodel/RowTest.java new file mode 100644 index 000000000..b88962556 --- /dev/null +++ b/lib/src/test/java/edu/stanford/futuredata/macrobase/datamodel/RowTest.java @@ -0,0 +1,40 @@ +package edu.stanford.futuredata.macrobase.datamodel; + +import org.junit.Test; + +import java.util.Arrays; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class RowTest { + @Test + public void testSimple() { + Row row = new Row(Arrays.asList(5.0, "java")); + assertEquals(5.0, row.getAs(0), 1e-10); + assertEquals("java", row.getAs(1)); + assertTrue(row.toString().contains("5.0")); + + Schema schema = new Schema(); + schema.addColumn(Schema.ColType.DOUBLE, "metric"); + schema.addColumn(Schema.ColType.STRING, "attribute"); + row = new Row(schema, Arrays.asList(5.0, "java")); + assertEquals(5.0, row.getAs("metric"), 1e-10); + assertEquals("java", row.getAs("attribute")); + } + + @Test + public void testCompare() { + Row row1 = new Row(Arrays.asList(5.0, "java")); + + Schema schema = new Schema(); + schema.addColumn(Schema.ColType.DOUBLE, "metric"); + schema.addColumn(Schema.ColType.STRING, "attribute"); + Row row2 = new Row(schema, Arrays.asList(5.0, "java")); + + assertEquals(row1, row2); + assertEquals(row1.hashCode(), row2.hashCode()); + assertTrue(row2.toString().contains(row1.toString())); + } + +} \ No newline at end of file diff --git a/lib/src/test/java/edu/stanford/futuredata/macrobase/datamodel/SchemaTest.java b/lib/src/test/java/edu/stanford/futuredata/macrobase/datamodel/SchemaTest.java new file mode 100644 index 000000000..7e6005522 
--- /dev/null +++ b/lib/src/test/java/edu/stanford/futuredata/macrobase/datamodel/SchemaTest.java @@ -0,0 +1,19 @@ +package edu.stanford.futuredata.macrobase.datamodel; + +import org.junit.Test; + +import static edu.stanford.futuredata.macrobase.datamodel.Schema.ColType; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class SchemaTest { + @Test + public void testSimple() throws Exception { + Schema s = new Schema(); + s.addColumn(ColType.DOUBLE, "usage"); + s.addColumn(ColType.STRING, "app_ver"); + assertTrue(s.getColumnIndex("app_ver") < 2); + assertEquals(2, s.getNumColumns()); + assertEquals(ColType.DOUBLE, s.getColumnType(s.getColumnIndex("usage"))); + } +} \ No newline at end of file diff --git a/lib/src/test/java/edu/stanford/futuredata/macrobase/ingest/DataFrameCSVLoaderTest.java b/lib/src/test/java/edu/stanford/futuredata/macrobase/ingest/DataFrameCSVLoaderTest.java new file mode 100644 index 000000000..251b5e1ea --- /dev/null +++ b/lib/src/test/java/edu/stanford/futuredata/macrobase/ingest/DataFrameCSVLoaderTest.java @@ -0,0 +1,31 @@ +package edu.stanford.futuredata.macrobase.ingest; + +import edu.stanford.futuredata.macrobase.datamodel.DataFrame; +import edu.stanford.futuredata.macrobase.datamodel.Row; +import edu.stanford.futuredata.macrobase.datamodel.Schema; +import org.junit.Test; + +import java.util.HashMap; +import java.util.Map; + +import static org.junit.Assert.assertEquals; + +public class DataFrameCSVLoaderTest { + @Test + public void testLoadSimple() throws Exception { + Map colTypes = new HashMap<>(); + colTypes.put("usage", Schema.ColType.DOUBLE); + + DataFrameLoader loader = new CSVDataFrameLoader("src/test/resources/tiny.csv") + .setColumnTypes(colTypes); + DataFrame df = loader.load(); + + assertEquals(3, df.getNumRows()); + assertEquals(3, df.getSchema().getNumColumns()); + double[] usage = df.getDoubleColumnByName("usage"); + assertEquals(usage[0], 2.0, 1e-10); + + Row row = 
df.getRow(1); + assertEquals("CAN", row.getAs("location")); + } +} \ No newline at end of file diff --git a/lib/src/test/resources/sample.csv b/lib/src/test/resources/sample.csv new file mode 100644 index 000000000..57c2b6a0a --- /dev/null +++ b/lib/src/test/resources/sample.csv @@ -0,0 +1,1021 @@ +usage,latency,location,version +30.77,238,CAN,v2 +31.28,611,CAN,v2 +31.17,768,RUS,v4 +30.94,192,AUS,v3 +35.36,401,UK,v3 +39.12,531,RUS,v4 +33.9,223,UK,v3 +40.09,582,USA,v1 +2.897,391,CAN,v3 +39.03,441,CAN,v2 +32.2,430,CAN,v1 +42.3,572,RUS,v4 +33.64,541,AUS,v4 +38.54,88,USA,v1 +25.46,918,UK,v3 +35.69,931,AUS,v3 +29.21,369,CAN,v2 +25.83,627,RUS,v4 +39.32,609,USA,v1 +41.52,39,CAN,v2 +35.41,140,AUS,v3 +32.42,166,AUS,v3 +29.84,351,AUS,v3 +46.88,836,AUS,v3 +38.59,790,AUS,v3 +38.61,548,UK,v2 +31.18,546,RUS,v4 +29.31,955,AUS,v3 +28.82,744,UK,v2 +36.5,406,USA,v1 +41.19,80,AUS,v3 +38.68,102,CAN,v2 +40.11,148,RUS,v4 +42.31,138,AUS,v3 +37.55,967,RUS,v4 +35.69,600,USA,v1 +42.1,344,AUS,v3 +30.42,890,UK,v3 +35.78,198,AUS,v4 +31.81,20,CAN,v2 +36.35,584,USA,v1 +41,699,UK,v2 +34.88,800,CAN,v1 +35.31,996,RUS,v4 +29.78,905,RUS,v4 +33.68,647,AUS,v3 +40.16,279,RUS,v4 +32.04,254,RUS,v4 +35.39,328,RUS,v4 +32.14,261,USA,v1 +28.21,979,RUS,v4 +10.56,661,CAN,v3 +43.79,887,USA,v1 +40.98,793,AUS,v4 +32.15,789,USA,v1 +36.3,588,CAN,v2 +34.76,825,CAN,v2 +36.07,204,UK,v3 +39.48,102,UK,v3 +8.187,354,CAN,v3 +38.07,953,AUS,v3 +32.77,329,UK,v3 +33.68,465,UK,v2 +39.5,864,RUS,v4 +32.28,379,CAN,v2 +33.29,932,UK,v3 +29.8,874,USA,v1 +31.13,437,CAN,v1 +36.8,454,RUS,v4 +41.34,706,UK,v2 +36.51,498,CAN,v2 +40.4,272,RUS,v4 +36.79,740,CAN,v2 +32.98,318,CAN,v2 +32.24,911,AUS,v3 +24.32,645,UK,v3 +39.32,123,RUS,v4 +29.78,38,UK,v2 +36.03,635,USA,v1 +39.9,84,USA,v1 +38.7,250,AUS,v3 +30.42,233,USA,v1 +32.11,361,CAN,v2 +30.3,291,RUS,v4 +46.69,209,AUS,v3 +41.92,401,CAN,v1 +32.01,251,RUS,v4 +31.66,924,RUS,v4 +31.45,882,RUS,v4 +28.73,522,UK,v2 +38.47,806,CAN,v2 +37.89,778,UK,v2 +43.04,152,UK,v3 +33.36,245,CAN,v1 
+32.07,675,RUS,v4 +25.95,250,AUS,v4 +7.931,279,CAN,v3 +36.71,724,CAN,v1 +34.63,553,RUS,v4 +40.6,494,USA,v1 +39.31,8,UK,v3 +34.92,161,CAN,v2 +45.56,961,AUS,v3 +35.83,948,RUS,v4 +37.67,816,UK,v2 +40.2,198,RUS,v4 +38.48,907,RUS,v4 +34.49,345,AUS,v3 +34.49,453,USA,v1 +27.07,892,AUS,v4 +35.31,871,CAN,v2 +34.08,13,CAN,v1 +35.04,476,USA,v1 +39.42,925,USA,v1 +37.45,243,AUS,v4 +38,208,UK,v2 +34.52,418,RUS,v4 +33.88,532,AUS,v3 +36.6,921,CAN,v1 +35.06,156,CAN,v2 +32.44,164,CAN,v1 +32.9,120,RUS,v4 +33.39,367,AUS,v4 +31.24,304,AUS,v4 +31.44,78,USA,v1 +36.15,639,AUS,v3 +31.42,636,CAN,v2 +38.11,682,USA,v1 +28.97,923,UK,v3 +45.73,745,UK,v3 +28.17,876,USA,v1 +37.74,308,CAN,v1 +2.745,926,CAN,v3 +41.21,196,RUS,v4 +32.35,258,UK,v3 +29.31,263,USA,v1 +36.37,116,AUS,v3 +43.04,528,CAN,v1 +33.92,932,AUS,v4 +39.12,694,AUS,v4 +41.14,718,RUS,v4 +30.69,292,USA,v1 +33.59,628,CAN,v2 +31.34,190,USA,v1 +39.46,144,AUS,v3 +40.46,143,UK,v2 +38.39,689,AUS,v3 +21.48,25,UK,v2 +35.07,25,CAN,v2 +24.56,526,USA,v1 +37.92,931,AUS,v3 +34.2,409,RUS,v4 +31.83,217,UK,v3 +28.09,640,USA,v1 +37.35,247,RUS,v4 +38.68,49,AUS,v3 +41.47,525,UK,v2 +39.52,181,UK,v2 +41.57,161,CAN,v2 +43.66,671,AUS,v3 +37.88,187,AUS,v3 +32.74,764,UK,v3 +35.76,763,CAN,v1 +35.54,416,AUS,v4 +39.08,961,USA,v1 +32.35,391,CAN,v2 +35.89,544,UK,v3 +10.2,54,CAN,v3 +41.31,736,AUS,v3 +39.42,354,USA,v1 +34.43,837,UK,v2 +39.24,278,AUS,v3 +33.98,572,UK,v2 +25.9,722,CAN,v2 +39.46,732,USA,v1 +19.75,548,USA,v1 +40.48,875,RUS,v4 +33,324,AUS,v3 +37.35,237,AUS,v4 +38.04,118,RUS,v4 +31.81,566,CAN,v2 +31.15,148,USA,v1 +40.54,26,UK,v3 +28.85,611,USA,v1 +41.1,939,UK,v3 +33.14,881,USA,v1 +34.61,319,RUS,v4 +31.57,250,UK,v2 +43.54,706,AUS,v3 +30.34,905,USA,v1 +32.98,543,AUS,v4 +17.25,953,CAN,v2 +30.21,675,USA,v1 +37.42,902,CAN,v2 +37.96,146,CAN,v2 +35.28,830,UK,v3 +30.9,290,CAN,v2 +31.95,401,UK,v2 +35.29,353,UK,v3 +27.52,710,USA,v1 +32.7,497,USA,v1 +36.02,844,RUS,v4 +38.03,212,RUS,v4 +38.03,894,RUS,v4 +42.23,491,RUS,v4 +31.27,879,AUS,v3 +35.07,664,USA,v1 
+35.76,717,CAN,v2 +34.98,92,AUS,v4 +45.36,635,RUS,v4 +22.78,558,USA,v1 +34.17,321,USA,v1 +35.29,393,USA,v1 +34.93,154,CAN,v2 +44.51,312,CAN,v1 +35.22,585,UK,v2 +32.87,782,USA,v1 +37,921,CAN,v2 +36.43,608,RUS,v4 +41.3,982,RUS,v4 +26.95,831,UK,v3 +34.99,541,RUS,v4 +42.74,104,AUS,v3 +36.67,423,UK,v2 +45.34,202,AUS,v3 +38.66,971,RUS,v4 +36.33,533,USA,v1 +23.46,420,UK,v3 +37.89,487,CAN,v1 +30.01,168,UK,v2 +42.59,919,USA,v1 +28.59,561,RUS,v4 +32.86,546,UK,v2 +36.74,702,RUS,v4 +32.93,952,USA,v1 +43.78,43,UK,v2 +40.4,395,CAN,v2 +33.19,168,CAN,v2 +27.82,730,RUS,v4 +39.03,571,RUS,v4 +27.37,804,AUS,v3 +28.49,884,USA,v1 +35.17,46,USA,v1 +27.91,41,CAN,v2 +34.55,668,USA,v1 +32.36,655,RUS,v4 +39.18,472,AUS,v3 +45.89,606,RUS,v4 +23.15,864,USA,v1 +35.64,791,USA,v1 +34.55,764,UK,v2 +37.49,13,AUS,v3 +38.64,436,AUS,v4 +32.74,91,UK,v3 +42.18,654,RUS,v4 +40.79,240,USA,v1 +31.09,252,UK,v3 +38.45,893,USA,v1 +32.36,27,USA,v1 +36.58,732,USA,v1 +36.89,685,UK,v3 +33.71,988,CAN,v2 +32.06,522,CAN,v2 +27.7,390,RUS,v4 +40.53,909,UK,v3 +34.67,450,USA,v1 +15.5,470,CAN,v3 +45.03,525,RUS,v4 +35.94,408,USA,v1 +37.55,917,USA,v1 +29.84,610,RUS,v4 +44.28,648,CAN,v1 +36.67,849,CAN,v2 +29.57,247,CAN,v1 +33.46,577,USA,v1 +28.97,553,AUS,v4 +42.51,179,USA,v1 +34.57,375,USA,v1 +30.68,211,CAN,v2 +18.75,778,UK,v3 +39.23,274,AUS,v3 +28.63,958,USA,v1 +37.6,813,USA,v1 +31.53,774,AUS,v3 +17.24,488,USA,v1 +39.28,968,USA,v1 +41.58,420,UK,v3 +33.5,995,AUS,v4 +37.57,517,AUS,v3 +45.18,221,UK,v2 +42.1,690,UK,v2 +31.25,999,UK,v3 +39.55,619,RUS,v4 +34.44,543,USA,v1 +40.19,890,RUS,v4 +35.42,815,RUS,v4 +36.9,170,CAN,v2 +33.2,888,RUS,v4 +32.09,563,USA,v1 +33.49,981,UK,v2 +40.52,484,CAN,v2 +43.23,8,USA,v1 +23.58,336,AUS,v4 +34.89,237,UK,v2 +40.79,150,RUS,v4 +28.13,468,RUS,v4 +36.05,141,UK,v2 +38.55,848,UK,v2 +34.58,769,RUS,v4 +37.96,440,AUS,v3 +37.89,315,RUS,v4 +36.05,653,CAN,v2 +47.86,680,RUS,v4 +37.98,689,AUS,v3 +37.42,314,CAN,v1 +35.33,38,CAN,v1 +28.86,408,AUS,v3 +33.86,3,AUS,v3 +38.43,533,UK,v2 +35.87,96,UK,v2 
+37.46,785,UK,v3 +10.44,453,CAN,v3 +36.26,681,AUS,v3 +34.35,149,UK,v3 +42.33,755,USA,v1 +41.37,201,RUS,v4 +41.48,539,AUS,v3 +36.5,304,AUS,v3 +39.1,259,CAN,v1 +-0.335,729,CAN,v3 +49.22,464,RUS,v4 +50.04,735,RUS,v4 +26.31,582,CAN,v1 +32.73,561,UK,v2 +39.01,567,AUS,v3 +34.1,855,CAN,v2 +26.11,856,UK,v3 +39.03,380,CAN,v2 +32.7,850,USA,v1 +30.34,879,USA,v1 +32.41,74,USA,v1 +35.75,583,UK,v2 +29.23,663,USA,v1 +35.28,866,UK,v2 +42.02,514,UK,v2 +33.84,147,UK,v2 +34.27,549,AUS,v3 +38.32,168,UK,v3 +40.92,838,RUS,v4 +18.85,427,CAN,v3 +38.83,623,RUS,v4 +35.66,903,AUS,v3 +27.11,713,CAN,v2 +29.53,664,USA,v1 +35.63,751,UK,v2 +36,571,UK,v3 +33.85,27,RUS,v4 +33.08,3,AUS,v3 +42.53,462,UK,v3 +27.03,870,UK,v3 +44.08,737,UK,v2 +35.12,320,AUS,v3 +37.19,668,USA,v1 +44.71,610,AUS,v3 +26.76,345,RUS,v4 +30.96,725,CAN,v2 +44.81,943,UK,v3 +35.33,850,USA,v1 +31.13,438,RUS,v4 +34.3,525,USA,v1 +29.8,594,CAN,v2 +30.67,95,AUS,v4 +33.04,39,AUS,v4 +25.02,234,UK,v2 +32.96,794,UK,v3 +40.8,455,RUS,v4 +38.66,73,AUS,v3 +36.7,376,USA,v1 +39.98,27,RUS,v4 +18.54,604,CAN,v3 +36.47,542,CAN,v1 +39.54,856,AUS,v3 +40.54,685,UK,v3 +35.21,481,AUS,v3 +37.35,877,AUS,v3 +36.75,308,UK,v3 +37.09,639,AUS,v4 +37.34,691,RUS,v4 +32.87,232,UK,v2 +33.45,188,CAN,v2 +38.84,302,UK,v3 +36.76,307,USA,v1 +36.97,865,UK,v2 +32.5,894,USA,v1 +37.33,340,RUS,v4 +28.98,454,UK,v2 +44.99,915,USA,v1 +28.18,588,RUS,v4 +39.26,817,AUS,v3 +37.95,589,RUS,v4 +38.45,389,CAN,v2 +37.84,371,CAN,v2 +33.66,745,AUS,v4 +33.02,875,AUS,v3 +39.6,189,USA,v1 +41.51,3,USA,v1 +34.32,345,CAN,v2 +36.48,706,USA,v1 +32.21,528,CAN,v1 +29.76,68,AUS,v3 +32.3,270,AUS,v3 +22.74,102,AUS,v4 +38.9,294,UK,v2 +3.703,392,CAN,v3 +29.29,367,USA,v1 +40.23,539,UK,v2 +35.64,905,CAN,v1 +30.23,846,UK,v2 +37.3,332,UK,v3 +33.36,657,AUS,v3 +36.61,636,USA,v1 +35.36,891,AUS,v3 +38.96,702,USA,v1 +39.14,723,USA,v1 +28.83,159,RUS,v4 +30.67,419,AUS,v3 +44.18,283,RUS,v4 +43.59,900,CAN,v2 +31.63,920,RUS,v4 +31.98,81,RUS,v4 +31.62,632,UK,v2 +36.87,271,CAN,v2 +43.18,482,USA,v1 +41.3,82,RUS,v4 
+45.08,747,UK,v3 +37.89,553,USA,v1 +28.11,655,UK,v3 +33.77,348,UK,v2 +40.6,574,USA,v1 +37.09,173,RUS,v4 +29.13,239,RUS,v4 +35.07,224,AUS,v3 +41.33,622,UK,v2 +35.17,867,USA,v1 +35.61,175,CAN,v1 +35.44,968,UK,v3 +33.83,898,CAN,v1 +32.36,119,CAN,v1 +29.69,608,UK,v3 +22.81,136,USA,v1 +39.85,162,UK,v2 +33.12,113,USA,v1 +41.27,167,UK,v2 +31.76,397,USA,v1 +30.25,723,AUS,v3 +26.59,656,CAN,v2 +36.88,656,CAN,v2 +32.72,799,USA,v1 +20.34,24,CAN,v2 +35.62,334,CAN,v2 +37.34,56,AUS,v3 +36.61,364,CAN,v2 +32.11,507,CAN,v1 +46.61,684,AUS,v4 +33.5,700,AUS,v3 +31.18,395,UK,v3 +40.9,943,AUS,v3 +29.65,998,UK,v3 +29.23,714,CAN,v2 +18.72,963,CAN,v1 +32.83,933,UK,v2 +32.93,239,CAN,v2 +38.29,563,CAN,v2 +37.13,262,AUS,v3 +26.56,557,USA,v1 +30.63,660,UK,v3 +34.8,635,UK,v2 +38.14,875,CAN,v2 +36.36,836,UK,v2 +36.97,346,USA,v1 +32.94,883,RUS,v4 +26.78,478,USA,v1 +26.28,168,AUS,v3 +44.68,125,CAN,v2 +32.89,344,AUS,v4 +40.26,654,CAN,v2 +36.25,656,AUS,v3 +37.08,983,CAN,v2 +29.42,785,RUS,v4 +27.72,507,USA,v1 +32.53,680,USA,v1 +28.51,756,RUS,v4 +39.49,979,USA,v1 +42.28,107,CAN,v2 +38.78,392,USA,v1 +39.64,480,CAN,v2 +37.62,544,AUS,v3 +41.93,19,UK,v3 +41.29,46,AUS,v3 +41.83,163,CAN,v2 +34.65,861,UK,v2 +41.7,515,RUS,v4 +29.96,282,RUS,v4 +32.93,614,USA,v1 +37.53,928,USA,v1 +24.69,234,AUS,v4 +36.05,580,USA,v1 +39.25,161,CAN,v1 +34.11,789,AUS,v3 +36.36,886,CAN,v2 +38.06,695,AUS,v4 +36.64,626,AUS,v3 +27.98,534,USA,v1 +34.96,755,CAN,v2 +35.72,744,RUS,v4 +29.9,440,CAN,v2 +29.99,647,RUS,v4 +39.3,594,CAN,v1 +38.34,575,AUS,v3 +34.8,946,CAN,v2 +28.81,113,USA,v1 +32.16,81,USA,v1 +34.89,910,USA,v1 +32.99,998,RUS,v4 +28.98,161,RUS,v4 +33.69,803,CAN,v2 +37.38,481,CAN,v2 +38.07,773,AUS,v3 +36.37,726,RUS,v4 +37.94,806,CAN,v2 +33.05,808,RUS,v4 +32.51,515,RUS,v4 +28.72,798,UK,v3 +35.52,491,UK,v2 +38.38,215,RUS,v4 +33.93,855,CAN,v2 +30.28,793,AUS,v3 +28.71,534,AUS,v3 +38.35,540,CAN,v2 +46.33,838,RUS,v4 +39.41,96,UK,v3 +32.11,280,UK,v3 +35.37,192,CAN,v2 +32.6,572,UK,v2 +34.14,103,AUS,v3 +33.47,549,CAN,v2 +30.83,346,AUS,v4 
+29.72,494,USA,v1 +30.43,719,CAN,v1 +35.21,462,RUS,v4 +37.79,585,UK,v2 +35.26,937,CAN,v2 +34.75,699,USA,v1 +35.67,592,CAN,v2 +31.03,221,USA,v1 +42.85,346,UK,v3 +7.104,80,CAN,v3 +25.59,788,CAN,v1 +35.45,147,AUS,v4 +37.3,737,AUS,v3 +41.19,740,USA,v1 +38.26,433,UK,v3 +29.97,627,AUS,v4 +46.54,429,AUS,v4 +30.85,589,UK,v3 +33.44,361,USA,v1 +41.29,110,USA,v1 +35.93,471,CAN,v1 +31.17,962,CAN,v2 +40.01,877,UK,v2 +25.55,597,USA,v1 +29.75,28,UK,v3 +40.3,625,AUS,v4 +43.52,531,USA,v1 +32.01,124,USA,v1 +35.33,67,CAN,v2 +31.9,71,UK,v2 +28.64,436,USA,v1 +31.8,259,RUS,v4 +34.58,537,AUS,v4 +31.33,506,AUS,v3 +41.15,946,USA,v1 +36.54,737,CAN,v2 +35.14,270,UK,v2 +39.35,218,RUS,v4 +35.08,245,AUS,v3 +30.57,789,USA,v1 +31.25,150,USA,v1 +36.08,257,CAN,v2 +36.08,774,UK,v3 +36.75,179,UK,v3 +30.62,657,UK,v3 +40.81,852,USA,v1 +31.02,430,RUS,v4 +35.22,645,AUS,v3 +35.09,295,UK,v2 +36.93,643,UK,v2 +37.7,244,CAN,v2 +30.01,713,RUS,v4 +33.03,921,USA,v1 +45.22,720,USA,v1 +30.33,546,UK,v3 +32.6,640,USA,v1 +36.85,639,UK,v2 +34.73,643,CAN,v2 +35.17,513,RUS,v4 +35.36,313,UK,v3 +24.03,567,USA,v1 +33.71,463,UK,v3 +37.08,136,USA,v1 +29.56,756,RUS,v4 +31.19,280,AUS,v3 +38.53,65,CAN,v1 +27.91,220,RUS,v4 +34.06,807,USA,v1 +26.32,121,CAN,v1 +31.34,910,UK,v3 +12.6,524,CAN,v3 +34.43,594,AUS,v3 +33.24,277,AUS,v3 +38.99,485,USA,v1 +37.51,223,RUS,v4 +39.73,7,AUS,v3 +31.61,102,CAN,v2 +31.12,446,RUS,v4 +28.29,829,CAN,v2 +27.41,777,USA,v1 +32.02,14,USA,v1 +23.9,982,USA,v1 +37.57,239,RUS,v4 +40.54,300,RUS,v4 +30.39,549,CAN,v1 +35.34,312,AUS,v3 +31.18,982,AUS,v3 +37.07,764,AUS,v4 +33.75,375,AUS,v3 +31.69,517,AUS,v3 +43.2,122,RUS,v4 +39.18,313,RUS,v4 +40.27,971,CAN,v1 +44.25,871,RUS,v4 +30.33,377,RUS,v4 +44.73,997,RUS,v4 +37.74,848,USA,v1 +33.18,881,RUS,v4 +36.29,768,USA,v1 +30.04,86,CAN,v1 +29.44,124,UK,v3 +38.15,430,RUS,v4 +37.05,960,UK,v3 +29.52,196,AUS,v3 +32.36,141,RUS,v4 +35.63,189,CAN,v2 +43.15,848,RUS,v4 +47.17,317,CAN,v2 +42.37,813,RUS,v4 +0.9782,809,CAN,v3 +31.24,510,AUS,v3 +35.72,178,CAN,v2 +30.72,151,AUS,v3 
+39.9,208,UK,v3 +39.81,188,RUS,v4 +32.46,537,CAN,v2 +36.04,320,UK,v2 +34.71,742,USA,v1 +32.92,998,UK,v2 +33.48,652,USA,v1 +34.26,106,UK,v2 +31.53,761,USA,v1 +26.81,377,USA,v1 +42.38,630,CAN,v1 +35.96,108,UK,v3 +26.69,699,CAN,v1 +41.35,67,UK,v2 +33.21,796,AUS,v4 +41.3,523,USA,v1 +35.31,406,UK,v2 +33.04,401,AUS,v3 +28.73,61,AUS,v3 +38.88,200,RUS,v4 +35.26,195,CAN,v2 +33.46,135,AUS,v3 +40.29,476,USA,v1 +31.84,178,RUS,v4 +38.45,320,RUS,v4 +40.96,115,RUS,v4 +46.4,787,AUS,v3 +43.37,125,CAN,v2 +30.92,751,AUS,v4 +47.89,642,AUS,v3 +34.34,386,AUS,v3 +44.67,539,RUS,v4 +34.05,752,AUS,v3 +40.36,549,RUS,v4 +36.6,638,AUS,v4 +33.23,393,AUS,v3 +27.97,581,UK,v2 +32.92,478,USA,v1 +36.9,742,RUS,v4 +33.61,422,AUS,v3 +25.34,228,UK,v3 +29.86,658,RUS,v4 +40.78,770,CAN,v2 +39.52,512,RUS,v4 +33.67,292,USA,v1 +23.99,468,UK,v3 +45.69,505,UK,v2 +30.27,508,RUS,v4 +32.21,373,USA,v1 +40.09,360,CAN,v2 +46.49,976,UK,v2 +32.69,86,RUS,v4 +29.12,69,UK,v3 +39.87,946,UK,v3 +36.95,465,UK,v2 +32.18,304,RUS,v4 +35.74,936,CAN,v2 +38.01,227,USA,v1 +34.21,86,RUS,v4 +32.62,981,UK,v2 +30.92,385,AUS,v4 +36.39,873,RUS,v4 +39.71,84,CAN,v2 +31.89,904,AUS,v3 +36.05,748,AUS,v3 +30.98,630,AUS,v3 +41.47,745,UK,v2 +31.29,673,UK,v2 +29.02,110,USA,v1 +31.59,650,CAN,v2 +28.13,607,RUS,v4 +42.24,435,USA,v1 +36.43,282,CAN,v2 +33.62,171,CAN,v2 +33.58,268,RUS,v4 +27.26,91,AUS,v3 +30.99,622,CAN,v2 +38.2,645,RUS,v4 +31.56,647,RUS,v4 +32.93,246,RUS,v4 +40.68,863,USA,v1 +44.27,197,USA,v1 +32.94,832,USA,v1 +37.29,507,RUS,v4 +31.17,570,CAN,v2 +28.53,497,RUS,v4 +33.75,194,CAN,v2 +8.413,489,CAN,v3 +45.43,63,USA,v1 +35.49,314,UK,v3 +32,216,RUS,v4 +31.13,712,UK,v2 +35.14,414,UK,v3 +36.25,506,RUS,v4 +36.31,53,RUS,v4 +34.86,337,AUS,v3 +35.41,899,AUS,v4 +33.82,325,UK,v2 +41.47,60,RUS,v4 +36.78,851,USA,v1 +25.05,817,CAN,v2 +32.92,541,AUS,v4 +35.4,380,AUS,v3 +28.58,150,AUS,v4 +30.45,266,CAN,v2 +31.31,545,AUS,v3 +42.11,234,USA,v1 +32.62,971,CAN,v2 +33.73,501,USA,v1 +34.71,678,AUS,v3 +38.7,102,AUS,v3 +32.84,155,UK,v2 +32.24,909,UK,v2 
+32.48,357,CAN,v2 +36.83,567,CAN,v2 +26.29,142,CAN,v1 +29.71,112,AUS,v3 +34.55,111,RUS,v4 +33.66,672,CAN,v2 +38.11,110,UK,v2 +43.57,902,USA,v1 +34.35,670,CAN,v2 +46.82,514,RUS,v4 +37.52,625,RUS,v4 +32.22,602,RUS,v4 +37.24,566,CAN,v2 +34.54,439,RUS,v4 +35.07,406,AUS,v3 +35.29,672,AUS,v4 +28.71,55,UK,v3 +36.34,659,USA,v1 +34.52,756,USA,v1 +39.01,164,RUS,v4 +38.44,704,USA,v1 +30.89,28,RUS,v4 +40.47,405,AUS,v3 +25.95,787,USA,v1 +30.93,444,CAN,v2 +37.11,754,AUS,v3 +40.18,216,USA,v1 +39.43,444,RUS,v4 +32.82,765,UK,v2 +35.12,871,UK,v3 +33.94,366,USA,v1 +30.6,332,UK,v2 +27.51,615,AUS,v3 +32.29,125,UK,v2 +36.71,74,RUS,v4 +36.52,984,CAN,v2 +28.13,464,USA,v1 +32.24,430,CAN,v2 +30.95,9,UK,v3 +33.8,996,USA,v1 +33.73,190,USA,v1 +26.14,861,UK,v3 +36.9,619,UK,v3 +38.3,245,CAN,v2 +37.21,890,CAN,v2 +37.5,549,USA,v1 +32.21,229,RUS,v4 +34.68,415,RUS,v4 +36.59,373,USA,v1 +34.97,110,RUS,v4 +42.41,603,UK,v3 +44.69,557,RUS,v4 +31.37,434,AUS,v4 +40.06,116,CAN,v2 +38.82,400,USA,v1 +34.73,666,CAN,v1 +35.77,546,CAN,v1 +34.76,723,RUS,v4 +40.81,884,USA,v1 +42.37,670,AUS,v3 +31.97,342,USA,v1 +40.76,525,AUS,v4 +34.17,797,AUS,v3 +37.21,282,UK,v2 +30.62,929,AUS,v3 +33.97,976,USA,v1 +41.39,297,UK,v3 +37.2,564,USA,v1 +31.28,963,USA,v1 +3.794,991,CAN,v3 +30.49,727,CAN,v2 +39.95,545,AUS,v4 +36.59,67,UK,v2 +30.83,48,AUS,v3 +38.45,223,USA,v1 +36.45,928,CAN,v2 +37.39,562,RUS,v4 +40.23,817,UK,v3 +33.43,7,CAN,v2 +37.43,296,AUS,v3 +38.6,305,RUS,v4 +28.13,319,RUS,v4 +39.75,639,RUS,v4 +31.05,716,UK,v2 +34.19,361,CAN,v1 +44,130,UK,v2 +43.23,198,AUS,v3 +36.64,45,RUS,v4 +29.31,900,CAN,v2 +40.86,66,CAN,v2 +36.51,534,AUS,v3 +31.01,857,RUS,v4 +29.45,321,RUS,v4 +38.19,295,CAN,v2 +30.56,379,RUS,v4 +40.05,949,UK,v3 +34.21,792,RUS,v4 +36.38,237,UK,v2 +27.86,63,CAN,v1 +26.67,91,RUS,v4 +37.84,733,USA,v1 +40.96,198,CAN,v2 +37.66,698,RUS,v4 +37.79,547,USA,v1 +35.15,6,CAN,v2 +32.26,672,AUS,v3 +31.97,527,USA,v1 +34.03,56,RUS,v4 +28.88,664,UK,v3 +24.14,314,UK,v3 +31.92,422,UK,v2 +4.815,635,CAN,v3 +33.56,941,CAN,v2 
+34.88,955,AUS,v3 +37.76,761,UK,v3 +33.27,788,UK,v2 +31.17,731,RUS,v4 +27.81,39,RUS,v4 +43.63,771,CAN,v2 +38.98,332,AUS,v3 +28.98,48,AUS,v3 +41.24,921,UK,v2 +34.35,785,USA,v1 +33.16,450,RUS,v4 +34.51,233,AUS,v4 +33.96,775,AUS,v4 +30.58,908,AUS,v4 +29,465,AUS,v3 +30.72,483,RUS,v4 +43.39,545,AUS,v3 +35.11,559,USA,v1 +41.19,511,CAN,v2 +31.12,398,UK,v3 +31.08,750,CAN,v2 +29.28,768,UK,v3 +30.16,531,USA,v1 +41.04,122,AUS,v3 +24.75,290,CAN,v2 +34.82,973,CAN,v2 +30.26,567,AUS,v3 +32.45,174,RUS,v4 +43.99,833,RUS,v4 +33.11,189,USA,v1 +37.63,903,UK,v3 +29.24,272,RUS,v4 +31.41,439,CAN,v1 +31.67,134,CAN,v1 +31.66,711,USA,v1 +38.71,870,CAN,v2 +42.59,848,RUS,v4 +32.74,668,AUS,v3 +29.58,952,RUS,v4 +33.36,493,CAN,v2 +29.83,443,UK,v3 +32.95,414,RUS,v4 +36.15,16,UK,v3 +29.93,872,USA,v1 +32.34,155,CAN,v2 +30.44,788,UK,v2 +37.78,196,USA,v1 +36.5,400,USA,v1 +31.84,702,RUS,v4 +37.73,80,AUS,v3 +37.41,87,UK,v2 +37.53,820,USA,v1 +29.63,914,USA,v1 +35.65,613,RUS,v4 +36.52,792,RUS,v4 +35.46,953,USA,v1 +28.08,651,RUS,v4 +36.33,346,RUS,v4 +29.57,921,USA,v1 +39.43,864,UK,v3 +29.35,401,AUS,v3 +29.71,236,UK,v2 +37.16,223,UK,v2 +30.08,379,RUS,v4 +34.92,319,CAN,v2 +32.97,905,USA,v1 +37.57,442,CAN,v2 +40.32,522,UK,v2 +33.13,444,CAN,v2 +33.6,257,USA,v1 +37.43,60,AUS,v3 +34.64,345,UK,v3 +35.52,449,CAN,v2 +40.95,244,USA,v1 +36.78,766,AUS,v3 +38.78,765,AUS,v3 +27.63,880,UK,v3 +30.82,707,UK,v2 +38.49,707,UK,v3 +42.34,806,RUS,v4 +29.3,221,RUS,v4 +27.02,561,USA,v1 +28.68,865,RUS,v4 +6.395,470,CAN,v3 +33.3,533,CAN,v2 +11.94,255,CAN,v3 +39.17,206,AUS,v3 +37.37,754,UK,v3 +38.65,501,USA,v1 +24.02,84,CAN,v2 +34.22,302,RUS,v4 +32.55,206,CAN,v2 +29.63,494,USA,v1 +28.42,585,CAN,v2 +41.77,227,CAN,v2 +28.1,893,RUS,v4 +37.86,594,UK,v2 +41.11,594,CAN,v2 +34.56,585,AUS,v4 +42.94,601,RUS,v4 +40.76,602,UK,v3 +36.73,299,CAN,v2 +40.35,999,RUS,v4 +29.14,254,CAN,v1 +36.38,102,USA,v1 +37.61,198,RUS,v4 +30.6,915,AUS,v3 +35.62,551,AUS,v3 +38.21,467,AUS,v3 +43.68,657,AUS,v3 +31.78,400,AUS,v3 +34.7,376,CAN,v1 +35.38,342,USA,v1 
+24.02,657,USA,v1 +33.21,856,AUS,v3 +38.8,312,UK,v2 +32.42,166,AUS,v3 +38.28,720,USA,v1 +32.53,580,AUS,v4 +28.86,425,CAN,v2 +27.55,495,RUS,v4 +32.34,465,USA,v1 +35.3,517,UK,v3 +45.42,464,UK,v3 +45.11,836,CAN,v2 +41.88,141,AUS,v3 +37.68,764,CAN,v2 +37.36,936,USA,v1 +30.69,216,AUS,v3 +32.11,248,RUS,v4 +41.17,644,RUS,v4 +40.11,497,RUS,v4 +34.42,247,USA,v1 +31.76,864,CAN,v1 diff --git a/lib/src/test/resources/tiny.csv b/lib/src/test/resources/tiny.csv new file mode 100644 index 000000000..0d22cfcfa --- /dev/null +++ b/lib/src/test/resources/tiny.csv @@ -0,0 +1,4 @@ +usage,version,location +2.0,27,USA +3.1,27,CAN +4.0,28,USA \ No newline at end of file diff --git a/pom.xml b/pom.xml index fd7777c5f..8fcc964ce 100644 --- a/pom.xml +++ b/pom.xml @@ -1,12 +1,34 @@ 4.0.0 - macrobase + edu.stanford.futuredata macrobase pom 0.1-SNAPSHOT macrobase - http://maven.apache.org + https://github.com/stanford-futuredata/macrobase + MacroBase is an anomaly detection engine designed to prioritize human attention in large-scale datasets and data streams. + + + Apache License, Version 2.0 + http://www.apache.org/licenses/LICENSE-2.0.txt + repo + + + + + Peter Bailis + pbailis@cs.stanford.edu + Stanford + http://www.bailis.org + + + + scm:git:git://github.com/stanford-futuredata/macrobase.git + scm:git:ssh://github.com/stanford-futuredata/macrobase.git + https://github.com/stanford-futuredata/macrobase + + UTF-8 UTF-8 @@ -16,9 +38,10 @@ assembly - core + legacy frontend contrib + lib @@ -28,29 +51,9 @@ 4.12 test - - io.dropwizard - dropwizard-core - ${dropwizard.version} - - - io.dropwizard - dropwizard-assets - ${dropwizard.version} - - - io.dropwizard - dropwizard-logging - ${dropwizard.version} - - - io.dropwizard - dropwizard-db - ${dropwizard.version} - - + org.apache.maven.plugins