diff --git a/conf/CountMeanShift.json b/conf/CountMeanShift.json new file mode 100644 index 000000000..4559f32f8 --- /dev/null +++ b/conf/CountMeanShift.json @@ -0,0 +1,19 @@ +{ + "pipeline": "CubePipeline", + "inputURI": "csv://lib/src/test/resources/sample_cubedshift.csv", + "classifier": "countmeanshift", + "metric": "time", + "predicate": "==", + "cutoff": "1", + "meanColumn": "meanLatency", + "countColumn": "count", + "summarizer": "countmeanshift", + "attributes": [ + "location", + "version", + "language", + ], + "meanShiftRatio": 1.1, + "minSupport": 0.05, + "numThreads": 1 +} diff --git a/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/BasicBatchPipeline.java b/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/BasicBatchPipeline.java index 421594ef2..ee3a3dd74 100644 --- a/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/BasicBatchPipeline.java +++ b/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/BasicBatchPipeline.java @@ -1,9 +1,9 @@ package edu.stanford.futuredata.macrobase.pipeline; -import edu.stanford.futuredata.macrobase.analysis.classify.Classifier; -import edu.stanford.futuredata.macrobase.analysis.classify.PercentileClassifier; -import edu.stanford.futuredata.macrobase.analysis.classify.PredicateClassifier; +import com.fasterxml.jackson.databind.jsonschema.SchemaAware; +import edu.stanford.futuredata.macrobase.analysis.classify.*; import edu.stanford.futuredata.macrobase.analysis.summary.Explanation; +import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.APLCountMeanShiftSummarizer; import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.APLOutlierSummarizer; import edu.stanford.futuredata.macrobase.analysis.summary.BatchSummarizer; import edu.stanford.futuredata.macrobase.analysis.summary.fpg.FPGrowthSummarizer; @@ -27,6 +27,7 @@ public class BasicBatchPipeline implements Pipeline { private String classifierType; private String metric; private double cutoff; + private Optional meanColumn; private String strCutoff; private boolean isStrPredicate; private boolean pctileHigh; @@ -39,6 +40,7 @@ public class BasicBatchPipeline implements Pipeline { private String ratioMetric; private double minSupport; private double minRiskRatio; + private double meanShiftRatio; public BasicBatchPipeline (PipelineConfig conf) { @@ -47,7 +49,7 @@ public BasicBatchPipeline (PipelineConfig conf) { classifierType = conf.get("classifier", "percentile"); metric = conf.get("metric"); - if (classifierType.equals("predicate")) { + if (classifierType.equals("predicate") || classifierType.equals("countmeanshift")){ Object rawCutoff = conf.get("cutoff"); isStrPredicate = rawCutoff instanceof String; if (isStrPredicate) { @@ -70,6 +72,8 @@ public BasicBatchPipeline (PipelineConfig conf) { minRiskRatio = conf.get("minRatioMetric", 3.0); minSupport = conf.get("minSupport", 0.01); numThreads = conf.get("numThreads", Runtime.getRuntime().availableProcessors()); + meanColumn = Optional.ofNullable(conf.get("meanColumn")); + meanShiftRatio = conf.get("meanShiftRatio", 1.0); } public Classifier getClassifier() throws MacroBaseException { @@ -81,6 +85,21 @@ public Classifier getClassifier() throws MacroBaseException { classifier.setIncludeLow(pctileLow); return classifier; } + case "countmeanshift": { + if (isStrPredicate) { + return new CountMeanShiftClassifier( + metric, + meanColumn.orElseThrow( + () -> new MacroBaseException("mean column not present in config")), predicateStr, + strCutoff); + } else { + return new CountMeanShiftClassifier( + metric, + meanColumn.orElseThrow( + () -> new MacroBaseException("mean column not present in config")), predicateStr, + cutoff); + } + } case "predicate": { if (isStrPredicate){ PredicateClassifier classifier = new PredicateClassifier(metric, predicateStr, strCutoff); @@ -116,6 +135,14 @@ public BatchSummarizer getSummarizer(String outlierColumnName) throws MacroBaseE summarizer.setNumThreads(numThreads); return summarizer; } + case "countmeanshift": { + APLCountMeanShiftSummarizer summarizer = new APLCountMeanShiftSummarizer(); + summarizer.setAttributes(attributes); + summarizer.setMinSupport(minSupport); + summarizer.setMinMeanShift(meanShiftRatio); + summarizer.setNumThreads(numThreads); + return summarizer; + } default: { throw new MacroBaseException("Bad Summarizer Type"); } @@ -131,6 +158,11 @@ public DataFrame loadData() throws Exception { colTypes.put(metric, Schema.ColType.DOUBLE); } List requiredColumns = new ArrayList<>(attributes); + if (meanColumn.isPresent()) { + colTypes.put(meanColumn.get(), Schema.ColType.DOUBLE); + requiredColumns.add(meanColumn.get()); + + } requiredColumns.add(metric); return PipelineUtils.loadDataFrame(inputURI, colTypes, requiredColumns); } diff --git a/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/CubePipeline.java b/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/CubePipeline.java index 9c20b9757..5454e89e1 100644 --- a/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/CubePipeline.java +++ b/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/CubePipeline.java @@ -1,14 +1,7 @@ package edu.stanford.futuredata.macrobase.pipeline; -import edu.stanford.futuredata.macrobase.analysis.classify.ArithmeticClassifier; -import edu.stanford.futuredata.macrobase.analysis.classify.CubeClassifier; -import edu.stanford.futuredata.macrobase.analysis.classify.PredicateCubeClassifier; -import edu.stanford.futuredata.macrobase.analysis.classify.QuantileClassifier; -import edu.stanford.futuredata.macrobase.analysis.classify.RawClassifier; -import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.APLExplanation; -import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.APLMeanSummarizer; -import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.APLOutlierSummarizer; -import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.APLSummarizer; +import edu.stanford.futuredata.macrobase.analysis.classify.*; +import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.*; import edu.stanford.futuredata.macrobase.datamodel.DataFrame; import edu.stanford.futuredata.macrobase.datamodel.Schema; import edu.stanford.futuredata.macrobase.ingest.CSVDataFrameWriter; @@ -57,6 +50,7 @@ public class CubePipeline implements Pipeline { private List attributes; private double minSupport; private double minRatioMetric; + private double meanShiftRatio; private boolean debugDump; @@ -69,7 +63,7 @@ public CubePipeline(PipelineConfig conf) { classifierType = conf.get("classifier", "arithmetic"); countColumn = conf.get("countColumn", "count"); - if (classifierType.equals("predicate")) { + if (classifierType.equals("predicate") || classifierType.equals("countmeanshift")){ Object rawCutoff = conf.get("cutoff"); isStrPredicate = rawCutoff instanceof String; if (isStrPredicate) { @@ -94,6 +88,7 @@ public CubePipeline(PipelineConfig conf) { attributes = conf.get("attributes"); minSupport = conf.get("minSupport", 3.0); minRatioMetric = conf.get("minRatioMetric", 0.01); + meanShiftRatio = conf.get("meanShiftRatio", 1.0); numThreads = conf.get("numThreads", Runtime.getRuntime().availableProcessors()); debugDump = conf.get("debugDump", false); @@ -147,6 +142,20 @@ private Map getColTypes() throws MacroBaseException { Map colTypes = new HashMap<>(); colTypes.put(countColumn, Schema.ColType.DOUBLE); switch (classifierType) { + case "countmeanshift": + if (isStrPredicate) { + colTypes.put(metric.orElseThrow( + () -> new MacroBaseException("metric column not present in config")), + Schema.ColType.STRING); + } else { + colTypes.put(metric.orElseThrow( + () -> new MacroBaseException("metric column not present in config")), + Schema.ColType.DOUBLE); + } + colTypes.put(meanColumn + .orElseThrow(() -> new MacroBaseException("mean column not present in config")), + Schema.ColType.DOUBLE); + return colTypes; case "meanshift": case "arithmetic": { colTypes.put(meanColumn @@ -187,6 +196,23 @@ private Map getColTypes() throws MacroBaseException { private CubeClassifier getClassifier() throws MacroBaseException { switch (classifierType) { + case "countmeanshift": { + if (isStrPredicate) { + return new CountMeanShiftCubedClassifier(countColumn, + metric.orElseThrow( + () -> new MacroBaseException("metric column not present in config")), + meanColumn.orElseThrow( + () -> new MacroBaseException("mean column not present in config")), predicateStr, + strCutoff); + } else { + return new CountMeanShiftCubedClassifier(countColumn, + metric.orElseThrow( + () -> new MacroBaseException("metric column not present in config")), + meanColumn.orElseThrow( + () -> new MacroBaseException("mean column not present in config")), predicateStr, + cutoff); + } + } case "arithmetic": { ArithmeticClassifier classifier = new ArithmeticClassifier(countColumn, meanColumn.orElseThrow( @@ -234,6 +260,14 @@ private CubeClassifier getClassifier() throws MacroBaseException { private APLSummarizer getSummarizer(CubeClassifier classifier) throws Exception { switch (classifierType) { + case "countmeanshift": { + APLCountMeanShiftSummarizer summarizer = new APLCountMeanShiftSummarizer(); + summarizer.setAttributes(attributes); + summarizer.setMinSupport(minSupport); + summarizer.setMinMeanShift(meanShiftRatio); + summarizer.setNumThreads(numThreads); + return summarizer; + } case "meanshift": { APLMeanSummarizer summarizer = new APLMeanSummarizer(); summarizer.setCountColumn(countColumn); @@ -244,6 +278,7 @@ private APLSummarizer getSummarizer(CubeClassifier classifier) throws Exception summarizer.setAttributes(attributes); summarizer.setMinSupport(minSupport); summarizer.setMinStdDev(minRatioMetric); + summarizer.setNumThreads(numThreads); return summarizer; } default: { diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/classify/CountMeanShiftClassifier.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/classify/CountMeanShiftClassifier.java new file mode 100644 index 000000000..b98f003b1 --- /dev/null +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/classify/CountMeanShiftClassifier.java @@ -0,0 +1,103 @@ +package edu.stanford.futuredata.macrobase.analysis.classify; + +import edu.stanford.futuredata.macrobase.analysis.classify.stats.MBPredicate; +import edu.stanford.futuredata.macrobase.datamodel.DataFrame; +import edu.stanford.futuredata.macrobase.util.MacroBaseException; + +import java.util.function.DoublePredicate; +import java.util.function.Predicate; + +public class CountMeanShiftClassifier extends Classifier { + private Predicate strPredicate; + private DoublePredicate doublePredicate; + private DataFrame output; + private String metricColumnName; + private String meanColumnName; + private boolean isStrPredicate; + public static String outlierCountColumnName = "_OUTLIERCOUNT"; + public static String inlierCountColumnName = "_INLIERCOUNT"; + public static String outlierMeanSumColumnName = "_OUTLIERMEANSUM"; + public static String inlierMeanSumColumnName = "_INLIERMEANSUM"; + + /** + * @param metricColumnName Column on which to classify outliers + * @param meanColumnName Column containing means whose shifts will be explained + * @param predicateStr Predicate used for classification: "==" or "!=" + * @param sentinel String sentinel value used when evaluating the predicate to determine outlier + * @throws MacroBaseException + */ + public CountMeanShiftClassifier( + final String metricColumnName, + final String meanColumnName, + final String predicateStr, + final String sentinel + ) throws MacroBaseException { + super(meanColumnName); + this.metricColumnName = metricColumnName; + this.meanColumnName = meanColumnName; + this.strPredicate = MBPredicate.getStrPredicate(predicateStr, sentinel); + this.isStrPredicate = true; + } + + /** + * @param metricColumnName Column on which to classify outliers + * @param meanColumnName Column containing means whose shifts will be explained + * @param predicateStr Predicate used for classification: "==", "!=", "<", ">", "<=", or ">=" + * @param sentinel Double sentinel value used when evaluating the predicate to determine outlier + */ + public CountMeanShiftClassifier( + final String metricColumnName, + final String meanColumnName, + final String predicateStr, + final double sentinel + ) throws MacroBaseException { + super(meanColumnName); + this.metricColumnName = metricColumnName; + this.meanColumnName = meanColumnName; + this.doublePredicate = MBPredicate.getDoublePredicate(predicateStr, sentinel); + this.isStrPredicate = false; + } + + /** + * Scan through the metric column, and evaluate the predicate on every value in the column. The ``input'' DataFrame + * remains unmodified; a copy is created and all modifications are made on the copy. Then store counts and + * meancounts for both outliers and inliers. + * @throws Exception + */ + @Override + public void process(DataFrame input) throws Exception { + String[] stringMetrics = null; + if (isStrPredicate) + stringMetrics = input.getStringColumnByName(metricColumnName); + double[] doubleMetrics = null; + if (!isStrPredicate) + doubleMetrics = input.getDoubleColumnByName(metricColumnName); + output = input.copy(); + double[] totalMeanColumn = input.getDoubleColumnByName(meanColumnName); + int len = totalMeanColumn.length; + double[] outlierCountColumn = new double[len]; + double[] inlierCountColumn = new double[len]; + double[] outlierMeanColumn = new double[len]; + double[] inlierMeanColumn = new double[len]; + for (int i = 0; i < len; i++) { + if ((isStrPredicate && strPredicate.test(stringMetrics[i])) || + (!isStrPredicate && doublePredicate.test(doubleMetrics[i]))) { + outlierCountColumn[i] = 1.0; + outlierMeanColumn[i] = totalMeanColumn[i]; + } else { + inlierCountColumn[i] = 1.0; + inlierMeanColumn[i] = totalMeanColumn[i]; + } + } + output.addColumn(outlierCountColumnName, outlierCountColumn); + output.addColumn(inlierCountColumnName, inlierCountColumn); + output.addColumn(outlierMeanSumColumnName, outlierMeanColumn); + output.addColumn(inlierMeanSumColumnName, inlierMeanColumn); + } + + + @Override + public DataFrame getResults() { + return output; + } +} diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/classify/CountMeanShiftCubedClassifier.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/classify/CountMeanShiftCubedClassifier.java new file mode 100644 index 000000000..1d1f5bc8c --- /dev/null +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/classify/CountMeanShiftCubedClassifier.java @@ -0,0 +1,104 @@ +package edu.stanford.futuredata.macrobase.analysis.classify; + +import edu.stanford.futuredata.macrobase.analysis.classify.stats.MBPredicate; +import edu.stanford.futuredata.macrobase.datamodel.DataFrame; +import edu.stanford.futuredata.macrobase.util.MacroBaseException; + +import java.util.function.DoublePredicate; +import java.util.function.Predicate; + +public class CountMeanShiftCubedClassifier extends CubeClassifier { + private Predicate strPredicate; + private DoublePredicate doublePredicate; + private DataFrame output; + private String metricColumnName; + private String meanColumnName; + private boolean isStrPredicate; + public static String outlierCountColumnName = CountMeanShiftClassifier.outlierCountColumnName; + public static String inlierCountColumnName = CountMeanShiftClassifier.inlierCountColumnName; + public static String outlierMeanSumColumnName = CountMeanShiftClassifier.outlierMeanSumColumnName; + public static String inlierMeanSumColumnName = CountMeanShiftClassifier.inlierMeanSumColumnName; + + /** + * @param countColumnName Column containing per-row counts + * @param metricColumnName Column on which to classify outliers + * @param meanColumnName Column containing means whose shifts will be explained + * @param predicateStr Predicate used for classification: "==" or "!=" + * @param sentinel String sentinel value used when evaluating the predicate to determine outlier + * @throws MacroBaseException + */ + public CountMeanShiftCubedClassifier( + final String countColumnName, + final String metricColumnName, + final String meanColumnName, + final String predicateStr, + final String sentinel + ) throws MacroBaseException { + super(countColumnName); + this.metricColumnName = metricColumnName; + this.meanColumnName = meanColumnName; + this.strPredicate = MBPredicate.getStrPredicate(predicateStr, sentinel); + this.isStrPredicate = true; + } + + /** + * @param countColumnName Column containing per-row counts + * @param metricColumnName Column on which to classify outliers + * @param meanColumnName Column containing means whose shifts will be explained + * @param predicateStr Predicate used for classification: "==", "!=", "<", ">", "<=", or ">=" + * @param sentinel Double sentinel value used when evaluating the predicate to determine outlier + */ + public CountMeanShiftCubedClassifier( + final String countColumnName, + final String metricColumnName, + final String meanColumnName, + final String predicateStr, + final double sentinel + ) throws MacroBaseException { + super(countColumnName); + this.metricColumnName = metricColumnName; + this.meanColumnName = meanColumnName; + this.doublePredicate = MBPredicate.getDoublePredicate(predicateStr, sentinel); + this.isStrPredicate = false; + } + + /** + * Scan through the metric column, and evaluate the predicate on every value in the column. The ``input'' DataFrame + * remains unmodified; a copy is created and all modifications are made on the copy. Then store counts and + * meancounts for both outliers and inliers. + * @throws Exception + */ + @Override + public void process(DataFrame input) throws Exception { + String[] stringMetrics = input.getStringColumnByName(metricColumnName); + double[] doubleMetrics = input.getDoubleColumnByName(metricColumnName); + output = input.copy(); + double[] totalCountColumn = input.getDoubleColumnByName(getCountColumnName()); + double[] totalMeanColumn = input.getDoubleColumnByName(meanColumnName); + int len = totalCountColumn.length; + double[] outlierCountColumn = new double[len]; + double[] inlierCountColumn = new double[len]; + double[] outlierMeanSumColumn = new double[len]; + double[] inlierMeanSumColumn = new double[len]; + for (int i = 0; i < len; i++) { + if ((isStrPredicate && strPredicate.test(stringMetrics[i])) || + (!isStrPredicate && doublePredicate.test(doubleMetrics[i]))) { + outlierCountColumn[i] = totalCountColumn[i]; + outlierMeanSumColumn[i] = totalMeanColumn[i] * totalCountColumn[i]; + } else { + inlierCountColumn[i] = totalCountColumn[i]; + inlierMeanSumColumn[i] = totalMeanColumn[i] * totalCountColumn[i]; + } + } + output.addColumn(outlierCountColumnName, outlierCountColumn); + output.addColumn(inlierCountColumnName, inlierCountColumn); + output.addColumn(outlierMeanSumColumnName, outlierMeanSumColumn); + output.addColumn(inlierMeanSumColumnName, inlierMeanSumColumn); + } + + + @Override + public DataFrame getResults() { + return output; + } +} diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APLCountMeanShiftSummarizer.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APLCountMeanShiftSummarizer.java new file mode 100644 index 000000000..adf6d5a00 --- /dev/null +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APLCountMeanShiftSummarizer.java @@ -0,0 +1,91 @@ +package edu.stanford.futuredata.macrobase.analysis.summary.aplinear; + +import edu.stanford.futuredata.macrobase.analysis.classify.CountMeanShiftCubedClassifier; +import edu.stanford.futuredata.macrobase.analysis.summary.util.qualitymetrics.AggregationOp; +import edu.stanford.futuredata.macrobase.analysis.summary.util.qualitymetrics.MeanShiftQualityMetric; +import edu.stanford.futuredata.macrobase.analysis.summary.util.qualitymetrics.QualityMetric; +import edu.stanford.futuredata.macrobase.analysis.summary.util.qualitymetrics.SupportQualityMetric; +import edu.stanford.futuredata.macrobase.datamodel.DataFrame; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.List; + +/** + * Summarizer that measures the shift of the mean of some value from the inlier + * population to the outlier population. Explanations return all sets of attributes with min + * support among both inlier and outlier population where the shift of the mean of a value + * from the inliers to the outliers passes some threshold. + */ +public class APLCountMeanShiftSummarizer extends APLSummarizer { + private Logger log = LoggerFactory.getLogger("APLMeanSummarizer"); + + private double minMeanShift = 1.0; + + @Override + public List getAggregateNames() { + return Arrays.asList("outlierCount", "inlierCount", "outlierMeanSum", "inlierMeanSum"); + } + + @Override + public AggregationOp[] getAggregationOps() { + AggregationOp[] curOps = {AggregationOp.SUM, AggregationOp.SUM, AggregationOp.SUM, AggregationOp.SUM}; + return curOps; + } + + @Override + public int[][] getEncoded(List columns, DataFrame input) { + return encoder.encodeAttributesAsArray(columns); + } + + @Override + public double[][] getAggregateColumns(DataFrame input) { + double[] outlierCountColumn = input.getDoubleColumnByName(CountMeanShiftCubedClassifier.outlierCountColumnName); + double[] inlierCountColumn = input.getDoubleColumnByName(CountMeanShiftCubedClassifier.inlierCountColumnName); + double[] outlierMeanSumColumn = input.getDoubleColumnByName(CountMeanShiftCubedClassifier.outlierMeanSumColumnName); + double[] inlierMeanSumColumn = input.getDoubleColumnByName(CountMeanShiftCubedClassifier.inlierMeanSumColumnName); + + double[][] aggregateColumns = new double[4][]; + aggregateColumns[0] = outlierCountColumn; + aggregateColumns[1] = inlierCountColumn; + aggregateColumns[2] = outlierMeanSumColumn; + aggregateColumns[3] = inlierMeanSumColumn; + + return aggregateColumns; + } + + @Override + public List getQualityMetricList() { + List qualityMetricList = new ArrayList<>(); + qualityMetricList.add( + new SupportQualityMetric(0) + ); + qualityMetricList.add( + new SupportQualityMetric(1) + ); + qualityMetricList.add( + new MeanShiftQualityMetric(0, 1, 2, 3) + ); + return qualityMetricList; + } + + @Override + public List getThresholds() { + return Arrays.asList(minOutlierSupport, minOutlierSupport, minMeanShift); + } + + @Override + public double getNumberOutliers(double[][] aggregates) { + double sum = 0; + for (double outlierCount: aggregates[0]) + sum += outlierCount; + return sum; + } + + public void setMinMeanShift(double minMeanShift) { + this.minMeanShift = minMeanShift; + } + +} \ No newline at end of file diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/qualitymetrics/MeanShiftQualityMetric.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/qualitymetrics/MeanShiftQualityMetric.java new file mode 100644 index 000000000..73174e222 --- /dev/null +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/qualitymetrics/MeanShiftQualityMetric.java @@ -0,0 +1,41 @@ +package edu.stanford.futuredata.macrobase.analysis.summary.util.qualitymetrics; + +/** + * Measures the relative shift of the mean of a value from the inlier to + * the outlier population. + */ +public class MeanShiftQualityMetric implements QualityMetric { + private int oCountIdx, iCountIdx, oMeanCountIdx, iMeanCountIdx; + + public MeanShiftQualityMetric( + int oCountIdx, + int iCountIdx, + int oMeanCountIdx, + int iMeanCountIdx + ) { + this.oCountIdx = oCountIdx; + this.iCountIdx = iCountIdx; + this.oMeanCountIdx = oMeanCountIdx; + this.iMeanCountIdx = iMeanCountIdx; + } + + @Override + public String name() { + return "mean_shift"; + } + + @Override + public QualityMetric initialize(double[] globalAggregates) { + return null; + } + + @Override + public double value(double[] aggregates) { + return (aggregates[oMeanCountIdx]/aggregates[oCountIdx]) / (aggregates[iMeanCountIdx]/aggregates[iCountIdx]); + } + + @Override + public boolean isMonotonic() { + return false; + } +} \ No newline at end of file diff --git a/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/classify/CountMeanShiftCubedClassifierTest.java b/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/classify/CountMeanShiftCubedClassifierTest.java new file mode 100644 index 000000000..370c6e989 --- /dev/null +++ b/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/classify/CountMeanShiftCubedClassifierTest.java @@ -0,0 +1,81 @@ +package edu.stanford.futuredata.macrobase.analysis.classify; + +import edu.stanford.futuredata.macrobase.datamodel.DataFrame; +import edu.stanford.futuredata.macrobase.datamodel.Schema; +import edu.stanford.futuredata.macrobase.ingest.CSVDataFrameParser; +import org.junit.Test; + +import java.util.*; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class CountMeanShiftCubedClassifierTest { + + @Test + public void testClassifyStrPredicate() throws Exception { + DataFrame df; + List requiredColumns = new ArrayList<>(Arrays.asList("time", "location", "version", "count", "meanLatency")); + Map colTypes = new HashMap<>(); + colTypes.put("time", Schema.ColType.STRING); + colTypes.put("count", Schema.ColType.DOUBLE); + colTypes.put("meanLatency", Schema.ColType.DOUBLE); + CSVDataFrameParser loader = new CSVDataFrameParser("src/test/resources/sample_cubedshift.csv", requiredColumns); + loader.setColumnTypes(colTypes); + df = loader.load(); + assertEquals(9, df.getNumRows()); + CountMeanShiftCubedClassifier pc = new CountMeanShiftCubedClassifier("count", "time", "meanLatency", "==", "1"); + pc.process(df); + DataFrame output = pc.getResults(); + assertEquals(df.getNumRows(), output.getNumRows()); + assertEquals(5, df.getSchema().getNumColumns()); + assertEquals(9, output.getSchema().getNumColumns()); + + + double[] outlierCountColumn = output.getDoubleColumnByName(CountMeanShiftCubedClassifier.outlierCountColumnName); + double[] inlierMeanColumn = output.getDoubleColumnByName(CountMeanShiftCubedClassifier.inlierMeanSumColumnName); + + assertEquals(150, outlierCountColumn[0], 0.1); + assertEquals(0, outlierCountColumn[1], 0.1); + assertEquals(0, outlierCountColumn[8], 0.1); + assertEquals(1, outlierCountColumn[6], 0.1); + assertEquals(0, inlierMeanColumn[2], 0.1); + assertEquals(20 * 180, inlierMeanColumn[3], 0.1); + assertEquals(25 * 200, inlierMeanColumn[5], 0.1); + assertEquals(100 * 2, inlierMeanColumn[7], 0.1); + } + + @Test + public void testClassifyDoublePredicate() throws Exception { + DataFrame df; + List requiredColumns = new ArrayList<>(Arrays.asList("time", "location", "version", "count", "meanLatency")); + Map colTypes = new HashMap<>(); + colTypes.put("time", Schema.ColType.DOUBLE); + colTypes.put("count", Schema.ColType.DOUBLE); + colTypes.put("meanLatency", Schema.ColType.DOUBLE); + CSVDataFrameParser loader = new CSVDataFrameParser("src/test/resources/sample_cubedshift.csv", requiredColumns); + loader.setColumnTypes(colTypes); + df = loader.load(); + assertEquals(9, df.getNumRows()); + CountMeanShiftCubedClassifier pc = new CountMeanShiftCubedClassifier("count", "time", "meanLatency", "==", 1.0); + pc.process(df); + DataFrame output = pc.getResults(); + assertEquals(df.getNumRows(), output.getNumRows()); + assertEquals(5, df.getSchema().getNumColumns()); + assertEquals(9, output.getSchema().getNumColumns()); + + + double[] outlierCountColumn = output.getDoubleColumnByName(CountMeanShiftCubedClassifier.outlierCountColumnName); + double[] inlierMeanColumn = output.getDoubleColumnByName(CountMeanShiftCubedClassifier.inlierMeanSumColumnName); + + assertEquals(150, outlierCountColumn[0], 0.1); + assertEquals(0, outlierCountColumn[1], 0.1); + assertEquals(0, outlierCountColumn[8], 0.1); + assertEquals(1, outlierCountColumn[6], 0.1); + assertEquals(0, inlierMeanColumn[2], 0.1); + assertEquals(20 * 180, inlierMeanColumn[3], 0.1); + assertEquals(25 * 200, inlierMeanColumn[5], 0.1); + assertEquals(100 * 2, inlierMeanColumn[7], 0.1); + } + +} diff --git a/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APLCountMeanShiftSummarizerTest.java b/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APLCountMeanShiftSummarizerTest.java new file mode 100644 index 000000000..7b58078cc --- /dev/null +++ b/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APLCountMeanShiftSummarizerTest.java @@ -0,0 +1,58 @@ +package edu.stanford.futuredata.macrobase.analysis.summary.aplinear; + +import edu.stanford.futuredata.macrobase.analysis.classify.CountMeanShiftCubedClassifier; +import edu.stanford.futuredata.macrobase.datamodel.DataFrame; +import edu.stanford.futuredata.macrobase.datamodel.Schema; +import edu.stanford.futuredata.macrobase.ingest.CSVDataFrameParser; +import junit.framework.TestCase; +import org.junit.Before; +import org.junit.Test; + +import java.util.*; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class APLCountMeanShiftSummarizerTest { + private DataFrame df; + + @Before + public void setUp() throws Exception { + List requiredColumns = new ArrayList<>(Arrays.asList("time", "location", "version", "count", "language", "meanLatency")); + Map colTypes = new HashMap<>(); + colTypes.put("time", Schema.ColType.STRING); + colTypes.put("count", Schema.ColType.DOUBLE); + colTypes.put("meanLatency", Schema.ColType.DOUBLE); + CSVDataFrameParser loader = new CSVDataFrameParser("src/test/resources/sample_cubedshift.csv", requiredColumns); + loader.setColumnTypes(colTypes); + df = loader.load(); + } + + @Test + public void testSummarize() throws Exception { + assertEquals(9, df.getNumRows()); + CountMeanShiftCubedClassifier pc = new CountMeanShiftCubedClassifier("count", "time", "meanLatency", "==", "1"); + pc.process(df); + DataFrame output = pc.getResults(); + assertEquals(df.getNumRows(), output.getNumRows()); + + List explanationAttributes = Arrays.asList( + "location", + "version", + "language" + ); + + APLCountMeanShiftSummarizer summ = new APLCountMeanShiftSummarizer(); + summ.setMinSupport(.05); + summ.setMinMeanShift(1.1); + summ.setAttributes(explanationAttributes); + summ.process(output); + APLExplanation e = summ.getResults(); + TestCase.assertEquals(3, e.getResults().size()); + assertTrue(e.prettyPrint().contains("location=AUS")); + assertTrue(e.prettyPrint().contains("version=v2")); + assertTrue(e.prettyPrint().contains("language=ENG")); + } + +} + diff --git a/lib/src/test/resources/sample_cubedshift.csv b/lib/src/test/resources/sample_cubedshift.csv new file mode 100644 index 000000000..3c8bed7f7 --- /dev/null +++ b/lib/src/test/resources/sample_cubedshift.csv @@ -0,0 +1,10 @@ +time,location,version,count,meanLatency,language +1,AUS,v3,150,37,ENG +2,AUS,v3,120,25,ENG +1,AUS,v2,150,45,ENG +2,AUS,v2,180,20,ENG +1,USA,v3,200,20,ENG +2,USA,v3,200,25,ENG +1,UK,v3,1,50,ENG +2,UK,v3,2,100,ENG +2,CAN,v3,200,30,ENG