diff --git a/core/demo/demo_query.json b/core/demo/demo_query.json
index 4dfb18a67..c6c9b5bd1 100644
--- a/core/demo/demo_query.json
+++ b/core/demo/demo_query.json
@@ -10,6 +10,6 @@
   "summarizer": "apriori",
   "attributes": ["location", "version"],
-  "minRiskRatio": 10.0,
+  "minRatioMetric": 10.0,
   "minSupport": 0.2
 }
diff --git a/core/demo/sample_cubed.csv b/core/demo/sample_cubed.csv
index 973f8919a..54730ebb5 100644
--- a/core/demo/sample_cubed.csv
+++ b/core/demo/sample_cubed.csv
@@ -1,10 +1,10 @@
-location,version,count,mean,std
-AUS,v3,150,35.7415333333,4.61047330283
-AUS,v4,50,34.0068,4.93254640534
-CAN,v1,50,34.0772,5.24978796073
-CAN,v2,150,34.9576666667,4.62355719076
-CAN,v3,20,8.26286,5.42913454695
-RUS,v4,200,35.69215,5.16468641899
-UK,v2,100,35.6926,4.55136401355
-UK,v3,100,34.4426,5.62178062665
-USA,v1,200,34.49175,5.11885885892
+,location,version,count,mean,std,min,max
+0,AUS,v3,150,35.74153333333334,4.610473302825061,26.28,47.89
+1,AUS,v4,50,34.006800000000005,4.932546405338355,22.74,46.61
+2,CAN,v1,79,401.20126582278453,472.14713946343613,18.72,1000.76
+3,CAN,v2,158,83.85955696202532,212.47244153214433,17.25,1000.77
+4,CAN,v3,20,8.26286,5.42913454694596,-0.335,18.85
+5,RUS,v4,200,35.69215000000002,5.164686418993188,25.83,50.04
+6,UK,v2,100,35.6926,4.551364013550046,21.48,46.49
+7,UK,v3,100,34.4426,5.621780626646813,18.75,45.73
+8,USA,v1,200,34.49175,5.118858858924315,17.24,45.43
diff --git a/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/CubePipeline.java b/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/CubePipeline.java
index 773fb27c3..bc9affbe7 100644
--- a/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/CubePipeline.java
+++ b/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/CubePipeline.java
@@ -1,6 +1,7 @@
 package edu.stanford.futuredata.macrobase.pipeline;
 
 import edu.stanford.futuredata.macrobase.analysis.classify.ArithmeticClassifier;
+import edu.stanford.futuredata.macrobase.analysis.classify.BoundsClassifier;
 import edu.stanford.futuredata.macrobase.analysis.classify.CubeClassifier;
 import edu.stanford.futuredata.macrobase.analysis.classify.QuantileClassifier;
 import edu.stanford.futuredata.macrobase.analysis.summary.Explanation;
@@ -28,6 +29,8 @@ public class CubePipeline implements Pipeline {
     private String countColumn;
     private String meanColumn;
     private String stdColumn;
+    private String minColumn;
+    private String maxColumn;
     private LinkedHashMap<String, Double> quantileColumns;
     private double percentile;
     private boolean includeHi;
@@ -44,6 +47,8 @@ public CubePipeline(PipelineConfig conf) {
         countColumn = conf.get("countColumn", "count");
         meanColumn = conf.get("meanColumn", "mean");
         stdColumn = conf.get("stdColumn", "std");
+        minColumn = conf.get("minColumn", "min");
+        maxColumn = conf.get("maxColumn", "max");
         quantileColumns = conf.get("quantileColumns", new LinkedHashMap<String, Double>());
         percentile = conf.get("percentile", 1.0);
         includeHi = conf.get("includeHi", true);
@@ -105,6 +110,14 @@ private Map<String, Schema.ColType> getColTypes() throws MacrobaseException {
                 }
                 return colTypes;
             }
+            case "bounds": {
+                colTypes.put(countColumn, Schema.ColType.DOUBLE);
+                colTypes.put(meanColumn, Schema.ColType.DOUBLE);
+                colTypes.put(stdColumn, Schema.ColType.DOUBLE);
+                colTypes.put(minColumn, Schema.ColType.DOUBLE);
+                colTypes.put(maxColumn, Schema.ColType.DOUBLE);
+                return colTypes;
+            }
             default:
                 throw new MacrobaseException("Bad Classifier Name");
         }
@@ -128,6 +141,14 @@ private CubeClassifier getClassifier() throws MacrobaseException {
                 classifier.setIncludeLow(includeLo);
                 return classifier;
             }
+            case "bounds": {
+                BoundsClassifier classifier =
+                    new BoundsClassifier(countColumn, meanColumn, stdColumn, minColumn, maxColumn);
+                classifier.setPercentile(percentile);
+                classifier.setIncludeHigh(includeHi);
+                classifier.setIncludeLow(includeLo);
+                return classifier;
+            }
             default:
                 throw new MacrobaseException("Bad Classifier Name");
         }
diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/classify/BoundsClassifier.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/classify/BoundsClassifier.java
new file mode 100644
index 000000000..d76433076
--- /dev/null
+++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/classify/BoundsClassifier.java
@@ -0,0 +1,226 @@
+package edu.stanford.futuredata.macrobase.analysis.classify;
+
+import edu.stanford.futuredata.macrobase.analysis.classify.stats.WeightedPercentile;
+import edu.stanford.futuredata.macrobase.datamodel.DataFrame;
+
+/**
+ * Classify rows by high / low values based on bounds on the possible number of outliers.
+ * Bounds are constructed using the mean, standard deviation, minimum, and maximum of each row.
+ * Returns a new dataframe with an additional column containing the estimated number of
+ * outliers in each group, which can be non-integer.
+ */
+public class BoundsClassifier extends CubeClassifier implements ThresholdClassifier {
+    // Parameters
+    private String meanColumnName = "mean";
+    private String stdColumnName = "std";
+    private String minColumnName = "min";
+    private String maxColumnName = "max";
+    private double percentile = 1.0;
+    private boolean includeHigh = true;
+    private boolean includeLow = true;
+
+    // Calculated values
+    private double lowCutoff;
+    private double highCutoff;
+    private DataFrame output;
+
+    public BoundsClassifier(String countColumnName, String meanColumnName,
+                            String stdColumnName, String minColumnName, String maxColumnName) {
+        super(countColumnName);
+        this.meanColumnName = meanColumnName;
+        this.stdColumnName = stdColumnName;
+        this.minColumnName = minColumnName;
+        this.maxColumnName = maxColumnName;
+    }
+
+    @Override
+    public void process(DataFrame input) {
+        double[] means = input.getDoubleColumnByName(meanColumnName);
+        double[] counts = input.getDoubleColumnByName(countColumnName);
+        double[] stds = input.getDoubleColumnByName(stdColumnName);
+        double[] mins = input.getDoubleColumnByName(minColumnName);
+        double[] maxs = input.getDoubleColumnByName(maxColumnName);
+        int len = means.length;
+
+        WeightedPercentile wp = new WeightedPercentile(counts, means);
+        lowCutoff = wp.evaluate(percentile);
+        highCutoff = wp.evaluate(100.0 - percentile);
+
+        output = input.copy();
+        double[] resultColumn = new double[len];
+        for (int i = 0; i < len; i++) {
+            double mean = means[i];
+            double std = stds[i];
+            double count = counts[i];
+            double min = mins[i];
+            double max = maxs[i];
+            double numOutliers = 0.0;
+            if (Double.isNaN(std) || std == 0.0) {
+                // Only one metric in the group, or all metrics are equal, so every
+                // point sits at the mean: the group is entirely in or entirely out.
+                if ((includeHigh && mean > highCutoff)
+                        || (includeLow && mean < lowCutoff)) {
+                    numOutliers += count;
+                }
+            } else {
+                // We use lower bounds as the heuristic for the number of outliers.
+                // The maxStdBound is based on the fact that the true standard deviation must be
+                // no greater than the maximum possible standard deviation. It is computed using
+                // an inequality that extends Chebyshev's inequality to take advantage of knowing
+                // the minimum and maximum.
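+                // In symbols, writing v = std^2 * (count - 1) / count for the population
+                // variance and c for the cutoff, the high-side bounds computed below are:
+                //   maxStdBound = (v + (min - mean) * (c - mean)) / ((max - c) * (max - min))
+                //   minStdBound = (c - mean)^2 / ((c - mean)^2 + v)  when mean > c, else 0
+                //   markovBound = 1 - (max - mean) / (max - c)
+                // The low-side case mirrors these with min and max exchanged.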
+                // The minStdBound is based on the fact that the true standard deviation must be
+                // no less than the minimum possible standard deviation. It is computed using
+                // Cantelli's inequality.
+                // The markovBound is based on Markov's inequality and makes use of the mean,
+                // maximum, and minimum.
+                if (includeHigh) {
+                    if (highCutoff >= max) {
+                        numOutliers += 0.0;
+                    } else if (highCutoff < min) {
+                        numOutliers += count;
+                    } else {
+                        double maxStdBound = ((std * std * (count - 1) / count) + (min - mean) * (highCutoff - mean)) /
+                                ((max - highCutoff) * (max - min));
+                        double markovBound = 1 - (max - mean) / (max - highCutoff);
+                        double minStdBound;
+                        if (mean <= highCutoff) {
+                            minStdBound = 0;
+                        } else {
+                            minStdBound = ((highCutoff - mean) * (highCutoff - mean)) /
+                                    (((highCutoff - mean) * (highCutoff - mean)) + (std * std * (count - 1) / count));
+                        }
+                        numOutliers += count * Math.max(maxStdBound, Math.max(minStdBound, markovBound));
+                    }
+                }
+                if (includeLow) {
+                    if (lowCutoff <= min) {
+                        numOutliers += 0.0;
+                    } else if (lowCutoff > max) {
+                        numOutliers += count;
+                    } else {
+                        double maxStdBound = ((std * std * (count - 1) / count) + (max - mean) * (lowCutoff - mean)) /
+                                ((min - lowCutoff) * (min - max));
+                        double markovBound = 1 - (mean - min) / (lowCutoff - min);
+                        double minStdBound;
+                        if (mean >= lowCutoff) {
+                            minStdBound = 0;
+                        } else {
+                            minStdBound = ((mean - lowCutoff) * (mean - lowCutoff)) /
+                                    (((mean - lowCutoff) * (mean - lowCutoff)) + (std * std * (count - 1) / count));
+                        }
+                        numOutliers += count * Math.max(maxStdBound, Math.max(minStdBound, markovBound));
+                    }
+                }
+            }
+            resultColumn[i] = numOutliers;
+        }
+        output.addDoubleColumn(outputColumnName, resultColumn);
+    }
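+
+    // Worked instance of the bounds in process() (numbers follow the uniform groups
+    // in BoundsClassifierTest): a group of 101 evenly spaced points at mean +/- 50
+    // has population variance v = 850. For a cutoff c = mean + 1,
+    // maxStdBound = (850 - 50) / (49 * 100) ~= 0.163, markovBound = 1 - 50/49 < 0,
+    // and minStdBound = 0, so about 0.163 * 101 ~= 16.5 of the true 49 high points
+    // are counted: a conservative lower bound, as intended.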
+
+    @Override
+    public DataFrame getResults() {
+        return output;
+    }
+
+    // Parameter Getters and Setters
+    public double getPercentile() {
+        return percentile;
+    }
+
+    /**
+     * @param percentile Cutoff percentile for high or low values.
+     * @return this
+     */
+    public BoundsClassifier setPercentile(double percentile) {
+        this.percentile = percentile;
+        return this;
+    }
+
+    public String getMeanColumnName() {
+        return meanColumnName;
+    }
+
+    /**
+     * @param meanColumnName Which column contains the mean of each row's attribute
+     *                       combination.
+     * @return this
+     */
+    public BoundsClassifier setMeanColumnName(String meanColumnName) {
+        this.meanColumnName = meanColumnName;
+        return this;
+    }
+
+    public String getStdColumnName() {
+        return stdColumnName;
+    }
+
+    /**
+     * @param stdColumnName Which column contains the standard deviation of metrics for events
+     *                      corresponding to each row's attribute combination. Assumed to contain
+     *                      the sample standard deviation.
+     * @return this
+     */
+    public BoundsClassifier setStdColumnName(String stdColumnName) {
+        this.stdColumnName = stdColumnName;
+        return this;
+    }
+
+    public String getMinColumnName() {
+        return minColumnName;
+    }
+
+    /**
+     * @param minColumnName Which column contains the minimum of metrics for events
+     *                      corresponding to each row's attribute combination.
+     * @return this
+     */
+    public BoundsClassifier setMinColumnName(String minColumnName) {
+        this.minColumnName = minColumnName;
+        return this;
+    }
+
+    public String getMaxColumnName() {
+        return maxColumnName;
+    }
+
+    /**
+     * @param maxColumnName Which column contains the maximum of metrics for events
+     *                      corresponding to each row's attribute combination.
+     * @return this
+     */
+    public BoundsClassifier setMaxColumnName(String maxColumnName) {
+        this.maxColumnName = maxColumnName;
+        return this;
+    }
+
+    public boolean isIncludeHigh() {
+        return includeHigh;
+    }
+
+    /**
+     * @param includeHigh Whether to count high points as outliers.
+     * @return this
+     */
+    public BoundsClassifier setIncludeHigh(boolean includeHigh) {
+        this.includeHigh = includeHigh;
+        return this;
+    }
+
+    public boolean isIncludeLow() {
+        return includeLow;
+    }
+
+    /**
+     * @param includeLow Whether to count low points as outliers.
+     * @return this
+     */
+    public BoundsClassifier setIncludeLow(boolean includeLow) {
+        this.includeLow = includeLow;
+        return this;
+    }
+
+    public double getLowCutoff() {
+        return lowCutoff;
+    }
+
+    public double getHighCutoff() {
+        return highCutoff;
+    }
+}
diff --git a/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/classify/BoundsClassifierTest.java b/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/classify/BoundsClassifierTest.java
new file mode 100644
index 000000000..3d75975ba
--- /dev/null
+++ b/lib/src/test/java/edu/stanford/futuredata/macrobase/analysis/classify/BoundsClassifierTest.java
@@ -0,0 +1,125 @@
+package edu.stanford.futuredata.macrobase.analysis.classify;
+
+import edu.stanford.futuredata.macrobase.datamodel.DataFrame;
+import org.apache.commons.math3.stat.descriptive.rank.Percentile;
+import org.junit.Before;
+import org.junit.Test;
+
+import java.util.ArrayList;
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+public class BoundsClassifierTest {
+    private int length = 100;
+    private DataFrame df;
+    private double[] rawData;
+    private List<double[]> rawGroups;
+
+    @Before
+    public void setUp() {
+        df = new DataFrame();
+        double[] counts = new double[length];
+        double[] means = new double[length];
+        double[] stds = new double[length];
+        double[] mins = new double[length];
+        double[] maxs = new double[length];
+
+        // Group i holds the 101 integers in [i - 50, i + 50].
+        rawData = new double[length * 101];
+        rawGroups = new ArrayList<>();
+        int d = 0;
+        for (int i = 0; i < length; i++) {
+            double[] rawGroup = new double[101];
+            int g = 0;
+            for (int j = i - 50; j <= i + 50; j++) {
+                rawData[d++] = j;
+                rawGroup[g++] = j;
+            }
+            rawGroups.add(rawGroup);
+            counts[i] = 101;
+            means[i] = i;
+            mins[i] = i - 50;
+            maxs[i] = i + 50;
+            double std = 0.0;
+            for (double raw : rawGroup) {
+                std += (raw - i) * (raw - i);
+            }
+            stds[i] = Math.sqrt(std / (rawGroup.length - 1));
+        }
+        df.addDoubleColumn("count", counts);
+        df.addDoubleColumn("mean", means);
+        df.addDoubleColumn("std", stds);
+        df.addDoubleColumn("min", mins);
+        df.addDoubleColumn("max", maxs);
+    }
+
+    @Test
+    public void testClassify() throws Exception {
+        assertEquals(length, df.getNumRows());
+        BoundsClassifier bc = new BoundsClassifier("count", "mean", "std", "min", "max");
+        bc.process(df);
+        DataFrame output = bc.getResults();
+        assertEquals(df.getNumRows(), output.getNumRows());
+        assertEquals(5, df.getSchema().getNumColumns());
+        assertEquals(6, output.getSchema().getNumColumns());
+
+        Percentile percentile = new Percentile();
+        percentile.setData(rawData);
+        double trueLowCutoff = percentile.evaluate(1);
+        double trueHighCutoff = percentile.evaluate(99);
+        assertEquals(trueLowCutoff, bc.getLowCutoff(), 40.0);
+        assertEquals(trueHighCutoff, bc.getHighCutoff(), 40.0);
+
+        double[] outliers = output.getDoubleColumnByName("_OUTLIER");
+
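+        // The classifier reports a lower-bound estimate rather than an exact count,
+        // so the ground-truth comparison below uses a loose tolerance (10 of the
+        // 101 points in each group).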
+        for (int i = 0; i < outliers.length; i++) {
+            int trueNumOutliers = 0;
+            double[] rawGroup = rawGroups.get(i);
+            for (int j = 0; j < rawGroup.length; j++) {
+                if (rawGroup[j] < trueLowCutoff || rawGroup[j] > trueHighCutoff) {
+                    trueNumOutliers++;
+                }
+            }
+            assertEquals(trueNumOutliers, outliers[i], 10.0);
+        }
+    }
+
+    @Test
+    public void testConfigure() throws Exception {
+        BoundsClassifier bc = new BoundsClassifier("col1", "col2", "col3", "col4", "col5");
+        bc.setMeanColumnName("mean");
+        bc.setCountColumnName("count");
+        bc.setStdColumnName("std");
+        bc.setMinColumnName("min");
+        bc.setMaxColumnName("max");
+        bc.setIncludeHigh(false);
+        bc.setIncludeLow(true);
+        bc.setOutputColumnName("_OUT");
+        bc.setPercentile(5.0);
+
+        bc.process(df);
+        DataFrame output = bc.getResults();
+        assertEquals(df.getNumRows(), output.getNumRows());
+
+        Percentile percentile = new Percentile();
+        percentile.setData(rawData);
+        double trueLowCutoff = percentile.evaluate(5);
+        assertEquals(trueLowCutoff, bc.getLowCutoff(), 25.0);
+
+        double[] outliers = output.getDoubleColumnByName("_OUT");
+
+        for (int i = 0; i < outliers.length; i++) {
+            int trueNumOutliers = 0;
+            double[] rawGroup = rawGroups.get(i);
+            for (int j = 0; j < rawGroup.length; j++) {
+                if (rawGroup[j] < trueLowCutoff) {
+                    trueNumOutliers++;
+                }
+            }
+            assertEquals(trueNumOutliers, outliers[i], 15.0);
+        }
+    }
+}
\ No newline at end of file
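
Usage sketch (not part of the patch; the "classifier" selector key is an assumption
based on the existing CubePipeline demo config, while the column and summarizer keys
appear in the hunks above): a query exercising the new classifier could look like

  {
    "classifier": "bounds",
    "minColumn": "min",
    "maxColumn": "max",
    "percentile": 1.0,
    "summarizer": "apriori",
    "attributes": ["location", "version"],
    "minRatioMetric": 10.0,
    "minSupport": 0.2
  }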