Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add BoundsClassifier #215

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion core/demo/demo_query.json
Original file line number Diff line number Diff line change
@@ -10,6 +10,6 @@

"summarizer": "apriori",
"attributes": ["location", "version"],
"minRiskRatio": 10.0,
"minRatioMetric": 10.0,
"minSupport": 0.2
}
20 changes: 10 additions & 10 deletions core/demo/sample_cubed.csv
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
location,version,count,mean,std
AUS,v3,150,35.7415333333,4.61047330283
AUS,v4,50,34.0068,4.93254640534
CAN,v1,50,34.0772,5.24978796073
CAN,v2,150,34.9576666667,4.62355719076
CAN,v3,20,8.26286,5.42913454695
RUS,v4,200,35.69215,5.16468641899
UK,v2,100,35.6926,4.55136401355
UK,v3,100,34.4426,5.62178062665
USA,v1,200,34.49175,5.11885885892
,location,version,count,mean,std,min,max
0,AUS,v3,150,35.74153333333334,4.610473302825061,26.28,47.89
1,AUS,v4,50,34.006800000000005,4.932546405338355,22.74,46.61
2,CAN,v1,79,401.20126582278453,472.14713946343613,18.72,1000.76
3,CAN,v2,158,83.85955696202532,212.47244153214433,17.25,1000.77
4,CAN,v3,20,8.26286,5.42913454694596,-0.335,18.85
5,RUS,v4,200,35.69215000000002,5.164686418993188,25.83,50.04
6,UK,v2,100,35.6926,4.551364013550046,21.48,46.49
7,UK,v3,100,34.4426,5.621780626646813,18.75,45.73
8,USA,v1,200,34.49175,5.118858858924315,17.24,45.43
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package edu.stanford.futuredata.macrobase.pipeline;

import edu.stanford.futuredata.macrobase.analysis.classify.ArithmeticClassifier;
import edu.stanford.futuredata.macrobase.analysis.classify.BoundsClassifier;
import edu.stanford.futuredata.macrobase.analysis.classify.CubeClassifier;
import edu.stanford.futuredata.macrobase.analysis.classify.QuantileClassifier;
import edu.stanford.futuredata.macrobase.analysis.summary.Explanation;
@@ -28,6 +29,8 @@ public class CubePipeline implements Pipeline {
private String countColumn;
private String meanColumn;
private String stdColumn;
private String minColumn;
private String maxColumn;
private LinkedHashMap<String, Double> quantileColumns;
private double percentile;
private boolean includeHi;
@@ -44,6 +47,8 @@ public CubePipeline(PipelineConfig conf) {
countColumn = conf.get("countColumn", "count");
meanColumn = conf.get("meanColumn", "mean");
stdColumn = conf.get("stdColumn", "std");
minColumn = conf.get("minColumn", "min");
maxColumn = conf.get("maxColumn", "max");
quantileColumns = conf.get("quantileColumns", new LinkedHashMap<String, Double>());
percentile = conf.get("percentile", 1.0);
includeHi = conf.get("includeHi", true);
@@ -105,6 +110,14 @@ private Map<String, Schema.ColType> getColTypes() throws MacrobaseException {
}
return colTypes;
}
case "bounds": {
colTypes.put(countColumn, Schema.ColType.DOUBLE);
colTypes.put(meanColumn, Schema.ColType.DOUBLE);
colTypes.put(stdColumn, Schema.ColType.DOUBLE);
colTypes.put(minColumn, Schema.ColType.DOUBLE);
colTypes.put(maxColumn, Schema.ColType.DOUBLE);
return colTypes;
}
default:
throw new MacrobaseException("Bad Classifier Name");
}
@@ -128,6 +141,14 @@ private CubeClassifier getClassifier() throws MacrobaseException {
classifier.setIncludeLow(includeLo);
return classifier;
}
case "bounds": {
BoundsClassifier classifier =
new BoundsClassifier(countColumn, meanColumn, stdColumn, minColumn, maxColumn);
classifier.setPercentile(percentile);
classifier.setIncludeHigh(includeHi);
classifier.setIncludeLow(includeLo);
return classifier;
}
default:
throw new MacrobaseException("Bad Classifier Name");
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
package edu.stanford.futuredata.macrobase.analysis.classify;

import edu.stanford.futuredata.macrobase.analysis.classify.stats.WeightedPercentile;
import edu.stanford.futuredata.macrobase.datamodel.DataFrame;

/**
* Classify rows by high / low values based on bounds on the possible number of outliers.
* Bounds are constructed using the mean, standard deviation, minimum and maximum of each row.
* Returns a new dataframe with a column representation of the estimated number of outliers
* for each group, which can be non-integer.
*/
public class BoundsClassifier extends CubeClassifier implements ThresholdClassifier {
    // Parameters
    private String meanColumnName = "mean";
    private String stdColumnName = "std";
    private String minColumnName = "min";
    private String maxColumnName = "max";
    private double percentile = 1.0;
    private boolean includeHigh = true;
    private boolean includeLow = true;

    // Calculated values
    private double lowCutoff;
    private double highCutoff;
    private DataFrame output;

    /**
     * @param countColumnName column holding the number of raw events in each cube row
     * @param meanColumnName  column holding each row's metric mean
     * @param stdColumnName   column holding each row's sample standard deviation
     * @param minColumnName   column holding each row's metric minimum
     * @param maxColumnName   column holding each row's metric maximum
     */
    public BoundsClassifier(String countColumnName, String meanColumnName,
                            String stdColumnName, String minColumnName, String maxColumnName) {
        super(countColumnName);
        this.meanColumnName = meanColumnName;
        this.stdColumnName = stdColumnName;
        this.minColumnName = minColumnName;
        this.maxColumnName = maxColumnName;
    }

    /**
     * Computes the weighted percentile cutoffs over the cube rows, then estimates, per row,
     * a lower bound on the number of outlying events using Chebyshev-style inequalities on
     * the row's mean, standard deviation, min, and max. Appends the estimate (possibly
     * non-integer) as the output column of a copy of {@code input}.
     */
    @Override
    public void process(DataFrame input) {
        double[] means = input.getDoubleColumnByName(meanColumnName);
        double[] counts = input.getDoubleColumnByName(countColumnName);
        double[] stds = input.getDoubleColumnByName(stdColumnName);
        double[] mins = input.getDoubleColumnByName(minColumnName);
        double[] maxs = input.getDoubleColumnByName(maxColumnName);
        int len = means.length;

        WeightedPercentile wp = new WeightedPercentile(counts, means);
        lowCutoff = wp.evaluate(percentile);
        highCutoff = wp.evaluate(100.0 - percentile);

        output = input.copy();
        double[] resultColumn = new double[len];
        for (int i = 0; i < len; i++) {
            double mean = means[i];
            double std = stds[i];
            double count = counts[i];
            double min = mins[i];
            double max = maxs[i];
            double numOutliers = 0.0;
            if (Double.isNaN(std) || std == 0.0) {
                // Only one metric in the group, or all metrics are equal: the whole group
                // lies at its mean, so it is entirely inside or entirely outside a cutoff.
                // BUG FIX: the low-side test previously compared against highCutoff, which
                // counted nearly every group as fully outlying when includeLow was set.
                if ((includeHigh && mean > highCutoff)
                        || (includeLow && mean < lowCutoff)) {
                    numOutliers += count;
                }
            } else {
                // We use lower bounds as the heuristic for the number of outliers; the
                // high-side and low-side cases are mirror images handled by
                // outlierFractionBound (see its javadoc for the three bounds used).
                if (includeHigh) {
                    if (highCutoff >= max) {
                        // no metric in the group can exceed the cutoff
                    } else if (highCutoff < min) {
                        numOutliers += count; // entire group lies above the cutoff
                    } else {
                        numOutliers += count
                                * outlierFractionBound(count, mean, std, max, min, highCutoff);
                    }
                }
                if (includeLow) {
                    if (lowCutoff <= min) {
                        // no metric in the group can fall below the cutoff
                    } else if (lowCutoff > max) {
                        numOutliers += count; // entire group lies below the cutoff
                    } else {
                        numOutliers += count
                                * outlierFractionBound(count, mean, std, min, max, lowCutoff);
                    }
                }
            }
            resultColumn[i] = numOutliers;
        }
        output.addDoubleColumn(outputColumnName, resultColumn);
    }

    /**
     * Lower bound on the fraction of a group's metrics lying beyond {@code cutoff} on the
     * side of the {@code near} extreme ({@code near} = max for the high side, min for the
     * low side; {@code far} is the opposite extreme). Returns the best of three bounds:
     * <ul>
     *   <li>maxStdBound — the true standard deviation can be no greater than the maximum
     *       possible given min/max (an extension of Chebyshev's inequality);</li>
     *   <li>minStdBound — the true standard deviation can be no less than the minimum
     *       possible (Cantelli's inequality); applies only when the mean is already beyond
     *       the cutoff;</li>
     *   <li>markovBound — Markov's inequality using the mean, min, and max.</li>
     * </ul>
     * Callers must ensure {@code cutoff} lies strictly between {@code far} and {@code near}.
     */
    private static double outlierFractionBound(double count, double mean, double std,
                                               double near, double far, double cutoff) {
        // Convert the stored sample standard deviation to the population variance.
        double popVar = std * std * (count - 1) / count;
        double maxStdBound = (popVar + (far - mean) * (cutoff - mean))
                / ((near - cutoff) * (near - far));
        double markovBound = 1 - (near - mean) / (near - cutoff);
        double minStdBound = 0;
        if ((mean - cutoff) * (near - cutoff) > 0) {
            // The mean itself is beyond the cutoff; Cantelli's inequality applies.
            double gap = cutoff - mean;
            minStdBound = (gap * gap) / ((gap * gap) + popVar);
        }
        return Math.max(maxStdBound, Math.max(minStdBound, markovBound));
    }

    @Override
    public DataFrame getResults() {
        return output;
    }

    // Parameter Getters and Setters
    public double getPercentile() {
        return percentile;
    }

    /**
     * @param percentile Cutoff point for high or low values
     * @return this
     */
    public BoundsClassifier setPercentile(double percentile) {
        this.percentile = percentile;
        return this;
    }

    public String getMeanColumnName() {
        return meanColumnName;
    }

    /**
     * @param meanColumnName Which column contains the mean of each row's attribute
     *                       combination.
     * @return this
     */
    public BoundsClassifier setMeanColumnName(String meanColumnName) {
        this.meanColumnName = meanColumnName;
        return this;
    }

    public String getStdColumnName() {
        return stdColumnName;
    }

    /**
     * @param stdColumnName Which column contains the standard deviation of metrics for events
     *                      corresponding to each row's attribute combination. Assumed to contain
     *                      the sample standard deviation.
     * @return this
     */
    public BoundsClassifier setStdColumnName(String stdColumnName) {
        this.stdColumnName = stdColumnName;
        return this;
    }

    public String getMinColumnName() {
        return minColumnName;
    }

    /**
     * @param minColumnName Which column contains the minimum of metrics for events
     *                      corresponding to each row's attribute combination.
     * @return this
     */
    public BoundsClassifier setMinColumnName(String minColumnName) {
        this.minColumnName = minColumnName;
        return this;
    }

    public String getMaxColumnName() {
        return maxColumnName;
    }

    /**
     * @param maxColumnName Which column contains the maximum of metrics for events
     *                      corresponding to each row's attribute combination.
     * @return this
     */
    public BoundsClassifier setMaxColumnName(String maxColumnName) {
        this.maxColumnName = maxColumnName;
        return this;
    }

    public boolean isIncludeHigh() {
        return includeHigh;
    }

    /**
     * @param includeHigh Whether to count high points as outliers.
     * @return this
     */
    public BoundsClassifier setIncludeHigh(boolean includeHigh) {
        this.includeHigh = includeHigh;
        return this;
    }

    public boolean isIncludeLow() {
        return includeLow;
    }

    /**
     * @param includeLow Whether to count low points as outliers
     * @return this
     */
    public BoundsClassifier setIncludeLow(boolean includeLow) {
        this.includeLow = includeLow;
        return this;
    }

    public double getLowCutoff() {
        return lowCutoff;
    }

    public double getHighCutoff() {
        return highCutoff;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
package edu.stanford.futuredata.macrobase.analysis.classify;

import edu.stanford.futuredata.macrobase.datamodel.DataFrame;
import org.apache.commons.math3.stat.descriptive.rank.Percentile;
import org.junit.Before;
import org.junit.Test;

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

public class BoundsClassifierTest {
    // Fixture: 100 cube rows, each aggregating 101 consecutive integer metrics
    // centered on the row index (row i covers i-50 .. i+50).
    private int numGroups = 100;
    private DataFrame df;
    private double[] allMetrics;
    private List<double[]> groupMetrics;

    @Before
    public void setUp() {
        int groupSize = 101;
        double[] counts = new double[numGroups];
        double[] means = new double[numGroups];
        double[] stds = new double[numGroups];
        double[] mins = new double[numGroups];
        double[] maxs = new double[numGroups];

        allMetrics = new double[numGroups * groupSize];
        groupMetrics = new ArrayList<>();
        int flatIdx = 0;
        for (int i = 0; i < numGroups; i++) {
            double[] group = new double[groupSize];
            for (int g = 0; g < groupSize; g++) {
                double value = i - 50 + g;
                group[g] = value;
                allMetrics[flatIdx++] = value;
            }
            groupMetrics.add(group);
            counts[i] = groupSize;
            means[i] = i;
            mins[i] = i - 50;
            maxs[i] = i + 50;
            // Sample standard deviation of the group's metrics about its mean i.
            double sumSq = 0.0;
            for (double v : group) {
                sumSq += (v - i) * (v - i);
            }
            stds[i] = Math.sqrt(sumSq / (groupSize - 1));
        }

        df = new DataFrame();
        df.addDoubleColumn("count", counts);
        df.addDoubleColumn("mean", means);
        df.addDoubleColumn("std", stds);
        df.addDoubleColumn("min", mins);
        df.addDoubleColumn("max", maxs);
    }

    @Test
    public void testClassify() throws Exception {
        assertEquals(numGroups, df.getNumRows());
        BoundsClassifier classifier = new BoundsClassifier("count", "mean", "std", "min", "max");
        classifier.process(df);
        DataFrame result = classifier.getResults();
        assertEquals(df.getNumRows(), result.getNumRows());
        assertEquals(5, df.getSchema().getNumColumns());
        assertEquals(6, result.getSchema().getNumColumns());

        // The classifier's weighted-percentile cutoffs should be roughly the true
        // percentiles of the raw metrics (loose tolerance: cutoffs come from row means).
        Percentile percentile = new Percentile();
        percentile.setData(allMetrics);
        double expectedLow = percentile.evaluate(1);
        double expectedHigh = percentile.evaluate(99);
        assertEquals(expectedLow, classifier.getLowCutoff(), 40.0);
        assertEquals(expectedHigh, classifier.getHighCutoff(), 40.0);

        double[] estimates = result.getDoubleColumnByName("_OUTLIER");
        for (int i = 0; i < estimates.length; i++) {
            // Estimated outlier count should be near the exact count for the group.
            int exactCount = 0;
            for (double v : groupMetrics.get(i)) {
                if (v < expectedLow || v > expectedHigh) {
                    exactCount++;
                }
            }
            assertEquals(exactCount, estimates[i], 10.0);
        }
    }

    @Test
    public void testConfigure() throws Exception {
        // Start with wrong column names, then configure everything via setters.
        BoundsClassifier classifier = new BoundsClassifier("col1", "col2", "col3", "col4", "col5");
        classifier.setMeanColumnName("mean");
        classifier.setCountColumnName("count");
        classifier.setStdColumnName("std");
        classifier.setMinColumnName("min");
        classifier.setMaxColumnName("max");
        classifier.setIncludeHigh(false);
        classifier.setIncludeLow(true);
        classifier.setOutputColumnName("_OUT");
        classifier.setPercentile(5.0);

        classifier.process(df);
        DataFrame result = classifier.getResults();
        assertEquals(df.getNumRows(), result.getNumRows());

        Percentile percentile = new Percentile();
        percentile.setData(allMetrics);
        double expectedLow = percentile.evaluate(5);
        assertEquals(expectedLow, classifier.getLowCutoff(), 25.0);

        double[] estimates = result.getDoubleColumnByName("_OUT");
        for (int i = 0; i < estimates.length; i++) {
            // Only low-side outliers are counted here (includeHigh is off).
            int exactCount = 0;
            for (double v : groupMetrics.get(i)) {
                if (v < expectedLow) {
                    exactCount++;
                }
            }
            assertEquals(exactCount, estimates[i], 15.0);
        }
    }
}