Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Add BoundsClassifier #215

Open
wants to merge 3 commits into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion core/demo/demo_query.json
Original file line number Diff line number Diff line change
@@ -10,6 +10,6 @@

"summarizer": "apriori",
"attributes": ["location", "version"],
"minRiskRatio": 10.0,
"minRatioMetric": 10.0,
"minSupport": 0.2
}
20 changes: 10 additions & 10 deletions core/demo/sample_cubed.csv
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
location,version,count,mean,std
AUS,v3,150,35.7415333333,4.61047330283
AUS,v4,50,34.0068,4.93254640534
CAN,v1,50,34.0772,5.24978796073
CAN,v2,150,34.9576666667,4.62355719076
CAN,v3,20,8.26286,5.42913454695
RUS,v4,200,35.69215,5.16468641899
UK,v2,100,35.6926,4.55136401355
UK,v3,100,34.4426,5.62178062665
USA,v1,200,34.49175,5.11885885892
,location,version,count,mean,std,min,max
0,AUS,v3,150,35.74153333333334,4.610473302825061,26.28,47.89
1,AUS,v4,50,34.006800000000005,4.932546405338355,22.74,46.61
2,CAN,v1,79,401.20126582278453,472.14713946343613,18.72,1000.76
3,CAN,v2,158,83.85955696202532,212.47244153214433,17.25,1000.77
4,CAN,v3,20,8.26286,5.42913454694596,-0.335,18.85
5,RUS,v4,200,35.69215000000002,5.164686418993188,25.83,50.04
6,UK,v2,100,35.6926,4.551364013550046,21.48,46.49
7,UK,v3,100,34.4426,5.621780626646813,18.75,45.73
8,USA,v1,200,34.49175,5.118858858924315,17.24,45.43
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package edu.stanford.futuredata.macrobase.pipeline;

import edu.stanford.futuredata.macrobase.analysis.classify.ArithmeticClassifier;
import edu.stanford.futuredata.macrobase.analysis.classify.BoundsClassifier;
import edu.stanford.futuredata.macrobase.analysis.classify.CubeClassifier;
import edu.stanford.futuredata.macrobase.analysis.classify.QuantileClassifier;
import edu.stanford.futuredata.macrobase.analysis.summary.Explanation;
@@ -28,6 +29,8 @@ public class CubePipeline implements Pipeline {
private String countColumn;
private String meanColumn;
private String stdColumn;
private String minColumn;
private String maxColumn;
private LinkedHashMap<String, Double> quantileColumns;
private double percentile;
private boolean includeHi;
@@ -44,6 +47,8 @@ public CubePipeline(PipelineConfig conf) {
countColumn = conf.get("countColumn", "count");
meanColumn = conf.get("meanColumn", "mean");
stdColumn = conf.get("stdColumn", "std");
minColumn = conf.get("minColumn", "min");
maxColumn = conf.get("maxColumn", "max");
quantileColumns = conf.get("quantileColumns", new LinkedHashMap<String, Double>());
percentile = conf.get("percentile", 1.0);
includeHi = conf.get("includeHi", true);
@@ -105,6 +110,14 @@ private Map<String, Schema.ColType> getColTypes() throws MacrobaseException {
}
return colTypes;
}
case "bounds": {
colTypes.put(countColumn, Schema.ColType.DOUBLE);
colTypes.put(meanColumn, Schema.ColType.DOUBLE);
colTypes.put(stdColumn, Schema.ColType.DOUBLE);
colTypes.put(minColumn, Schema.ColType.DOUBLE);
colTypes.put(maxColumn, Schema.ColType.DOUBLE);
return colTypes;
}
default:
throw new MacrobaseException("Bad Classifier Name");
}
@@ -128,6 +141,14 @@ private CubeClassifier getClassifier() throws MacrobaseException {
classifier.setIncludeLow(includeLo);
return classifier;
}
case "bounds": {
BoundsClassifier classifier =
new BoundsClassifier(countColumn, meanColumn, stdColumn, minColumn, maxColumn);
classifier.setPercentile(percentile);
classifier.setIncludeHigh(includeHi);
classifier.setIncludeLow(includeLo);
return classifier;
}
default:
throw new MacrobaseException("Bad Classifier Name");
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
package edu.stanford.futuredata.macrobase.analysis.classify;

import edu.stanford.futuredata.macrobase.analysis.classify.stats.WeightedPercentile;
import edu.stanford.futuredata.macrobase.datamodel.DataFrame;

/**
* Classify rows by high / low values based on bounds on the possible number of outliers.
* Bounds are constructed using the mean, standard deviation, minimum and maximum of each row.
* Returns a new dataframe with a column representation of the estimated number of outliers
* for each group, which can be non-integer.
*/
public class BoundsClassifier extends CubeClassifier implements ThresholdClassifier {
    // Parameters
    private String meanColumnName = "mean";
    private String stdColumnName = "std";
    private String minColumnName = "min";
    private String maxColumnName = "max";
    private double percentile = 1.0;
    private boolean includeHigh = true;
    private boolean includeLow = true;

    // Calculated values
    private double lowCutoff;
    private double highCutoff;
    private DataFrame output;

    /**
     * @param countColumnName column holding the number of raw events in each cube row
     * @param meanColumnName  column holding each row's metric mean
     * @param stdColumnName   column holding each row's sample standard deviation
     * @param minColumnName   column holding each row's metric minimum
     * @param maxColumnName   column holding each row's metric maximum
     */
    public BoundsClassifier(String countColumnName, String meanColumnName,
                            String stdColumnName, String minColumnName, String maxColumnName) {
        super(countColumnName);
        this.meanColumnName = meanColumnName;
        this.stdColumnName = stdColumnName;
        this.minColumnName = minColumnName;
        this.maxColumnName = maxColumnName;
    }

    /**
     * Computes the weighted percentile cutoffs over the cube rows, then estimates, per row,
     * a lower bound on the number of outlying events using Chebyshev-style inequalities on
     * the row's mean, standard deviation, min, and max. Appends the estimate (possibly
     * non-integer) as the output column of a copy of {@code input}.
     */
    @Override
    public void process(DataFrame input) {
        double[] means = input.getDoubleColumnByName(meanColumnName);
        double[] counts = input.getDoubleColumnByName(countColumnName);
        double[] stds = input.getDoubleColumnByName(stdColumnName);
        double[] mins = input.getDoubleColumnByName(minColumnName);
        double[] maxs = input.getDoubleColumnByName(maxColumnName);
        int len = means.length;

        WeightedPercentile wp = new WeightedPercentile(counts, means);
        lowCutoff = wp.evaluate(percentile);
        highCutoff = wp.evaluate(100.0 - percentile);

        output = input.copy();
        double[] resultColumn = new double[len];
        for (int i = 0; i < len; i++) {
            double mean = means[i];
            double std = stds[i];
            double count = counts[i];
            double min = mins[i];
            double max = maxs[i];
            double numOutliers = 0.0;
            if (Double.isNaN(std) || std == 0.0) {
                // Only one metric in the group, or all metrics are equal: the whole group
                // lies at its mean, so it is entirely inside or entirely outside a cutoff.
                // BUG FIX: the low-side test previously compared against highCutoff, which
                // counted nearly every group as fully outlying when includeLow was set.
                if ((includeHigh && mean > highCutoff)
                        || (includeLow && mean < lowCutoff)) {
                    numOutliers += count;
                }
            } else {
                // We use lower bounds as the heuristic for the number of outliers; the
                // high-side and low-side cases are mirror images handled by
                // outlierFractionBound (see its javadoc for the three bounds used).
                if (includeHigh) {
                    if (highCutoff >= max) {
                        // no metric in the group can exceed the cutoff
                    } else if (highCutoff < min) {
                        numOutliers += count; // entire group lies above the cutoff
                    } else {
                        numOutliers += count
                                * outlierFractionBound(count, mean, std, max, min, highCutoff);
                    }
                }
                if (includeLow) {
                    if (lowCutoff <= min) {
                        // no metric in the group can fall below the cutoff
                    } else if (lowCutoff > max) {
                        numOutliers += count; // entire group lies below the cutoff
                    } else {
                        numOutliers += count
                                * outlierFractionBound(count, mean, std, min, max, lowCutoff);
                    }
                }
            }
            resultColumn[i] = numOutliers;
        }
        output.addDoubleColumn(outputColumnName, resultColumn);
    }

    /**
     * Lower bound on the fraction of a group's metrics lying beyond {@code cutoff} on the
     * side of the {@code near} extreme ({@code near} = max for the high side, min for the
     * low side; {@code far} is the opposite extreme). Returns the best of three bounds:
     * <ul>
     *   <li>maxStdBound — the true standard deviation can be no greater than the maximum
     *       possible given min/max (an extension of Chebyshev's inequality);</li>
     *   <li>minStdBound — the true standard deviation can be no less than the minimum
     *       possible (Cantelli's inequality); applies only when the mean is already beyond
     *       the cutoff;</li>
     *   <li>markovBound — Markov's inequality using the mean, min, and max.</li>
     * </ul>
     * Callers must ensure {@code cutoff} lies strictly between {@code far} and {@code near}.
     */
    private static double outlierFractionBound(double count, double mean, double std,
                                               double near, double far, double cutoff) {
        // Convert the stored sample standard deviation to the population variance.
        double popVar = std * std * (count - 1) / count;
        double maxStdBound = (popVar + (far - mean) * (cutoff - mean))
                / ((near - cutoff) * (near - far));
        double markovBound = 1 - (near - mean) / (near - cutoff);
        double minStdBound = 0;
        if ((mean - cutoff) * (near - cutoff) > 0) {
            // The mean itself is beyond the cutoff; Cantelli's inequality applies.
            double gap = cutoff - mean;
            minStdBound = (gap * gap) / ((gap * gap) + popVar);
        }
        return Math.max(maxStdBound, Math.max(minStdBound, markovBound));
    }

    @Override
    public DataFrame getResults() {
        return output;
    }

    // Parameter Getters and Setters
    public double getPercentile() {
        return percentile;
    }

    /**
     * @param percentile Cutoff point for high or low values
     * @return this
     */
    public BoundsClassifier setPercentile(double percentile) {
        this.percentile = percentile;
        return this;
    }

    public String getMeanColumnName() {
        return meanColumnName;
    }

    /**
     * @param meanColumnName Which column contains the mean of each row's attribute
     *                       combination.
     * @return this
     */
    public BoundsClassifier setMeanColumnName(String meanColumnName) {
        this.meanColumnName = meanColumnName;
        return this;
    }

    public String getStdColumnName() {
        return stdColumnName;
    }

    /**
     * @param stdColumnName Which column contains the standard deviation of metrics for events
     *                      corresponding to each row's attribute combination. Assumed to contain
     *                      the sample standard deviation.
     * @return this
     */
    public BoundsClassifier setStdColumnName(String stdColumnName) {
        this.stdColumnName = stdColumnName;
        return this;
    }

    public String getMinColumnName() {
        return minColumnName;
    }

    /**
     * @param minColumnName Which column contains the minimum of metrics for events
     *                      corresponding to each row's attribute combination.
     * @return this
     */
    public BoundsClassifier setMinColumnName(String minColumnName) {
        this.minColumnName = minColumnName;
        return this;
    }

    public String getMaxColumnName() {
        return maxColumnName;
    }

    /**
     * @param maxColumnName Which column contains the maximum of metrics for events
     *                      corresponding to each row's attribute combination.
     * @return this
     */
    public BoundsClassifier setMaxColumnName(String maxColumnName) {
        this.maxColumnName = maxColumnName;
        return this;
    }

    public boolean isIncludeHigh() {
        return includeHigh;
    }

    /**
     * @param includeHigh Whether to count high points as outliers.
     * @return this
     */
    public BoundsClassifier setIncludeHigh(boolean includeHigh) {
        this.includeHigh = includeHigh;
        return this;
    }

    public boolean isIncludeLow() {
        return includeLow;
    }

    /**
     * @param includeLow Whether to count low points as outliers
     * @return this
     */
    public BoundsClassifier setIncludeLow(boolean includeLow) {
        this.includeLow = includeLow;
        return this;
    }

    public double getLowCutoff() {
        return lowCutoff;
    }

    public double getHighCutoff() {
        return highCutoff;
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
package edu.stanford.futuredata.macrobase.analysis.classify;

import edu.stanford.futuredata.macrobase.datamodel.DataFrame;
import org.apache.commons.math3.stat.descriptive.rank.Percentile;
import org.junit.Before;
import org.junit.Test;

import java.util.ArrayList;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;

public class BoundsClassifierTest {
    // Fixture: 100 cube rows, each aggregating 101 consecutive integer metrics
    // centered on the row index (row i covers i-50 .. i+50).
    private int numGroups = 100;
    private DataFrame df;
    private double[] allMetrics;
    private List<double[]> groupMetrics;

    @Before
    public void setUp() {
        int groupSize = 101;
        double[] counts = new double[numGroups];
        double[] means = new double[numGroups];
        double[] stds = new double[numGroups];
        double[] mins = new double[numGroups];
        double[] maxs = new double[numGroups];

        allMetrics = new double[numGroups * groupSize];
        groupMetrics = new ArrayList<>();
        int flatIdx = 0;
        for (int i = 0; i < numGroups; i++) {
            double[] group = new double[groupSize];
            for (int g = 0; g < groupSize; g++) {
                double value = i - 50 + g;
                group[g] = value;
                allMetrics[flatIdx++] = value;
            }
            groupMetrics.add(group);
            counts[i] = groupSize;
            means[i] = i;
            mins[i] = i - 50;
            maxs[i] = i + 50;
            // Sample standard deviation of the group's metrics about its mean i.
            double sumSq = 0.0;
            for (double v : group) {
                sumSq += (v - i) * (v - i);
            }
            stds[i] = Math.sqrt(sumSq / (groupSize - 1));
        }

        df = new DataFrame();
        df.addDoubleColumn("count", counts);
        df.addDoubleColumn("mean", means);
        df.addDoubleColumn("std", stds);
        df.addDoubleColumn("min", mins);
        df.addDoubleColumn("max", maxs);
    }

    @Test
    public void testClassify() throws Exception {
        assertEquals(numGroups, df.getNumRows());
        BoundsClassifier classifier = new BoundsClassifier("count", "mean", "std", "min", "max");
        classifier.process(df);
        DataFrame result = classifier.getResults();
        assertEquals(df.getNumRows(), result.getNumRows());
        assertEquals(5, df.getSchema().getNumColumns());
        assertEquals(6, result.getSchema().getNumColumns());

        // The classifier's weighted-percentile cutoffs should be roughly the true
        // percentiles of the raw metrics (loose tolerance: cutoffs come from row means).
        Percentile percentile = new Percentile();
        percentile.setData(allMetrics);
        double expectedLow = percentile.evaluate(1);
        double expectedHigh = percentile.evaluate(99);
        assertEquals(expectedLow, classifier.getLowCutoff(), 40.0);
        assertEquals(expectedHigh, classifier.getHighCutoff(), 40.0);

        double[] estimates = result.getDoubleColumnByName("_OUTLIER");
        for (int i = 0; i < estimates.length; i++) {
            // Estimated outlier count should be near the exact count for the group.
            int exactCount = 0;
            for (double v : groupMetrics.get(i)) {
                if (v < expectedLow || v > expectedHigh) {
                    exactCount++;
                }
            }
            assertEquals(exactCount, estimates[i], 10.0);
        }
    }

    @Test
    public void testConfigure() throws Exception {
        // Start with wrong column names, then configure everything via setters.
        BoundsClassifier classifier = new BoundsClassifier("col1", "col2", "col3", "col4", "col5");
        classifier.setMeanColumnName("mean");
        classifier.setCountColumnName("count");
        classifier.setStdColumnName("std");
        classifier.setMinColumnName("min");
        classifier.setMaxColumnName("max");
        classifier.setIncludeHigh(false);
        classifier.setIncludeLow(true);
        classifier.setOutputColumnName("_OUT");
        classifier.setPercentile(5.0);

        classifier.process(df);
        DataFrame result = classifier.getResults();
        assertEquals(df.getNumRows(), result.getNumRows());

        Percentile percentile = new Percentile();
        percentile.setData(allMetrics);
        double expectedLow = percentile.evaluate(5);
        assertEquals(expectedLow, classifier.getLowCutoff(), 25.0);

        double[] estimates = result.getDoubleColumnByName("_OUT");
        for (int i = 0; i < estimates.length; i++) {
            // Only low-side outliers are counted here (includeHigh is off).
            int exactCount = 0;
            for (double v : groupMetrics.get(i)) {
                if (v < expectedLow) {
                    exactCount++;
                }
            }
            assertEquals(exactCount, estimates[i], 15.0);
        }
    }
}