diff --git a/.gitignore b/.gitignore index 01a0a8ed2..0939a5f86 100644 --- a/.gitignore +++ b/.gitignore @@ -1,8 +1,11 @@ +data/ +conf/ *.class .classpath .project .settings/* *.log +*.csv # Mobile Tools for Java (J2ME) .mtj.tmp/ diff --git a/.travis.yml b/.travis.yml index 2130674ed..96d9cd5e2 100644 --- a/.travis.yml +++ b/.travis.yml @@ -1,5 +1,5 @@ language: java -jdk: oraclejdk8 +jdk: openjdk11 install: mvn install -DskipTests=true -Dgpg.skip=true -Dmaven.javadoc.skip=true -B -V notifications: slack: stanford-futuredata:qmO6Keu8ifOyXHsmSQ97CeLH diff --git a/bin/batch.sh b/bin/batch.sh old mode 100755 new mode 100644 diff --git a/bin/cli.sh b/bin/cli.sh old mode 100755 new mode 100644 diff --git a/bin/frontend.sh b/bin/frontend.sh old mode 100755 new mode 100644 diff --git a/bin/macrobase-sql b/bin/macrobase-sql old mode 100755 new mode 100644 diff --git a/bin/server.sh b/bin/server.sh old mode 100755 new mode 100644 diff --git a/bin/streaming.sh b/bin/streaming.sh old mode 100755 new mode 100644 diff --git a/build.sh b/build.sh old mode 100755 new mode 100644 diff --git a/core/demo/query.sh b/core/demo/query.sh old mode 100755 new mode 100644 diff --git a/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/BasicBatchPipeline.java b/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/BasicBatchPipeline.java index 1aa445dd1..bbc38c3e2 100644 --- a/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/BasicBatchPipeline.java +++ b/core/src/main/java/edu/stanford/futuredata/macrobase/pipeline/BasicBatchPipeline.java @@ -33,6 +33,7 @@ public class BasicBatchPipeline implements Pipeline { private boolean pctileLow; private String predicateStr; private int numThreads; + private int bitmapRatioThreshold; private String summarizerType; private List attributes; @@ -41,6 +42,9 @@ public class BasicBatchPipeline implements Pipeline { private double minRiskRatio; private double meanShiftRatio; + private boolean useFDs; + private int[] 
functionalDependencies; + public BasicBatchPipeline (PipelineConfig conf) { inputURI = conf.get("inputURI"); @@ -71,6 +75,22 @@ public BasicBatchPipeline (PipelineConfig conf) { minRiskRatio = conf.get("minRatioMetric", 3.0); minSupport = conf.get("minSupport", 0.01); numThreads = conf.get("numThreads", Runtime.getRuntime().availableProcessors()); + bitmapRatioThreshold = conf.get("bitmapRatioThreshold", 256); + + + //if FDs are being used, parse them into bitmaps. For now, all FDs must be in the first 31 attributes + useFDs = conf.get("useFDs", false); + if (useFDs) { + ArrayList> rawDependencies = conf.get("functionalDependencies"); + functionalDependencies = new int[attributes.size()]; + for (ArrayList dependency : rawDependencies) { + for (int i : dependency) { + for (int j : dependency) { + if (i != j) functionalDependencies[i] |= (1 << j); + } + } + } + } meanColumn = Optional.ofNullable(conf.get("meanColumn")); meanShiftRatio = conf.get("meanShiftRatio", 1.0); } @@ -131,7 +151,10 @@ public BatchSummarizer getSummarizer(String outlierColumnName) throws MacroBaseE summarizer.setAttributes(attributes); summarizer.setMinSupport(minSupport); summarizer.setMinRatioMetric(minRiskRatio); + summarizer.setBitmapRatioThreshold(bitmapRatioThreshold); summarizer.setNumThreads(numThreads); + summarizer.setFDUsage(useFDs); + summarizer.setFDValues(functionalDependencies); return summarizer; } case "countmeanshift": { diff --git a/lib/genCP.sh b/lib/genCP.sh old mode 100755 new mode 100644 diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/BatchSummarizer.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/BatchSummarizer.java index 5b6199ade..dde548607 100644 --- a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/BatchSummarizer.java +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/BatchSummarizer.java @@ -23,6 +23,8 @@ public abstract class BatchSummarizer implements 
Operator qualityMetricList; List thresholds; - private double[][] globalAggregateCols; + private double[][] globalAggregateCols = null; protected long numEvents = 0; protected long numOutliers = 0; + protected int bitmapRatioThreshold = 256; public abstract List getAggregateNames(); public abstract AggregationOp[] getAggregationOps(); @@ -59,7 +61,7 @@ public void process(DataFrame input) throws Exception { int[][] encoded = getEncoded(input.getStringColsByName(attributes), input); long elapsed = System.currentTimeMillis() - startTime; log.info("Encoded in: {} ms", elapsed); - log.info("Distinct values encoded: {}", encoder.getNextKey() - 1); + log.info("Encoded Categories: {}", encoder.getNextKey() - 1); thresholds = getThresholds(); qualityMetricList = getQualityMetricList(); @@ -80,7 +82,10 @@ public void process(DataFrame input) throws Exception { numThreads, encoder.getBitmap(), encoder.getOutlierList(), - encoder.getIsBitmapEncodedArray() + encoder.getColCardinalities(), + useFDs, + functionalDependencies, + bitmapRatioThreshold ); log.info("Number of results: {}", aplResults.size()); numOutliers = (long)getNumberOutliers(aggregateColumns); @@ -99,8 +104,14 @@ public APLExplanation getResults() { return explanation; } + public void setBitmapRatioThreshold(int bitmapRatioThreshold) { + this.bitmapRatioThreshold = bitmapRatioThreshold; + } + public void setGlobalAggregateCols(double[][] globalAggregateCols) { this.globalAggregateCols = globalAggregateCols; } + + } diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APrioriLinear.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APrioriLinear.java index f466ce1d7..7321bc428 100644 --- a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APrioriLinear.java +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/APrioriLinear.java @@ -7,6 +7,7 @@ import org.slf4j.Logger; import 
org.slf4j.LoggerFactory; import org.roaringbitmap.RoaringBitmap; +import org.w3c.dom.Attr; import java.util.*; import java.util.concurrent.CountDownLatch; @@ -32,6 +33,11 @@ public class APrioriLinear { // Aggregate values for all of the sets we saved private HashMap> savedAggregates; + /** + * @param qualityMetrics A list of all quality metrics for this DIFF + * operation. + * @param thresholds A list of the thresholds for each quality metric. + */ public APrioriLinear( List qualityMetrics, List thresholds @@ -45,6 +51,31 @@ public APrioriLinear( this.savedAggregates = new HashMap<>(3); } + /** + * Use Apriori to compute explanations for a DIFF. + * @param attributes Encoded columns to DIFF over. + * @param aggregateColumns Calculated aggregates for the quality metrics. + * @param aggregationOps Operations used to aggregate the aggregates. + * @param cardinality The total number of encoded attributes. + * @param maxOrder Maximum order of explanations to calculate. + * @param numThreads Number of threads to use. + * @param bitmap Bitmap representation of attributes. Stored as array indexed + * by column and then by outlier/inlier. Each entry in array + * is a map from encoded attribute value to the bitmap + * for that attribute among outliers or inliers. + * @param outlierList A list whose entries are arrays of all attributes in + * each column. + * @param colCardinalities An array containing the number of unique encoded + * attributes in each column. + * @param useFDs A boolean flag indicating whether or not to use functional + * dependency information. + * @param functionalDependencies An array whose entries are masks indicating + * which other columns a column is functionally + * determined by, if any. + * @param bitmapRatioThreshold The maximum product of column cardinalities for which + * a bitmap representation of the columns will be used. + * @return All explanations for the DIFF query. 
+ */ public List explain( final int[][] attributes, double[][] aggregateColumns, @@ -53,9 +84,12 @@ public List explain( int cardinality, final int maxOrder, int numThreads, - HashMap[][] bitmap, + HashMap[][] bitmap, ArrayList[] outlierList, - boolean[] isBitmapEncoded + int[] colCardinalities, + boolean useFDs, + int[] functionalDependencies, + int bitmapRatioThreshold ) { final long beginTime = System.currentTimeMillis(); final int numAggregates = aggregateColumns.length; @@ -63,7 +97,7 @@ public List explain( final int numColumns = attributes[0].length; // Singleton viable sets for quick lookup - boolean[] singleNextArray = new boolean[cardinality]; + boolean[] singleNextArray = new boolean[cardinality];; // Maximum order of explanations. final boolean useIntSetAsArray; @@ -79,7 +113,7 @@ public List explain( // Shard the dataset by rows for the threads, but store it by column for fast processing final int[][][] byThreadAttributesTranspose = new int[numThreads][numColumns][(numRows + numThreads)/numThreads]; - final HashMap[][][] byThreadBitmap = new HashMap[numThreads][numColumns][2]; + final HashMap[][][] byThreadBitmap = new HashMap[numThreads][numColumns][2]; for (int i = 0; i < numThreads; i++) for (int j = 0; j < numColumns; j++) for (int k = 0; k < 2; k++) @@ -91,13 +125,11 @@ public List explain( for (int j = startIndex; j < endIndex; j++) { byThreadAttributesTranspose[threadNum][i][j - startIndex] = attributes[j][i]; } - if (isBitmapEncoded[i]) { + if (colCardinalities[i] < AttributeEncoder.cardinalityThreshold) { for (int j = 0; j < 2; j++) { - for (HashMap.Entry entry : bitmap[i][j].entrySet()) { - RoaringBitmap rr = new RoaringBitmap(); - rr.add((long) startIndex, (long) endIndex); - rr.and(entry.getValue()); - if (rr.getCardinality() > 0) { + for (HashMap.Entry entry : bitmap[i][j].entrySet()) { + ModBitSet rr = entry.getValue().get(startIndex, endIndex); + if (rr.cardinality() > 0) { byThreadBitmap[threadNum][i][j].put(entry.getKey(), rr); } } @@ 
-152,16 +184,18 @@ public List explain( curCandidate = new IntSetAsArray(0); if (curOrderFinal == 1) { for (int colNum = 0; colNum < numColumns; colNum++) { - if (isBitmapEncoded[colNum]) { + // Check whether or not to process using bitmaps + if (colCardinalities[colNum] < AttributeEncoder.cardinalityThreshold) { for (Integer curOutlierCandidate : outlierList[colNum]) { // Require that all order-one candidates have minimum support. if (curOutlierCandidate == AttributeEncoder.noSupport) continue; int outlierCount = 0, inlierCount = 0; + // Calculate aggregate values using bitmaps. if (byThreadBitmap[curThreadNum][colNum][1].containsKey(curOutlierCandidate)) - outlierCount = byThreadBitmap[curThreadNum][colNum][1].get(curOutlierCandidate).getCardinality(); + outlierCount = byThreadBitmap[curThreadNum][colNum][1].get(curOutlierCandidate).cardinality(); if (byThreadBitmap[curThreadNum][colNum][0].containsKey(curOutlierCandidate)) - inlierCount = byThreadBitmap[curThreadNum][colNum][0].get(curOutlierCandidate).getCardinality(); + inlierCount = byThreadBitmap[curThreadNum][colNum][0].get(curOutlierCandidate).cardinality(); // Cascade to arrays if necessary, but otherwise pack attributes into longs. if (useIntSetAsArray) { curCandidate = new IntSetAsArray(curOutlierCandidate); @@ -173,6 +207,7 @@ public List explain( } } else { int[] curColumnAttributes = byThreadAttributesTranspose[curThreadNum][colNum]; + // Calculate and update aggregate values via iteration, without bitmaps. for (int rowNum = startIndex; rowNum < endIndex; rowNum++) { // Require that all order-one candidates have minimum support. 
if (curColumnAttributes[rowNum - startIndex] == AttributeEncoder.noSupport) @@ -192,14 +227,21 @@ public List explain( for (int colNumOne = 0; colNumOne < numColumns; colNumOne++) { int[] curColumnOneAttributes = byThreadAttributesTranspose[curThreadNum][colNumOne]; for (int colNumTwo = colNumOne + 1; colNumTwo < numColumns; colNumTwo++) { + //if FDs are enabled, and these two attribute cols are FDs, skip + if (useFDs && ((functionalDependencies[colNumOne] & (1< explain( for (int colNumOne = 0; colNumOne < numColumns; colNumOne++) { int[] curColumnOneAttributes = byThreadAttributesTranspose[curThreadNum][colNumOne % numColumns]; for (int colNumTwo = colNumOne + 1; colNumTwo < numColumns; colNumTwo++) { + //if FD on and attributes 1 and 2 are FDs, skip + if (useFDs && ((functionalDependencies[colNumOne] & (1< explain( } else { throw new MacroBaseInternalError("High Order not supported"); } - log.info("Time spent in Thread {} in order {}: {} ms", - curThreadNum, curOrderFinal, System.currentTimeMillis() - startTime); doneSignal.countDown(); }; // Run numThreads lambdas in separate threads @@ -326,11 +379,10 @@ public List explain( singleNextArray[i.getFirst()] = true; } } + log.info("Time spent in order {}: {} ms", curOrderFinal, System.currentTimeMillis() - startTime); } - log.info("Time spent in APriori: {} ms", System.currentTimeMillis() - beginTime); - - + log.info("Time spent in APriori: {} ms", System.currentTimeMillis() - beginTime); List results = new ArrayList<>(); for (int curOrder: savedAggregates.keySet()) { Map curOrderSavedAggregates = savedAggregates.get(curOrder); @@ -355,22 +407,22 @@ public List explain( * @return Boolean */ private boolean allPairsValid(IntSet curCandidate, - HashSet o2Candidates) { - IntSet subPair; + HashSet o2Candidates) { + IntSet subPair; + subPair = new IntSetAsArray( + curCandidate.getFirst(), + curCandidate.getSecond()); + if (o2Candidates.contains(subPair)) { subPair = new IntSetAsArray( - curCandidate.getFirst(), - 
curCandidate.getSecond()); + curCandidate.getSecond(), + curCandidate.getThird()); if (o2Candidates.contains(subPair)) { subPair = new IntSetAsArray( - curCandidate.getSecond(), + curCandidate.getFirst(), curCandidate.getThird()); - if (o2Candidates.contains(subPair)) { - subPair = new IntSetAsArray( - curCandidate.getFirst(), - curCandidate.getThird()); - return o2Candidates.contains(subPair); - } + return o2Candidates.contains(subPair); } + } return false; } } diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/BitmapHelperFunctions.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/BitmapHelperFunctions.java index 407d3d249..49e75b6ee 100644 --- a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/BitmapHelperFunctions.java +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/aplinear/BitmapHelperFunctions.java @@ -2,38 +2,62 @@ import edu.stanford.futuredata.macrobase.analysis.summary.util.*; import edu.stanford.futuredata.macrobase.analysis.summary.util.qualitymetrics.AggregationOp; -import org.roaringbitmap.RoaringBitmap; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; +import java.util.*; +import edu.stanford.futuredata.macrobase.analysis.summary.util.ModBitSet; public class BitmapHelperFunctions { + /** + * Update aggregates during Apriori. + * @param thisThreadSetAggregates A map from itemsets of attributes to arrays of aggregates. + * @param curCandidate An itemset of attributes. + * @param aggregationOps Aggregation functions used to perform updates. + * @param aggregateVal The array of aggregates to be aggregated onto the entry for curCandidate + * @param numAggregates The length of aggregateVal. 
+ */ public static void updateAggregates(FastFixedHashTable thisThreadSetAggregates, IntSet curCandidate, AggregationOp[] aggregationOps, - double[] val, int numAggregates) { + double[] aggregateVal, int numAggregates) { double[] candidateVal = thisThreadSetAggregates.get(curCandidate); if (candidateVal == null) { thisThreadSetAggregates.put(curCandidate, - Arrays.copyOf(val, numAggregates)); + Arrays.copyOf(aggregateVal, numAggregates)); } else { for (int a = 0; a < numAggregates; a++) { AggregationOp curOp = aggregationOps[a]; - candidateVal[a] = curOp.combine(candidateVal[a], val[a]); + candidateVal[a] = curOp.combine(candidateVal[a], aggregateVal[a]); } } } /*********************** All Order-2 helper methods ***********************/ - // Two Normal columns + /** + * Iterate through two columns and update the map from attributes to aggregates + * with all pairs of attributes found during iteration. Process columns + * without using bitmap representations. + * @param thisThreadSetAggregates A map from itemsets of attributes to arrays of aggregates. + * @param curColumnOneAttributes The first column of attributes. + * @param curColumnTwoAttributes The second column of attributes. + * @param aggregationOps Aggregation functions used to perform updates. + * @param singleNextArray A list of supported singleton attributes. + * @param startIndex Where to begin iteration in the columns. + * @param endIndex Where to end iteration in the columns. Only values between + * startIndex and endIndex are considered. + * @param useIntSetAsArray Whether candidates are to be stored in packed longs + * or arrays of integers. + * @param curCandidate A dummy IntSet used as a single-entry pool to speed up computation + * by avoiding IntSet allocation. + * @param aRows An array of aggregate values. + * @param numAggregates The length of a row in aRows. 
+ */ public static void allTwoNormal(FastFixedHashTable thisThreadSetAggregates, - int[] curColumnOneAttributes, int[] curColumnTwoAttributes, - AggregationOp[] aggregationOps, boolean[] singleNextArray, - int startIndex, int endIndex, - boolean useIntSetAsArray, IntSet curCandidate, - double[][] aRows, int numAggregates) { + int[] curColumnOneAttributes, int[] curColumnTwoAttributes, + AggregationOp[] aggregationOps, boolean[] singleNextArray, + int startIndex, int endIndex, + boolean useIntSetAsArray, IntSet curCandidate, + double[][] aRows, int numAggregates) { for (int rowNum = startIndex; rowNum < endIndex; rowNum++) { int rowNumInCol = rowNum - startIndex; // Only examine a pair if both its members have minimum support. @@ -54,13 +78,33 @@ public static void allTwoNormal(FastFixedHashTable thisThreadSetAggregates, } } - // Two bitmap columns + /** + * Process two columns and update the map from attributes to aggregates + * with all pairs of attributes found during iteration. Process columns + * using bitmap representations. + * @param thisThreadSetAggregates A map from itemsets of attributes to arrays of aggregates. + * @param outlierList A list whose entries are arrays of all attributes in + * each column. + * @param aggregationOps Aggregation functions used to perform updates. + * @param singleNextArray A list of supported singleton attributes. + * @param byThreadBitmap Bitmap representation of attributes. Stored as array indexed + * by column and then by outlier/inlier. Each entry in array + * is a map from encoded attribute value to the bitmap + * for that attribute among outliers or inliers. + * @param colNumOne The first column to process. + * @param colNumTwo The second column to process. + * @param useIntSetAsArray Whether candidates are to be stored in packed longs + * or arrays of integers. + * @param curCandidate A dummy IntSet used as a single-entry pool to speed up computation + * by avoiding IntSet allocation. 
+ * @param numAggregates The length of a row in aRows. + */ public static void allTwoBitmap(FastFixedHashTable thisThreadSetAggregates, - ArrayList[] outlierList, - AggregationOp[] aggregationOps, boolean[] singleNextArray, - HashMap[][] byThreadBitmap, - int colNumOne, int colNumTwo, - boolean useIntSetAsArray, IntSet curCandidate, int numAggregates) { + ArrayList[] outlierList, + AggregationOp[] aggregationOps, boolean[] singleNextArray, + HashMap[][] byThreadBitmap, + int colNumOne, int colNumTwo, + boolean useIntSetAsArray, IntSet curCandidate, int numAggregates) { for (Integer curCandidateOne : outlierList[colNumOne]) { if (curCandidateOne == AttributeEncoder.noSupport || !singleNextArray[curCandidateOne]) continue; @@ -75,13 +119,15 @@ public static void allTwoBitmap(FastFixedHashTable thisThreadSetAggregates, } int outlierCount = 0, inlierCount = 0; if (byThreadBitmap[colNumOne][1].containsKey(curCandidateOne) && - byThreadBitmap[colNumTwo][1].containsKey(curCandidateTwo)) - outlierCount = RoaringBitmap.andCardinality(byThreadBitmap[colNumOne][1].get(curCandidateOne), + byThreadBitmap[colNumTwo][1].containsKey(curCandidateTwo)) { + outlierCount = ModBitSet.andCardinality(byThreadBitmap[colNumOne][1].get(curCandidateOne), byThreadBitmap[colNumTwo][1].get(curCandidateTwo)); + } if (byThreadBitmap[colNumOne][0].containsKey(curCandidateOne) && - byThreadBitmap[colNumTwo][0].containsKey(curCandidateTwo)) - inlierCount = RoaringBitmap.andCardinality(byThreadBitmap[colNumOne][0].get(curCandidateOne), + byThreadBitmap[colNumTwo][0].containsKey(curCandidateTwo)) { + inlierCount = ModBitSet.andCardinality(byThreadBitmap[colNumOne][0].get(curCandidateOne), byThreadBitmap[colNumTwo][0].get(curCandidateTwo)); + } updateAggregates(thisThreadSetAggregates, curCandidate, aggregationOps, new double[]{outlierCount, outlierCount + inlierCount}, numAggregates); } @@ -89,15 +135,34 @@ public static void allTwoBitmap(FastFixedHashTable thisThreadSetAggregates, } 
/*********************** All Order-3 helper methods ***********************/ - + /** + * Iterate through three columns and update the map from attributes to aggregates + * with all triples of attributes found during iteration. Process columns + * without using bitmap representations. + * @param thisThreadSetAggregates A map from itemsets of attributes to arrays of aggregates. + * @param curColumnOneAttributes The first column of attributes. + * @param curColumnTwoAttributes The second column of attributes. + * @param curColumnThreeAttributes The third column of attributes. + * @param aggregationOps Aggregation functions used to perform updates. + * @param singleNextArray A list of supported singleton attributes. + * @param startIndex Where to begin iteration in the columns. + * @param endIndex Where to end iteration in the columns. Only values between + * startIndex and endIndex are considered. + * @param useIntSetAsArray Whether candidates are to be stored in packed longs + * or arrays of integers. + * @param curCandidate A dummy IntSet used as a single-entry pool to speed up computation + * by avoiding IntSet allocation. + * @param aRows An array of aggregate values. + * @param numAggregates The length of a row in aRows. 
+ */ // All Three Normal or All Three Bitmap public static void allThreeNormal(FastFixedHashTable thisThreadSetAggregates, - int[] curColumnOneAttributes, int[] curColumnTwoAttributes, - int[] curColumnThreeAttributes, - AggregationOp[] aggregationOps, boolean[] singleNextArray, - int startIndex, int endIndex, - boolean useIntSetAsArray, IntSet curCandidate, - double[][] aRows, int numAggregates) { + int[] curColumnOneAttributes, int[] curColumnTwoAttributes, + int[] curColumnThreeAttributes, + AggregationOp[] aggregationOps, boolean[] singleNextArray, + int startIndex, int endIndex, + boolean useIntSetAsArray, IntSet curCandidate, + double[][] aRows, int numAggregates) { for (int rowNum = startIndex; rowNum < endIndex; rowNum++) { int rowNumInCol = rowNum - startIndex; // Only construct a triple if all its singleton members have minimum support. @@ -123,13 +188,34 @@ public static void allThreeNormal(FastFixedHashTable thisThreadSetAggregates, updateAggregates(thisThreadSetAggregates, curCandidate, aggregationOps, aRows[rowNum], numAggregates); } } - + /** + * Process three columns and update the map from attributes to aggregates + * with all triples of attributes found during iteration. Process columns + * using bitmap representations. + * @param thisThreadSetAggregates A map from itemsets of attributes to arrays of aggregates. + * @param outlierList A list whose entries are arrays of all attributes in + * each column. + * @param aggregationOps Aggregation functions used to perform updates. + * @param singleNextArray A list of supported singleton attributes. + * @param byThreadBitmap Bitmap representation of attributes. Stored as array indexed + * by column and then by outlier/inlier. Each entry in array + * is a map from encoded attribute value to the bitmap + * for that attribute among outliers or inliers. + * @param colNumOne The first column to process. + * @param colNumTwo The second column to process. + * @param colNumThree The third column to process. 
+ * @param useIntSetAsArray Whether candidates are to be stored in packed longs + * or arrays of integers. + * @param curCandidate A dummy IntSet used as a single-entry pool to speed up computation + * by avoiding IntSet allocation. + * @param numAggregates The length of a row in aRows. + */ public static void allThreeBitmap(FastFixedHashTable thisThreadSetAggregates, - ArrayList[] outlierList, - AggregationOp[] aggregationOps, boolean[] singleNextArray, - HashMap[][] byThreadBitmap, - int colNumOne, int colNumTwo, int colNumThree, - boolean useIntSetAsArray, IntSet curCandidate, int numAggregates) { + ArrayList[] outlierList, + AggregationOp[] aggregationOps, boolean[] singleNextArray, + HashMap[][] byThreadBitmap, + int colNumOne, int colNumTwo, int colNumThree, + boolean useIntSetAsArray, IntSet curCandidate, int numAggregates) { for (Integer curCandidateOne : outlierList[colNumOne]) { if (curCandidateOne == AttributeEncoder.noSupport || !singleNextArray[curCandidateOne]) @@ -157,17 +243,15 @@ public static void allThreeBitmap(FastFixedHashTable thisThreadSetAggregates, if (byThreadBitmap[colNumOne][1].containsKey(curCandidateOne) && byThreadBitmap[colNumTwo][1].containsKey(curCandidateTwo) && byThreadBitmap[colNumThree][1].containsKey(curCandidateThree)) { - outlierCount = RoaringBitmap.andCardinality( - RoaringBitmap.and(byThreadBitmap[colNumOne][1].get(curCandidateOne), - byThreadBitmap[colNumTwo][1].get(curCandidateTwo)), + outlierCount = ModBitSet.andCardinality(byThreadBitmap[colNumOne][1].get(curCandidateOne), + byThreadBitmap[colNumTwo][1].get(curCandidateTwo), byThreadBitmap[colNumThree][1].get(curCandidateThree)); } if (byThreadBitmap[colNumOne][0].containsKey(curCandidateOne) && byThreadBitmap[colNumTwo][0].containsKey(curCandidateTwo) && byThreadBitmap[colNumThree][0].containsKey(curCandidateThree)) { - inlierCount = RoaringBitmap.andCardinality( - RoaringBitmap.and(byThreadBitmap[colNumOne][0].get(curCandidateOne), - 
byThreadBitmap[colNumTwo][0].get(curCandidateTwo)), + inlierCount = ModBitSet.andCardinality(byThreadBitmap[colNumOne][0].get(curCandidateOne), + byThreadBitmap[colNumTwo][0].get(curCandidateTwo), byThreadBitmap[colNumThree][0].get(curCandidateThree)); } diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/AttributeEncoder.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/AttributeEncoder.java index 195cda416..1ff1c4410 100644 --- a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/AttributeEncoder.java +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/AttributeEncoder.java @@ -1,14 +1,8 @@ package edu.stanford.futuredata.macrobase.analysis.summary.util; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.List; -import java.util.Map; -import java.util.Set; +import java.util.*; import java.util.stream.Collectors; -import org.roaringbitmap.RoaringBitmap; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -19,11 +13,10 @@ * column values. */ public class AttributeEncoder { - private static final Logger log = LoggerFactory.getLogger("AttributeEncoder"); - + private Logger log = LoggerFactory.getLogger("AttributeEncoder"); // An encoding for values which do not satisfy the minimum support threshold in encodeAttributesWithSupport. 
public static int noSupport = Integer.MAX_VALUE; - private final int cardinalityThreshold = 5; + public static int cardinalityThreshold = 128; private HashMap> encoder; private int nextKey; @@ -31,9 +24,9 @@ public class AttributeEncoder { private HashMap valueDecoder; private HashMap columnDecoder; private List colNames; - private HashMap[][] bitmap; + private HashMap[][] bitmap; + private int[] colCardinalities; private ArrayList outlierList[]; - private boolean isBitmapEncoded[]; public AttributeEncoder() { encoder = new HashMap<>(); @@ -50,30 +43,29 @@ public void setColumnNames(List colNames) { public String decodeColumnName(int i) {return colNames.get(columnDecoder.get(i));} public String decodeValue(int i) {return valueDecoder.get(i);} public HashMap getColumnDecoder() {return columnDecoder;} - public HashMap[][] getBitmap() {return bitmap;} + public HashMap[][] getBitmap() {return bitmap;} public ArrayList[] getOutlierList() {return outlierList;} - public boolean[] getIsBitmapEncodedArray() {return isBitmapEncoded;} + public int[] getColCardinalities() {return colCardinalities;} /** - * Encodes columns giving each value which satisfies a minimum support threshold a key - * equal to its rank among all values which satisfy that threshold (so the single most common - * value has key 1, the next has key 2, and so on). Encode all values not satisfying the threshold - * as AttributeEncoder.noSupport. - * @param columns Columns to be encoded. - * @param minSupport Minimum support to be satisfied. - * @param outlierColumn The ith value in this array is the number of outliers whose attributes are those of - * row i of columns. - * @return A two-dimensional array of encoded values. + * Encode as integers all attributes satisfying a minimum support threshold. Also + * encode columns of attributes as bitmaps if their cardinalities are sufficiently + * low. + * @param columns A list of columns of attributes. 
+ * @param minSupport The minimal support an attribute must have to be encoded. + * @param outlierColumn A column indicating whether a row of attributes is an inlier + * or outlier. + * @param useBitmaps Whether to encode any columns as bitmaps. + * @return The encoded matrix of attributes, stored as an array of arrays. */ public int[][] encodeAttributesWithSupport(List columns, double minSupport, - double[] outlierColumn, boolean useBitmaps) { + double[] outlierColumn, boolean useBitmaps) { if (columns.isEmpty()) { return new int[0][0]; } int numColumns = columns.size(); int numRows = columns.get(0).length; - log.info("numValuesEncoded: {}", numColumns*numRows); for (int i = 0; i < numColumns; i++) { if (!encoder.containsKey(i)) { @@ -126,7 +118,7 @@ public int[][] encodeAttributesWithSupport(List columns, double minSup outlierList = new ArrayList[numColumns]; for (int i = 0; i < numColumns; i++) outlierList[i] = new ArrayList<>(); - isBitmapEncoded = new boolean[numColumns]; + colCardinalities = new int[numColumns]; for (int colIdx = 0; colIdx < numColumns; colIdx++) { Map curColEncoder = encoder.get(colIdx); @@ -157,27 +149,43 @@ public int[][] encodeAttributesWithSupport(List columns, double minSup outlierList[colIdx].add(curKey); } } - if (useBitmaps && outlierList[colIdx].size() < cardinalityThreshold) { - isBitmapEncoded[colIdx] = true; + colCardinalities[colIdx] = outlierList[colIdx].size(); + if (!useBitmaps) + colCardinalities[colIdx] = cardinalityThreshold + 1; + } + log.info("Column cardinalities: {}", Arrays.toString(colCardinalities)); + // Encode the bitmaps. Store bitmaps as an array indexed first + // by column and then by outlier/inlier. Each entry in array + // is a map from encoded attribute value to the bitmap + // for that attribute among outliers or inliers. 
+ for (int colIdx = 0; colIdx < numColumns; colIdx++) { + Map curColEncoder = encoder.get(colIdx); + String[] curCol = columns.get(colIdx); + if (useBitmaps && colCardinalities[colIdx] < cardinalityThreshold) { for (int rowIdx = 0; rowIdx < numRows; rowIdx++) { String colVal = curCol[rowIdx]; int oidx = (outlierColumn[rowIdx] > 0.0) ? 1 : 0; //1 = outlier, 0 = inlier int curKey = curColEncoder.get(colVal); if (curKey != noSupport) { if (bitmap[colIdx][oidx].containsKey(curKey)) { - bitmap[colIdx][oidx].get(curKey).add(rowIdx); + bitmap[colIdx][oidx].get(curKey).set(rowIdx); } else { - bitmap[colIdx][oidx].put(curKey, RoaringBitmap.bitmapOf(rowIdx)); + bitmap[colIdx][oidx].put(curKey, new ModBitSet()); + bitmap[colIdx][oidx].get(curKey).set(rowIdx); } } } } } - log.info("Bitmap-encoded columns: {}", Arrays.toString(isBitmapEncoded)); - return encodedAttributes; } + /** + * Encode as integers all attribute strings. + * @param columns A list of attribute strings from each column of the original + * dataset. + * @return A matrix of encoded attributes, stored as an array of arrays. + */ public int[][] encodeAttributesAsArray(List columns) { if (columns.isEmpty()) { return new int[0][0]; @@ -186,18 +194,18 @@ public int[][] encodeAttributesAsArray(List columns) { int numColumns = columns.size(); int numRows = columns.get(0).length; - // No columns are bitmap encoded, all are false. 
- isBitmapEncoded = new boolean[numColumns]; - for (int i = 0; i < numColumns; i++) { if (!encoder.containsKey(i)) { encoder.put(i, new HashMap<>()); } } + colCardinalities = new int[numColumns]; + for (int i = 0; i < numColumns; i++) + colCardinalities[i] = cardinalityThreshold + 1; + int[][] encodedAttributes = new int[numRows][numColumns]; - // noinspection Duplicates for (int colIdx = 0; colIdx < numColumns; colIdx++) { Map curColEncoder = encoder.get(colIdx); String[] curCol = columns.get(colIdx); @@ -217,44 +225,12 @@ public int[][] encodeAttributesAsArray(List columns) { return encodedAttributes; } - public int[][] encodeAttributesByColumn(List columns) { - if (columns.isEmpty()) { - log.info("numValuesEncoded: 0"); - return new int[0][0]; - } - - int numColumns = columns.size(); - int numRows = columns.get(0).length; - - for (int i = 0; i < numColumns; i++) { - if (!encoder.containsKey(i)) { - encoder.put(i, new HashMap<>()); - } - } - - int[][] encodedAttributes = new int[numColumns][numRows]; - log.info("numValuesEncoded: {}", numColumns*numRows); - - // noinspection Duplicates - for (int colIdx = 0; colIdx < numColumns; colIdx++) { - Map curColEncoder = encoder.get(colIdx); - String[] curCol = columns.get(colIdx); - for (int rowIdx = 0; rowIdx < numRows; rowIdx++) { - String colVal = curCol[rowIdx]; - if (!curColEncoder.containsKey(colVal)) { - curColEncoder.put(colVal, nextKey); - valueDecoder.put(nextKey, colVal); - columnDecoder.put(nextKey, colIdx); - nextKey++; - } - int curKey = curColEncoder.get(colVal); - encodedAttributes[colIdx][rowIdx] = curKey; - } - } - - return encodedAttributes; - } - + /** + * Encode as integers all attribute strings. + * @param columns A list of attribute strings from each column of the original + * dataset. + * @return A matrix of encoded attributes, stored as a list of arrays. 
+ */ public List encodeAttributes(List columns) { if (columns.isEmpty()) { return new ArrayList<>(); diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/ModBitSet.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/ModBitSet.java new file mode 100644 index 000000000..3715eb93c --- /dev/null +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/summary/util/ModBitSet.java @@ -0,0 +1,1271 @@ +package edu.stanford.futuredata.macrobase.analysis.summary.util; + +import java.io.*; +import java.nio.ByteBuffer; +import java.nio.ByteOrder; +import java.nio.LongBuffer; +import java.util.*; +import java.util.stream.IntStream; +import java.util.stream.StreamSupport; + +/** + * This class is based on the Java BitSet implementation. We add one + * function internally to the class, andCardinality. This function + * returns the cardinality of the AND of bitmaps, calculating it + * without materializing the bitmaps. That optimization is very + * important to Apriori performance. + */ + +/** + * This class implements a vector of bits that grows as needed. Each + * component of the bit set has a {@code boolean} value. The + * bits of a {@code ModBitSet} are indexed by nonnegative integers. + * Individual indexed bits can be examined, set, or cleared. One + * {@code ModBitSet} may be used to modify the contents of another + * {@code ModBitSet} through logical AND, logical inclusive OR, and + * logical exclusive OR operations. + * + *

By default, all bits in the set initially have the value + * {@code false}. + * + *

Every bit set has a current size, which is the number of bits + * of space currently in use by the bit set. Note that the size is + * related to the implementation of a bit set, so it may change with + * implementation. The length of a bit set relates to logical length + * of a bit set and is defined independently of implementation. + * + *

Unless otherwise noted, passing a null parameter to any of the + * methods in a {@code ModBitSet} will result in a + * {@code NullPointerException}. + * + *

A {@code ModBitSet} is not safe for multithreaded use without + * external synchronization. + * + * @author Arthur van Hoff + * @author Michael McCloskey + * @author Martin Buchholz + * @since JDK1.0 + */ +public class ModBitSet implements Cloneable, java.io.Serializable { + /* + * BitSets are packed into arrays of "words." Currently a word is + * a long, which consists of 64 bits, requiring 6 address bits. + * The choice of word size is determined purely by performance concerns. + */ + private final static int ADDRESS_BITS_PER_WORD = 6; + private final static int BITS_PER_WORD = 1 << ADDRESS_BITS_PER_WORD; + private final static int BIT_INDEX_MASK = BITS_PER_WORD - 1; + + /* Used to shift left or right for a partial word mask */ + private static final long WORD_MASK = 0xffffffffffffffffL; + + /** + * @serialField bits long[] + * + * The bits in this ModBitSet. The ith bit is stored in bits[i/64] at + * bit position i % 64 (where bit position 0 refers to the least + * significant bit and 63 refers to the most significant bit). + */ + private static final ObjectStreamField[] serialPersistentFields = { + new ObjectStreamField("bits", long[].class), + }; + + /** + * The internal field corresponding to the serialField "bits". + */ + private long[] words; + + /** + * The number of words in the logical size of this ModBitSet. + */ + private transient int wordsInUse = 0; + + /** + * Whether the size of "words" is user-specified. If so, we assume + * the user knows what he's doing and try harder to preserve it. + */ + private transient boolean sizeIsSticky = false; + + /* use serialVersionUID from JDK 1.0.2 for interoperability */ + private static final long serialVersionUID = 7997698588986878753L; + + /** + * Given a bit index, return word index containing it. + */ + private static int wordIndex(int bitIndex) { + return bitIndex >> ADDRESS_BITS_PER_WORD; + } + + /** + * Every public method must preserve these invariants. 
+ */ + private void checkInvariants() { + assert(wordsInUse == 0 || words[wordsInUse - 1] != 0); + assert(wordsInUse >= 0 && wordsInUse <= words.length); + assert(wordsInUse == words.length || words[wordsInUse] == 0); + } + + /** + * Sets the field wordsInUse to the logical size in words of the bit set. + * WARNING:This method assumes that the number of words actually in use is + * less than or equal to the current value of wordsInUse! + */ + private void recalculateWordsInUse() { + // Traverse the bitset until a used word is found + int i; + for (i = wordsInUse-1; i >= 0; i--) + if (words[i] != 0) + break; + + wordsInUse = i+1; // The new logical size + } + + /** + * Creates a new bit set. All bits are initially {@code false}. + */ + public ModBitSet() { + initWords(BITS_PER_WORD); + sizeIsSticky = false; + } + + /** + * Creates a bit set whose initial size is large enough to explicitly + * represent bits with indices in the range {@code 0} through + * {@code nbits-1}. All bits are initially {@code false}. + * + * @param nbits the initial size of the bit set + * @throws NegativeArraySizeException if the specified initial size + * is negative + */ + public ModBitSet(int nbits) { + // nbits can't be negative; size 0 is OK + if (nbits < 0) + throw new NegativeArraySizeException("nbits < 0: " + nbits); + + initWords(nbits); + sizeIsSticky = true; + } + + private void initWords(int nbits) { + words = new long[wordIndex(nbits-1) + 1]; + } + + /** + * Creates a bit set using words as the internal representation. + * The last word (if there is one) must be non-zero. + */ + private ModBitSet(long[] words) { + this.words = words; + this.wordsInUse = words.length; + checkInvariants(); + } + + /** + * Returns a new bit set containing all the bits in the given long array. + * + *

More precisely, + *
{@code ModBitSet.valueOf(longs).get(n) == ((longs[n/64] & (1L<<(n%64))) != 0)} + *
for all {@code n < 64 * longs.length}. + * + *

This method is equivalent to + * {@code ModBitSet.valueOf(LongBuffer.wrap(longs))}. + * + * @param longs a long array containing a little-endian representation + * of a sequence of bits to be used as the initial bits of the + * new bit set + * @return a {@code ModBitSet} containing all the bits in the long array + * @since 1.7 + */ + public static ModBitSet valueOf(long[] longs) { + int n; + for (n = longs.length; n > 0 && longs[n - 1] == 0; n--) + ; + return new ModBitSet(Arrays.copyOf(longs, n)); + } + + /** + * Returns a new bit set containing all the bits in the given long + * buffer between its position and limit. + * + *

More precisely, + *
{@code ModBitSet.valueOf(lb).get(n) == ((lb.get(lb.position()+n/64) & (1L<<(n%64))) != 0)} + *
for all {@code n < 64 * lb.remaining()}. + * + *

The long buffer is not modified by this method, and no + * reference to the buffer is retained by the bit set. + * + * @param lb a long buffer containing a little-endian representation + * of a sequence of bits between its position and limit, to be + * used as the initial bits of the new bit set + * @return a {@code ModBitSet} containing all the bits in the buffer in the + * specified range + * @since 1.7 + */ + public static ModBitSet valueOf(LongBuffer lb) { + lb = lb.slice(); + int n; + for (n = lb.remaining(); n > 0 && lb.get(n - 1) == 0; n--) + ; + long[] words = new long[n]; + lb.get(words); + return new ModBitSet(words); + } + + /** + * Returns a new bit set containing all the bits in the given byte array. + * + *

More precisely, + *
{@code ModBitSet.valueOf(bytes).get(n) == ((bytes[n/8] & (1<<(n%8))) != 0)} + *
for all {@code n < 8 * bytes.length}. + * + *

This method is equivalent to + * {@code ModBitSet.valueOf(ByteBuffer.wrap(bytes))}. + * + * @param bytes a byte array containing a little-endian + * representation of a sequence of bits to be used as the + * initial bits of the new bit set + * @return a {@code ModBitSet} containing all the bits in the byte array + * @since 1.7 + */ + public static ModBitSet valueOf(byte[] bytes) { + return ModBitSet.valueOf(ByteBuffer.wrap(bytes)); + } + + /** + * Returns a new bit set containing all the bits in the given byte + * buffer between its position and limit. + * + *

More precisely, + *
{@code ModBitSet.valueOf(bb).get(n) == ((bb.get(bb.position()+n/8) & (1<<(n%8))) != 0)} + *
for all {@code n < 8 * bb.remaining()}. + * + *

The byte buffer is not modified by this method, and no + * reference to the buffer is retained by the bit set. + * + * @param bb a byte buffer containing a little-endian representation + * of a sequence of bits between its position and limit, to be + * used as the initial bits of the new bit set + * @return a {@code ModBitSet} containing all the bits in the buffer in the + * specified range + * @since 1.7 + */ + public static ModBitSet valueOf(ByteBuffer bb) { + bb = bb.slice().order(ByteOrder.LITTLE_ENDIAN); + int n; + for (n = bb.remaining(); n > 0 && bb.get(n - 1) == 0; n--) + ; + long[] words = new long[(n + 7) / 8]; + bb.limit(n); + int i = 0; + while (bb.remaining() >= 8) + words[i++] = bb.getLong(); + for (int remaining = bb.remaining(), j = 0; j < remaining; j++) + words[i] |= (bb.get() & 0xffL) << (8 * j); + return new ModBitSet(words); + } + + /** + * Returns a new byte array containing all the bits in this bit set. + * + *

More precisely, if + *
{@code byte[] bytes = s.toByteArray();} + *
then {@code bytes.length == (s.length()+7)/8} and + *
{@code s.get(n) == ((bytes[n/8] & (1<<(n%8))) != 0)} + *
for all {@code n < 8 * bytes.length}. + * + * @return a byte array containing a little-endian representation + * of all the bits in this bit set + * @since 1.7 + */ + public byte[] toByteArray() { + int n = wordsInUse; + if (n == 0) + return new byte[0]; + int len = 8 * (n-1); + for (long x = words[n - 1]; x != 0; x >>>= 8) + len++; + byte[] bytes = new byte[len]; + ByteBuffer bb = ByteBuffer.wrap(bytes).order(ByteOrder.LITTLE_ENDIAN); + for (int i = 0; i < n - 1; i++) + bb.putLong(words[i]); + for (long x = words[n - 1]; x != 0; x >>>= 8) + bb.put((byte) (x & 0xff)); + return bytes; + } + + /** + * Returns a new long array containing all the bits in this bit set. + * + *

More precisely, if + *
{@code long[] longs = s.toLongArray();} + *
then {@code longs.length == (s.length()+63)/64} and + *
{@code s.get(n) == ((longs[n/64] & (1L<<(n%64))) != 0)} + *
for all {@code n < 64 * longs.length}. + * + * @return a long array containing a little-endian representation + * of all the bits in this bit set + * @since 1.7 + */ + public long[] toLongArray() { + return Arrays.copyOf(words, wordsInUse); + } + + /** + * Ensures that the ModBitSet can hold enough words. + * @param wordsRequired the minimum acceptable number of words. + */ + private void ensureCapacity(int wordsRequired) { + if (words.length < wordsRequired) { + // Allocate larger of doubled size or required size + int request = Math.max(2 * words.length, wordsRequired); + words = Arrays.copyOf(words, request); + sizeIsSticky = false; + } + } + + /** + * Ensures that the ModBitSet can accommodate a given wordIndex, + * temporarily violating the invariants. The caller must + * restore the invariants before returning to the user, + * possibly using recalculateWordsInUse(). + * @param wordIndex the index to be accommodated. + */ + private void expandTo(int wordIndex) { + int wordsRequired = wordIndex+1; + if (wordsInUse < wordsRequired) { + ensureCapacity(wordsRequired); + wordsInUse = wordsRequired; + } + } + + /** + * Checks that fromIndex ... toIndex is a valid range of bit indices. + */ + private static void checkRange(int fromIndex, int toIndex) { + if (fromIndex < 0) + throw new IndexOutOfBoundsException("fromIndex < 0: " + fromIndex); + if (toIndex < 0) + throw new IndexOutOfBoundsException("toIndex < 0: " + toIndex); + if (fromIndex > toIndex) + throw new IndexOutOfBoundsException("fromIndex: " + fromIndex + + " > toIndex: " + toIndex); + } + + /** + * Sets the bit at the specified index to the complement of its + * current value. 
+ * + * @param bitIndex the index of the bit to flip + * @throws IndexOutOfBoundsException if the specified index is negative + * @since 1.4 + */ + public void flip(int bitIndex) { + if (bitIndex < 0) + throw new IndexOutOfBoundsException("bitIndex < 0: " + bitIndex); + + int wordIndex = wordIndex(bitIndex); + expandTo(wordIndex); + + words[wordIndex] ^= (1L << bitIndex); + + recalculateWordsInUse(); + checkInvariants(); + } + + /** + * Sets each bit from the specified {@code fromIndex} (inclusive) to the + * specified {@code toIndex} (exclusive) to the complement of its current + * value. + * + * @param fromIndex index of the first bit to flip + * @param toIndex index after the last bit to flip + * @throws IndexOutOfBoundsException if {@code fromIndex} is negative, + * or {@code toIndex} is negative, or {@code fromIndex} is + * larger than {@code toIndex} + * @since 1.4 + */ + public void flip(int fromIndex, int toIndex) { + checkRange(fromIndex, toIndex); + + if (fromIndex == toIndex) + return; + + int startWordIndex = wordIndex(fromIndex); + int endWordIndex = wordIndex(toIndex - 1); + expandTo(endWordIndex); + + long firstWordMask = WORD_MASK << fromIndex; + long lastWordMask = WORD_MASK >>> -toIndex; + if (startWordIndex == endWordIndex) { + // Case 1: One word + words[startWordIndex] ^= (firstWordMask & lastWordMask); + } else { + // Case 2: Multiple words + // Handle first word + words[startWordIndex] ^= firstWordMask; + + // Handle intermediate words, if any + for (int i = startWordIndex+1; i < endWordIndex; i++) + words[i] ^= WORD_MASK; + + // Handle last word + words[endWordIndex] ^= lastWordMask; + } + + recalculateWordsInUse(); + checkInvariants(); + } + + /** + * Sets the bit at the specified index to {@code true}. 
+ * + * @param bitIndex a bit index + * @throws IndexOutOfBoundsException if the specified index is negative + * @since JDK1.0 + */ + public void set(int bitIndex) { + if (bitIndex < 0) + throw new IndexOutOfBoundsException("bitIndex < 0: " + bitIndex); + + int wordIndex = wordIndex(bitIndex); + expandTo(wordIndex); + + words[wordIndex] |= (1L << bitIndex); // Restores invariants + + checkInvariants(); + } + + /** + * Sets the bit at the specified index to the specified value. + * + * @param bitIndex a bit index + * @param value a boolean value to set + * @throws IndexOutOfBoundsException if the specified index is negative + * @since 1.4 + */ + public void set(int bitIndex, boolean value) { + if (value) + set(bitIndex); + else + clear(bitIndex); + } + + /** + * Sets the bits from the specified {@code fromIndex} (inclusive) to the + * specified {@code toIndex} (exclusive) to {@code true}. + * + * @param fromIndex index of the first bit to be set + * @param toIndex index after the last bit to be set + * @throws IndexOutOfBoundsException if {@code fromIndex} is negative, + * or {@code toIndex} is negative, or {@code fromIndex} is + * larger than {@code toIndex} + * @since 1.4 + */ + public void set(int fromIndex, int toIndex) { + checkRange(fromIndex, toIndex); + + if (fromIndex == toIndex) + return; + + // Increase capacity if necessary + int startWordIndex = wordIndex(fromIndex); + int endWordIndex = wordIndex(toIndex - 1); + expandTo(endWordIndex); + + long firstWordMask = WORD_MASK << fromIndex; + long lastWordMask = WORD_MASK >>> -toIndex; + if (startWordIndex == endWordIndex) { + // Case 1: One word + words[startWordIndex] |= (firstWordMask & lastWordMask); + } else { + // Case 2: Multiple words + // Handle first word + words[startWordIndex] |= firstWordMask; + + // Handle intermediate words, if any + for (int i = startWordIndex+1; i < endWordIndex; i++) + words[i] = WORD_MASK; + + // Handle last word (restores invariants) + words[endWordIndex] |= lastWordMask; 
+ } + + checkInvariants(); + } + + /** + * Sets the bits from the specified {@code fromIndex} (inclusive) to the + * specified {@code toIndex} (exclusive) to the specified value. + * + * @param fromIndex index of the first bit to be set + * @param toIndex index after the last bit to be set + * @param value value to set the selected bits to + * @throws IndexOutOfBoundsException if {@code fromIndex} is negative, + * or {@code toIndex} is negative, or {@code fromIndex} is + * larger than {@code toIndex} + * @since 1.4 + */ + public void set(int fromIndex, int toIndex, boolean value) { + if (value) + set(fromIndex, toIndex); + else + clear(fromIndex, toIndex); + } + + /** + * Sets the bit specified by the index to {@code false}. + * + * @param bitIndex the index of the bit to be cleared + * @throws IndexOutOfBoundsException if the specified index is negative + * @since JDK1.0 + */ + public void clear(int bitIndex) { + if (bitIndex < 0) + throw new IndexOutOfBoundsException("bitIndex < 0: " + bitIndex); + + int wordIndex = wordIndex(bitIndex); + if (wordIndex >= wordsInUse) + return; + + words[wordIndex] &= ~(1L << bitIndex); + + recalculateWordsInUse(); + checkInvariants(); + } + + /** + * Sets the bits from the specified {@code fromIndex} (inclusive) to the + * specified {@code toIndex} (exclusive) to {@code false}. 
+ * + * @param fromIndex index of the first bit to be cleared + * @param toIndex index after the last bit to be cleared + * @throws IndexOutOfBoundsException if {@code fromIndex} is negative, + * or {@code toIndex} is negative, or {@code fromIndex} is + * larger than {@code toIndex} + * @since 1.4 + */ + public void clear(int fromIndex, int toIndex) { + checkRange(fromIndex, toIndex); + + if (fromIndex == toIndex) + return; + + int startWordIndex = wordIndex(fromIndex); + if (startWordIndex >= wordsInUse) + return; + + int endWordIndex = wordIndex(toIndex - 1); + if (endWordIndex >= wordsInUse) { + toIndex = length(); + endWordIndex = wordsInUse - 1; + } + + long firstWordMask = WORD_MASK << fromIndex; + long lastWordMask = WORD_MASK >>> -toIndex; + if (startWordIndex == endWordIndex) { + // Case 1: One word + words[startWordIndex] &= ~(firstWordMask & lastWordMask); + } else { + // Case 2: Multiple words + // Handle first word + words[startWordIndex] &= ~firstWordMask; + + // Handle intermediate words, if any + for (int i = startWordIndex+1; i < endWordIndex; i++) + words[i] = 0; + + // Handle last word + words[endWordIndex] &= ~lastWordMask; + } + + recalculateWordsInUse(); + checkInvariants(); + } + + /** + * Sets all of the bits in this ModBitSet to {@code false}. + * + * @since 1.4 + */ + public void clear() { + while (wordsInUse > 0) + words[--wordsInUse] = 0; + } + + /** + * Returns the value of the bit with the specified index. The value + * is {@code true} if the bit with the index {@code bitIndex} + * is currently set in this {@code ModBitSet}; otherwise, the result + * is {@code false}. 
+ * + * @param bitIndex the bit index + * @return the value of the bit with the specified index + * @throws IndexOutOfBoundsException if the specified index is negative + */ + public boolean get(int bitIndex) { + if (bitIndex < 0) + throw new IndexOutOfBoundsException("bitIndex < 0: " + bitIndex); + + checkInvariants(); + + int wordIndex = wordIndex(bitIndex); + return (wordIndex < wordsInUse) + && ((words[wordIndex] & (1L << bitIndex)) != 0); + } + + /** + * Returns a new {@code ModBitSet} composed of bits from this {@code ModBitSet} + * from {@code fromIndex} (inclusive) to {@code toIndex} (exclusive). + * + * @param fromIndex index of the first bit to include + * @param toIndex index after the last bit to include + * @return a new {@code ModBitSet} from a range of this {@code ModBitSet} + * @throws IndexOutOfBoundsException if {@code fromIndex} is negative, + * or {@code toIndex} is negative, or {@code fromIndex} is + * larger than {@code toIndex} + * @since 1.4 + */ + public ModBitSet get(int fromIndex, int toIndex) { + checkRange(fromIndex, toIndex); + + checkInvariants(); + + int len = length(); + + // If no set bits in range return empty bitset + if (len <= fromIndex || fromIndex == toIndex) + return new ModBitSet(0); + + // An optimization + if (toIndex > len) + toIndex = len; + + ModBitSet result = new ModBitSet(toIndex - fromIndex); + int targetWords = wordIndex(toIndex - fromIndex - 1) + 1; + int sourceIndex = wordIndex(fromIndex); + boolean wordAligned = ((fromIndex & BIT_INDEX_MASK) == 0); + + // Process all words but the last word + for (int i = 0; i < targetWords - 1; i++, sourceIndex++) + result.words[i] = wordAligned ? words[sourceIndex] : + (words[sourceIndex] >>> fromIndex) | + (words[sourceIndex+1] << -fromIndex); + + // Process the last word + long lastWordMask = WORD_MASK >>> -toIndex; + result.words[targetWords - 1] = + ((toIndex-1) & BIT_INDEX_MASK) < (fromIndex & BIT_INDEX_MASK) + ? 
/* straddles source words */ + ((words[sourceIndex] >>> fromIndex) | + (words[sourceIndex+1] & lastWordMask) << -fromIndex) + : + ((words[sourceIndex] & lastWordMask) >>> fromIndex); + + // Set wordsInUse correctly + result.wordsInUse = targetWords; + result.recalculateWordsInUse(); + result.checkInvariants(); + + return result; + } + + /** + * Returns the index of the first bit that is set to {@code true} + * that occurs on or after the specified starting index. If no such + * bit exists then {@code -1} is returned. + * + *

To iterate over the {@code true} bits in a {@code ModBitSet}, + * use the following loop: + * + *

 {@code
+     * for (int i = bs.nextSetBit(0); i >= 0; i = bs.nextSetBit(i+1)) {
+     *     // operate on index i here
+     *     if (i == Integer.MAX_VALUE) {
+     *         break; // or (i+1) would overflow
+     *     }
+     * }}
+ * + * @param fromIndex the index to start checking from (inclusive) + * @return the index of the next set bit, or {@code -1} if there + * is no such bit + * @throws IndexOutOfBoundsException if the specified index is negative + * @since 1.4 + */ + public int nextSetBit(int fromIndex) { + if (fromIndex < 0) + throw new IndexOutOfBoundsException("fromIndex < 0: " + fromIndex); + + checkInvariants(); + + int u = wordIndex(fromIndex); + if (u >= wordsInUse) + return -1; + + long word = words[u] & (WORD_MASK << fromIndex); + + while (true) { + if (word != 0) + return (u * BITS_PER_WORD) + Long.numberOfTrailingZeros(word); + if (++u == wordsInUse) + return -1; + word = words[u]; + } + } + + /** + * Returns the index of the first bit that is set to {@code false} + * that occurs on or after the specified starting index. + * + * @param fromIndex the index to start checking from (inclusive) + * @return the index of the next clear bit + * @throws IndexOutOfBoundsException if the specified index is negative + * @since 1.4 + */ + public int nextClearBit(int fromIndex) { + // Neither spec nor implementation handle bitsets of maximal length. + // See 4816253. + if (fromIndex < 0) + throw new IndexOutOfBoundsException("fromIndex < 0: " + fromIndex); + + checkInvariants(); + + int u = wordIndex(fromIndex); + if (u >= wordsInUse) + return fromIndex; + + long word = ~words[u] & (WORD_MASK << fromIndex); + + while (true) { + if (word != 0) + return (u * BITS_PER_WORD) + Long.numberOfTrailingZeros(word); + if (++u == wordsInUse) + return wordsInUse * BITS_PER_WORD; + word = ~words[u]; + } + } + + /** + * Returns the index of the nearest bit that is set to {@code true} + * that occurs on or before the specified starting index. + * If no such bit exists, or if {@code -1} is given as the + * starting index, then {@code -1} is returned. + * + *

To iterate over the {@code true} bits in a {@code ModBitSet}, + * use the following loop: + * + *

 {@code
+     * for (int i = bs.length(); (i = bs.previousSetBit(i-1)) >= 0; ) {
+     *     // operate on index i here
+     * }}
+ * + * @param fromIndex the index to start checking from (inclusive) + * @return the index of the previous set bit, or {@code -1} if there + * is no such bit + * @throws IndexOutOfBoundsException if the specified index is less + * than {@code -1} + * @since 1.7 + */ + public int previousSetBit(int fromIndex) { + if (fromIndex < 0) { + if (fromIndex == -1) + return -1; + throw new IndexOutOfBoundsException( + "fromIndex < -1: " + fromIndex); + } + + checkInvariants(); + + int u = wordIndex(fromIndex); + if (u >= wordsInUse) + return length() - 1; + + long word = words[u] & (WORD_MASK >>> -(fromIndex+1)); + + while (true) { + if (word != 0) + return (u+1) * BITS_PER_WORD - 1 - Long.numberOfLeadingZeros(word); + if (u-- == 0) + return -1; + word = words[u]; + } + } + + /** + * Returns the index of the nearest bit that is set to {@code false} + * that occurs on or before the specified starting index. + * If no such bit exists, or if {@code -1} is given as the + * starting index, then {@code -1} is returned. + * + * @param fromIndex the index to start checking from (inclusive) + * @return the index of the previous clear bit, or {@code -1} if there + * is no such bit + * @throws IndexOutOfBoundsException if the specified index is less + * than {@code -1} + * @since 1.7 + */ + public int previousClearBit(int fromIndex) { + if (fromIndex < 0) { + if (fromIndex == -1) + return -1; + throw new IndexOutOfBoundsException( + "fromIndex < -1: " + fromIndex); + } + + checkInvariants(); + + int u = wordIndex(fromIndex); + if (u >= wordsInUse) + return fromIndex; + + long word = ~words[u] & (WORD_MASK >>> -(fromIndex+1)); + + while (true) { + if (word != 0) + return (u+1) * BITS_PER_WORD -1 - Long.numberOfLeadingZeros(word); + if (u-- == 0) + return -1; + word = ~words[u]; + } + } + + /** + * Returns the "logical size" of this {@code ModBitSet}: the index of + * the highest set bit in the {@code ModBitSet} plus one. Returns zero + * if the {@code ModBitSet} contains no set bits. 
+     *
+     * @return the logical size of this {@code ModBitSet}
+     * @since  1.2
+     */
+    public int length() {
+        if (wordsInUse == 0) {
+            return 0;
+        }
+        final int topWord = wordsInUse - 1;
+        // Number of bit positions in the top word up to and including its highest set bit.
+        final int bitsInTopWord = BITS_PER_WORD - Long.numberOfLeadingZeros(words[topWord]);
+        return BITS_PER_WORD * topWord + bitsInTopWord;
+    }
+
+    /**
+     * Reports whether this {@code ModBitSet} has no bits set to {@code true}.
+     * Implemented as a check that no words are in use.
+     *
+     * @return {@code true} if {@code wordsInUse} is zero
+     * @since  1.4
+     */
+    public boolean isEmpty() {
+        return wordsInUse == 0;
+    }
+
+    /**
+     * Reports whether this {@code ModBitSet} and the given one share at least
+     * one bit that is {@code true} in both.
+     *
+     * @param  set {@code ModBitSet} to intersect with
+     * @return {@code true} if some word in the common range has a non-zero
+     *         bitwise AND; {@code false} otherwise
+     * @since  1.4
+     */
+    public boolean intersects(ModBitSet set) {
+        final int shared = Math.min(wordsInUse, set.wordsInUse);
+        for (int idx = shared - 1; idx >= 0; idx--) {
+            if ((words[idx] & set.words[idx]) != 0) {
+                return true;
+            }
+        }
+        return false;
+    }
+
+    /**
+     * Counts the bits set to {@code true} in this {@code ModBitSet} by
+     * summing the population count of every word in use.
+     *
+     * @return the number of bits set to {@code true} in this {@code ModBitSet}
+     * @since  1.4
+     */
+    public int cardinality() {
+        int total = 0;
+        int idx = 0;
+        while (idx < wordsInUse) {
+            total += Long.bitCount(words[idx]);
+            idx++;
+        }
+        return total;
+    }
+
+    /**
+     * Performs a logical <b>AND</b> of this target bit set with the
+     * argument bit set. This bit set is modified so that each bit in it
+     * has the value {@code true} if and only if it both initially
+     * had the value {@code true} and the corresponding bit in the
+     * bit set argument also had the value {@code true}.
+     *
+     * @param set a bit set
+     */
+    public void and(ModBitSet set) {
+        if (set == this) {
+            return;
+        }
+
+        // Words this set uses beyond the argument's range cannot survive the AND.
+        while (wordsInUse > set.wordsInUse) {
+            wordsInUse--;
+            words[wordsInUse] = 0;
+        }
+
+        // Intersect the words both sets have in use.
+        for (int idx = 0; idx < wordsInUse; idx++) {
+            words[idx] &= set.words[idx];
+        }
+
+        recalculateWordsInUse();
+        checkInvariants();
+    }
+
+    /**
+     * Counts the bits set in the logical AND of two bit sets without
+     * building the intersection or modifying either argument.
+     *
+     * @param  setOne a bit set
+     * @param  setTwo another bit set
+     * @return the cardinality of {@code setOne & setTwo}
+     */
+    public static int andCardinality(ModBitSet setOne, ModBitSet setTwo) {
+        final int shared = Math.min(setOne.wordsInUse, setTwo.wordsInUse);
+        int total = 0;
+        // Only words in use by both sets can contribute set bits.
+        for (int idx = 0; idx < shared; idx++) {
+            total += Long.bitCount(setOne.words[idx] & setTwo.words[idx]);
+        }
+        return total;
+    }
+
+    /**
+     * Counts the bits set in the logical AND of three bit sets without
+     * building the intersection or modifying any argument.
+     *
+     * @param  setOne a bit set
+     * @param  setTwo another bit set
+     * @param  setThree a third bit set
+     * @return the cardinality of {@code setOne & setTwo & setThree}
+     */
+    public static int andCardinality(ModBitSet setOne, ModBitSet setTwo, ModBitSet setThree) {
+        final int shared = Math.min(
+                Math.min(setOne.wordsInUse, setTwo.wordsInUse),
+                setThree.wordsInUse);
+        int total = 0;
+        // Only words in use by all three sets can contribute set bits.
+        for (int idx = 0; idx < shared; idx++) {
+            total += Long.bitCount(setOne.words[idx] & setTwo.words[idx] & setThree.words[idx]);
+        }
+        return total;
+    }
+
+    /**
+     * Performs a logical <b>OR</b> of this bit set with the bit set
+     * argument.
+     * This bit set is modified so that a bit in it has the
+     * value {@code true} if and only if it either already had the
+     * value {@code true} or the corresponding bit in the bit set
+     * argument has the value {@code true}.
+     *
+     * @param set a bit set
+     */
+    public void or(ModBitSet set) {
+        if (set == this) {
+            return;
+        }
+
+        final int theirs = set.wordsInUse;
+        final int shared = Math.min(wordsInUse, theirs);
+
+        // Grow to the argument's extent before merging so the tail copy below fits.
+        if (theirs > wordsInUse) {
+            ensureCapacity(theirs);
+            wordsInUse = theirs;
+        }
+
+        // Merge the words both sets had in use.
+        for (int idx = 0; idx < shared; idx++) {
+            words[idx] |= set.words[idx];
+        }
+
+        // Words only the argument had in use are taken over unchanged.
+        final int surplus = theirs - shared;
+        if (surplus > 0) {
+            System.arraycopy(set.words, shared, words, shared, surplus);
+        }
+
+        // recalculateWordsInUse() is unnecessary
+        checkInvariants();
+    }
+
+    /**
+     * Performs a logical <b>XOR</b> of this bit set with the bit set
+     * argument. This bit set is modified so that a bit in it has the
+     * value {@code true} if and only if one of the following
+     * statements holds:
+     *
    + *
  • The bit initially has the value {@code true}, and the + * corresponding bit in the argument has the value {@code false}. + *
  • The bit initially has the value {@code false}, and the + * corresponding bit in the argument has the value {@code true}. + *
+     *
+     * @param set a bit set
+     */
+    public void xor(ModBitSet set) {
+        final int theirs = set.wordsInUse;
+        final int shared = Math.min(wordsInUse, theirs);
+
+        // Grow to the argument's extent before merging so the tail copy below fits.
+        if (theirs > wordsInUse) {
+            ensureCapacity(theirs);
+            wordsInUse = theirs;
+        }
+
+        // Toggle the words both sets had in use.
+        int idx = 0;
+        while (idx < shared) {
+            words[idx] ^= set.words[idx];
+            idx++;
+        }
+
+        // Words only the argument had in use are taken over unchanged.
+        final int surplus = theirs - shared;
+        if (surplus > 0) {
+            System.arraycopy(set.words, shared, words, shared, surplus);
+        }
+
+        // XOR can zero the top words, so the in-use count has to be recomputed.
+        recalculateWordsInUse();
+        checkInvariants();
+    }
+
+    /**
+     * Clears every bit of this {@code ModBitSet} whose counterpart is set
+     * in the given {@code ModBitSet}.
+     *
+     * @param  set the {@code ModBitSet} with which to mask this
+     *         {@code ModBitSet}
+     * @since  1.2
+     */
+    public void andNot(ModBitSet set) {
+        // Only words in use by both sets can change: (a & ~b) on each of them.
+        final int shared = Math.min(wordsInUse, set.wordsInUse);
+        for (int idx = shared - 1; idx >= 0; idx--) {
+            words[idx] &= ~set.words[idx];
+        }
+
+        recalculateWordsInUse();
+        checkInvariants();
+    }
+
+    /**
+     * Returns the hash code value for this bit set. The hash code depends
+     * only on which bits are set within this {@code ModBitSet}.
+     *
+     *

The hash code is defined to be the result of the following + * calculation: + *

 {@code
+     * public int hashCode() {
+     *     long h = 1234;
+     *     long[] words = toLongArray();
+     *     for (int i = words.length; --i >= 0; )
+     *         h ^= words[i] * (i + 1);
+     *     return (int)((h >> 32) ^ h);
+     * }}
+     * Note that the hash code changes if the set of bits is altered.
+     *
+     * @return the hash code value for this bit set
+     */
+    @Override
+    public int hashCode() {
+        long h = 1234;
+        // Walk the words in use from highest to lowest, weighting each by its 1-based position.
+        for (int i = wordsInUse; --i >= 0; ) {
+            h ^= words[i] * (i + 1);
+        }
+
+        // Fold the upper 32 bits into the lower 32 before narrowing to int.
+        return (int) ((h >> 32) ^ h);
+    }
+
+    /**
+     * Returns the number of bits of space actually in use by this
+     * {@code ModBitSet} to represent bit values, i.e. the length of the
+     * backing {@code long} array multiplied by the number of bits per word.
+     * This is the allocated capacity, not the number of bits that are set.
+     *
+     * @return the capacity of the backing array, in bits
+     */
+    public int size() {
+        return words.length * BITS_PER_WORD;
+    }
+
+    /**
+     * Compares this object against the specified object.
+     * The result is {@code true} if and only if the argument is
+     * not {@code null} and is a {@code ModBitSet} object that has
+     * exactly the same set of bits set to {@code true} as this bit
+     * set. That is, for every nonnegative {@code int} index {@code k},
+     *
((ModBitSet)obj).get(k) == this.get(k)
+     * must be true. The current sizes of the two bit sets are not compared.
+     *
+     * @param  obj the object to compare with
+     * @return {@code true} if the objects are the same;
+     *         {@code false} otherwise
+     * @see    #size()
+     */
+    @Override
+    public boolean equals(Object obj) {
+        // instanceof also rejects null.
+        if (!(obj instanceof ModBitSet)) {
+            return false;
+        }
+        if (this == obj) {
+            return true;
+        }
+
+        ModBitSet set = (ModBitSet) obj;
+
+        checkInvariants();
+        set.checkInvariants();
+
+        // Different in-use word counts mean the sets cannot hold the same bits.
+        if (wordsInUse != set.wordsInUse) {
+            return false;
+        }
+
+        // Check words in use by both BitSets
+        for (int i = 0; i < wordsInUse; i++) {
+            if (words[i] != set.words[i]) {
+                return false;
+            }
+        }
+
+        return true;
+    }
+
+    /**
+     * Cloning this {@code ModBitSet} produces a new {@code ModBitSet}
+     * that is equal to it.
+     * The clone of the bit set is another bit set that has exactly the
+     * same bits set to {@code true} as this bit set. The clone gets its own
+     * copy of the backing word array. Unless the size is sticky, this set's
+     * own backing array is trimmed to the words in use first.
+     *
+     * @return a clone of this bit set
+     * @see    #size()
+     */
+    @Override
+    public Object clone() {
+        if (! sizeIsSticky) {
+            trimToSize();
+        }
+
+        try {
+            ModBitSet result = (ModBitSet) super.clone();
+            // super.clone() copies the array reference only; give the clone its own words.
+            result.words = words.clone();
+            result.checkInvariants();
+            return result;
+        } catch (CloneNotSupportedException e) {
+            throw new InternalError(e);
+        }
+    }
+
+    /**
+     * Attempts to reduce internal storage used for the bits in this bit set
+     * by shrinking the backing array to exactly the words in use.
+     * Calling this method may, but is not required to, affect the value
+     * returned by a subsequent call to the {@link #size()} method.
+     */
+    private void trimToSize() {
+        if (wordsInUse != words.length) {
+            words = Arrays.copyOf(words, wordsInUse);
+            checkInvariants();
+        }
+    }
+
+    /**
+     * Save the state of the {@code ModBitSet} instance to a stream (i.e.,
+     * serialize it). The backing word array is written under the field
+     * name {@code "bits"}, after trimming it unless the size is sticky.
+     *
+     * @param  s the stream to write to
+     * @throws IOException if writing to the stream fails
+     */
+    private void writeObject(ObjectOutputStream s)
+        throws IOException {
+
+        checkInvariants();
+
+        if (! sizeIsSticky) {
+            trimToSize();
+        }
+
+        ObjectOutputStream.PutField fields = s.putFields();
+        fields.put("bits", words);
+        s.writeFields();
+    }
+
+    /**
+     * Reconstitute the {@code ModBitSet} instance from a stream (i.e.,
+     * deserialize it). The word array is read from the field named
+     * {@code "bits"}.
+     *
+     * @param  s the stream to read from
+     * @throws IOException if reading from the stream fails
+     * @throws ClassNotFoundException if a class in the stream cannot be found
+     */
+    private void readObject(ObjectInputStream s)
+        throws IOException, ClassNotFoundException {
+
+        ObjectInputStream.GetField fields = s.readFields();
+        words = (long[]) fields.get("bits", null);
+
+        // Assume maximum length then find real length
+        // because recalculateWordsInUse assumes maintenance
+        // or reduction in logical size
+        wordsInUse = words.length;
+        recalculateWordsInUse();
+        // A trailing zero word suggests the array was deliberately sized larger than needed.
+        sizeIsSticky = (words.length > 0 && words[words.length-1] == 0L); // heuristic
+        checkInvariants();
+    }
+
+    /**
+     * Returns a string representation of this bit set. For every index
+     * for which this {@code ModBitSet} contains a bit in the set
+     * state, the decimal representation of that index is included in
+     * the result. Such indices are listed in order from lowest to
+     * highest, separated by ",&nbsp;" (a comma and a space) and
+     * surrounded by braces, resulting in the usual mathematical
+     * notation for a set of integers.
+     *
+     *

Example: + *

+     * ModBitSet drPepper = new ModBitSet();
+ * Now {@code drPepper.toString()} returns "{@code {}}". + *
+     * drPepper.set(2);
+ * Now {@code drPepper.toString()} returns "{@code {2}}". + *
+     * drPepper.set(4);
+     * drPepper.set(10);
+     * Now {@code drPepper.toString()} returns "{@code {2, 4, 10}}".
+     *
+     * @return a string representation of this bit set
+     */
+    @Override
+    public String toString() {
+        checkInvariants();
+
+        // Capacity estimate at six characters per listed index. For large sets count the
+        // bits actually set instead of assuming every bit of every word is set.
+        int numBits = (wordsInUse > 128) ?
+            cardinality() : wordsInUse * BITS_PER_WORD;
+        StringBuilder b = new StringBuilder(6*numBits + 2);
+        b.append('{');
+
+        int i = nextSetBit(0);
+        if (i != -1) {
+            b.append(i);
+            while (true) {
+                // Stop if stepping past the previous index overflows int.
+                if (++i < 0) {
+                    break;
+                }
+                if ((i = nextSetBit(i)) < 0) {
+                    break;
+                }
+                // Emit the whole run of consecutive set bits without re-searching each one.
+                int endOfRun = nextClearBit(i);
+                do {
+                    b.append(", ").append(i);
+                } while (++i != endOfRun);
+            }
+        }
+
+        b.append('}');
+        return b.toString();
+    }
+
+    /**
+     * Returns a stream of indices for which this {@code ModBitSet}
+     * contains a bit in the set state. The indices are returned
+     * in order, from lowest to highest. The size of the stream
+     * is the number of bits in the set state, equal to the value
+     * returned by the {@link #cardinality()} method.
+     *
+     *

+     * <p>The bit set must remain constant during the execution of the
+     * terminal stream operation. Otherwise, the result of the terminal
+     * stream operation is undefined.
+     *
+     * @return a stream of integers representing set indices
+     * @since  1.8
+     */
+    public IntStream stream() {
+        // Walks the set bits in ascending order, one nextSetBit() lookup per element.
+        class SetBitCursor implements PrimitiveIterator.OfInt {
+            int upcoming = nextSetBit(0);
+
+            @Override
+            public boolean hasNext() {
+                return upcoming != -1;
+            }
+
+            @Override
+            public int nextInt() {
+                if (upcoming == -1) {
+                    throw new NoSuchElementException();
+                }
+                final int current = upcoming;
+                upcoming = nextSetBit(upcoming + 1);
+                return current;
+            }
+        }
+
+        final int ordering = Spliterator.ORDERED | Spliterator.DISTINCT | Spliterator.SORTED;
+
+        // The supplier defers creating the cursor and counting bits until a
+        // terminal operation actually asks for the spliterator.
+        return StreamSupport.intStream(
+                () -> Spliterators.spliterator(new SetBitCursor(), cardinality(), ordering),
+                Spliterator.SIZED | Spliterator.SUBSIZED | ordering,
+                false);
+    }
+}
diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/DataFrame.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/DataFrame.java index 6e4cc471c..6e42b6ebc 100644 --- a/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/DataFrame.java +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/DataFrame.java @@ -6,16 +6,11 @@ import static java.util.stream.Collectors.toList; import com.google.common.base.Joiner; +import edu.stanford.futuredata.macrobase.analysis.summary.util.ModBitSet; import edu.stanford.futuredata.macrobase.datamodel.Schema.ColType; import edu.stanford.futuredata.macrobase.util.MacroBaseInternalError; import java.io.PrintStream; -import java.util.ArrayList; -import java.util.BitSet; -import java.util.Collections; -import java.util.Comparator; -import java.util.Iterator; -import java.util.List; -import java.util.Objects; +import java.util.*; import java.util.function.DoublePredicate; import java.util.function.Predicate; import java.util.stream.IntStream; @@ -494,7 +489,7 @@ public
DataFrame project(String projectionCol) { * @param mask rows to select * @return new DataFrame with subset of rows */ - public DataFrame filter(BitSet mask) { + public DataFrame filter(ModBitSet mask) { DataFrame other = new DataFrame(); int d = schema.getNumColumns(); @@ -538,7 +533,7 @@ public DataFrame filter(BitSet mask) { public DataFrame filter(int columnIdx, Predicate filter) { String[] filterColumn = getStringColumn(columnIdx); - final BitSet mask = new BitSet(numRows); + final ModBitSet mask = new ModBitSet(numRows); for (int i = 0; i < numRows; i++) { mask.set(i, filter.test(filterColumn[i])); } @@ -554,19 +549,19 @@ public DataFrame filter(String columnName, Predicate filter) { * @return new DataFrame with subset of rows */ public DataFrame filter(int columnIdx, DoublePredicate filter) { - final BitSet mask = getMaskForFilter(columnIdx, filter); + final ModBitSet mask = getMaskForFilter(columnIdx, filter); return filter(mask); } /** * @param columnIdx column index to filter by * @param filter Predicate to test each column value - * @return a BitSet that encodes the true/false value generated by the filter + * @return a ModBitSet that encodes the true/false value generated by the filter * on each row in the DataFrame */ - public BitSet getMaskForFilter(int columnIdx, Predicate filter) { + public ModBitSet getMaskForFilter(int columnIdx, Predicate filter) { String[] filterColumn = getStringColumn(columnIdx); - final BitSet mask = new BitSet(numRows); + final ModBitSet mask = new ModBitSet(numRows); for (int i = 0; i < numRows; i++) { mask.set(i, filter.test(filterColumn[i])); } @@ -576,12 +571,12 @@ public BitSet getMaskForFilter(int columnIdx, Predicate filter) { /** * @param columnIdx column index to filter by * @param filter DoublePredicate to test each column value - * @return a BitSet that encodes the true/false value generated by the filter + * @return a ModBitSet that encodes the true/false value generated by the filter * on each row in the 
DataFrame */ - public BitSet getMaskForFilter(int columnIdx, DoublePredicate filter) { + public ModBitSet getMaskForFilter(int columnIdx, DoublePredicate filter) { double[] filterColumn = getDoubleColumn(columnIdx); - final BitSet mask = new BitSet(numRows); + final ModBitSet mask = new ModBitSet(numRows); for (int i = 0; i < numRows; i++) { mask.set(i, filter.test(filterColumn[i])); } diff --git a/lib/testRunner.sh b/lib/testRunner.sh old mode 100755 new mode 100644 diff --git a/sql/src/main/java/edu/stanford/futuredata/macrobase/sql/QueryEngine.java b/sql/src/main/java/edu/stanford/futuredata/macrobase/sql/QueryEngine.java index e2605ebdb..748c5b54c 100644 --- a/sql/src/main/java/edu/stanford/futuredata/macrobase/sql/QueryEngine.java +++ b/sql/src/main/java/edu/stanford/futuredata/macrobase/sql/QueryEngine.java @@ -12,6 +12,7 @@ import edu.stanford.futuredata.macrobase.analysis.MBFunction; import edu.stanford.futuredata.macrobase.analysis.summary.aplinear.APLOutlierSummarizer; import edu.stanford.futuredata.macrobase.analysis.summary.util.AttributeEncoder; +import edu.stanford.futuredata.macrobase.analysis.summary.util.ModBitSet; import edu.stanford.futuredata.macrobase.datamodel.DataFrame; import edu.stanford.futuredata.macrobase.datamodel.Row; import edu.stanford.futuredata.macrobase.datamodel.Schema; @@ -55,7 +56,7 @@ import edu.stanford.futuredata.macrobase.util.MacroBaseSQLException; import java.util.ArrayList; import java.util.Arrays; -import java.util.BitSet; +import edu.stanford.futuredata.macrobase.analysis.summary.util.ModBitSet; import java.util.HashMap; import java.util.HashSet; import java.util.LinkedList; @@ -204,7 +205,7 @@ private DataFrame executeDiffQuerySpec(final DiffQuerySpecification diffQuery) dfToExplain = getDataFrameForRelation(relationToExplain); // add outlier (binary) column by evaluating the WHERE clause - final BitSet mask = getMask(dfToExplain, splitQuery.getWhereClause()); + final ModBitSet mask = getMask(dfToExplain, 
splitQuery.getWhereClause()); final double[] outlierVals = new double[dfToExplain.getNumRows()]; mask.stream().forEach((i) -> outlierVals[i] = 1.0); dfToExplain.addColumn(outlierColName, outlierVals); @@ -1088,30 +1089,30 @@ private DataFrame evaluateWhereClause(final DataFrame df, return df; } final Expression whereClause = whereClauseOpt.get(); - final BitSet mask = getMask(df, whereClause); + final ModBitSet mask = getMask(df, whereClause); return df.filter(mask); } // ********************* Helper methods for evaluating Where clauses ********************** /** - * Recursive method that, given a Where clause, generates a boolean mask (a BitSet) applying the + * Recursive method that, given a Where clause, generates a boolean mask (a ModBitSet) applying the * clause to a DataFrame * * @throws MacroBaseSQLException Only comparison expressions (e.g., WHERE x = 42) and logical * AND/OR/NOT combinations of such expressions are supported; exception is thrown otherwise. */ - private BitSet getMask(DataFrame df, Expression whereClause) throws MacroBaseException { + private ModBitSet getMask(DataFrame df, Expression whereClause) throws MacroBaseException { if (whereClause instanceof NotExpression) { final NotExpression notExpr = (NotExpression) whereClause; - final BitSet mask = getMask(df, notExpr.getValue()); + final ModBitSet mask = getMask(df, notExpr.getValue()); mask.flip(0, df.getNumRows()); return mask; } else if (whereClause instanceof LogicalBinaryExpression) { final LogicalBinaryExpression binaryExpr = (LogicalBinaryExpression) whereClause; - final BitSet leftMask = getMask(df, binaryExpr.getLeft()); - final BitSet rightMask = getMask(df, binaryExpr.getRight()); + final ModBitSet leftMask = getMask(df, binaryExpr.getLeft()); + final ModBitSet rightMask = getMask(df, binaryExpr.getRight()); if (binaryExpr.getType() == Type.AND) { leftMask.and(rightMask); return leftMask; @@ -1130,7 +1131,7 @@ private BitSet getMask(DataFrame df, Expression whereClause) throws 
MacroBaseExc if (left instanceof Literal && right instanceof Literal) { final boolean val = left.equals(right); - final BitSet mask = new BitSet(df.getNumRows()); + final ModBitSet mask = new ModBitSet(df.getNumRows()); mask.set(0, df.getNumRows(), val); return mask; } else if (left instanceof Literal && right instanceof Identifier) { @@ -1146,7 +1147,7 @@ private BitSet getMask(DataFrame df, Expression whereClause) throws MacroBaseExc throw new MacroBaseSQLException("Boolean expression not supported"); } - private BitSet maskForPredicate(DataFrame df, FunctionCall func, Literal val, + private ModBitSet maskForPredicate(DataFrame df, FunctionCall func, Literal val, final ComparisonExpressionType type) throws MacroBaseException { final String funcName = func.getName().getSuffix(); @@ -1154,7 +1155,7 @@ private BitSet maskForPredicate(DataFrame df, FunctionCall func, Literal val, func.getArguments().stream().map(Expression::toString).findFirst().get()); final double[] col = mbFunction.apply(df); final DoublePredicate predicate = getPredicate(((DoubleLiteral) val).getValue(), type); - final BitSet mask = new BitSet(col.length); + final ModBitSet mask = new ModBitSet(col.length); for (int i = 0; i < col.length; ++i) { if (predicate.test(col[i])) { mask.set(i); @@ -1166,7 +1167,7 @@ private BitSet maskForPredicate(DataFrame df, FunctionCall func, Literal val, /** * The base case for {@link QueryEngine#getMask(DataFrame, Expression)}; returns a boolean mask - * (as a BitSet) for a single comparision expression (e.g., WHERE x = 42) + * (as a ModBitSet) for a single comparision expression (e.g., WHERE x = 42) * * @param df The DataFrame on which to evaluate the comparison expression * @param literal The constant argument in the expression (e.g., 42) @@ -1175,7 +1176,7 @@ private BitSet maskForPredicate(DataFrame df, FunctionCall func, Literal val, * @throws MacroBaseSQLException if the literal's type doesn't match the type of the column * variable, an exception is thrown 
*/ - private BitSet maskForPredicate(final DataFrame df, final Literal literal, + private ModBitSet maskForPredicate(final DataFrame df, final Literal literal, final Identifier identifier, final ComparisonExpressionType compExprType) throws MacroBaseSQLException { final String colName = identifier.getValue(); diff --git a/sql/src/test/resources/diff-joins/create_sample_table.py b/sql/src/test/resources/diff-joins/create_sample_table.py old mode 100755 new mode 100644 diff --git a/sql/src/test/resources/joins/create_sample_table.py b/sql/src/test/resources/joins/create_sample_table.py old mode 100755 new mode 100644