From 5370e945a900093159c45d6edc2830c425283515 Mon Sep 17 00:00:00 2001 From: Firas Abuzaid Date: Wed, 24 Jan 2018 09:43:09 -0800 Subject: [PATCH] Cleanup --- .../macrobase/analysis/MBFunction.java | 3 +- .../macrobase/datamodel/DataFrame.java | 10 ++-- sql/load.sql | 11 ++++ .../futuredata/macrobase/sql/QueryEngine.java | 59 +++++++++---------- .../macrobase/sql/tree/SingleColumn.java | 22 ++++++- 5 files changed, 66 insertions(+), 39 deletions(-) diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/MBFunction.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/MBFunction.java index 42b3e6cdd..710e45542 100644 --- a/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/MBFunction.java +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/analysis/MBFunction.java @@ -125,7 +125,6 @@ protected void applyFunction(final double[] inputCol, final double[] outputCol) } } - /** * An MBFunction that finds the percentile for each individual value in a given column. For example, * for a column with values [0.1, 0.3, 0.2, 0.5, 0.4], applying the PercentileFunction would @@ -148,7 +147,7 @@ public PercentileFucntion(final String arg) { protected void applyFunction(final double[] inputCol, final double[] outputCol) { // sort the column, and, for each value in the column, store the *min* position in the sorted array final double[] sortedInputCol = Arrays.stream(inputCol).sorted().toArray(); - Map map = new HashMap<>(); + final Map map = new HashMap<>(); for (int i = sortedInputCol.length - 1; i >= 0; --i) { // increment by one so that the max value has 100th percentile map.put(sortedInputCol[i], i + 1); diff --git a/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/DataFrame.java b/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/DataFrame.java index 75bcfaa1b..88c5df4e9 100644 --- a/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/DataFrame.java +++ b/lib/src/main/java/edu/stanford/futuredata/macrobase/datamodel/DataFrame.java @@ -161,7 +161,9 @@ public void prettyPrint(final PrintStream out, final int maxNumToPrint) { for (Row r : getRows(0, numToPrint)) { r.prettyPrint(out, maxColNameLength); } + out.println(); out.println("..."); + out.println(); for (Row r : getRows(numRows - numToPrint, numRows)) { r.prettyPrint(out, maxColNameLength); } @@ -176,17 +178,17 @@ public void prettyPrint(final PrintStream out, final int maxNumToPrint) { /** * {@link #prettyPrint(PrintStream, int)} with default out set to System.out - * and maxNumToPrint set to 25 + * and maxNumToPrint set to 15 */ public void prettyPrint() { - prettyPrint(System.out, 25); + prettyPrint(System.out, 15); } /** - * {@link #prettyPrint(PrintStream, int)} with default maxNumToPrint set to 25 + * {@link #prettyPrint(PrintStream, int)} with default maxNumToPrint set to 15 */ public void prettyPrint(final PrintStream out) { - prettyPrint(out, 25); + prettyPrint(out, 15); } /** diff --git a/sql/load.sql b/sql/load.sql index 2992d7b40..3bbc83106 100644 --- a/sql/load.sql +++ b/sql/load.sql @@ -1,3 +1,14 @@ IMPORT FROM CSV FILE 'core/demo/sample.csv' INTO sample(usage double, latency double, location string, version string); +IMPORT FROM CSV FILE 'core/demo/mobile_data.csv' INTO mobile_data(record_id + string, user_id string, state string, hw_make string, hw_model string, + firmware_version string, app_version string, avg_temp double, battery_drain + double, trip_time double); + +IMPORT FROM CSV FILE 'wikiticker.csv' INTO wiki(time string, user string, page + string, channel string, namespace string, comment string, metroCode string, + cityName string, regionName string, regionIsoCode string, countryName string, + countryIsoCode string, isAnonymous string, isMinor string, isNew string, + isRobot string, isUnpatrolled string, delta double, added double, deleted + double); diff --git a/sql/src/main/java/edu/stanford/futuredata/macrobase/sql/QueryEngine.java b/sql/src/main/java/edu/stanford/futuredata/macrobase/sql/QueryEngine.java index 223e5d1e5..5753e4d40 100644 --- a/sql/src/main/java/edu/stanford/futuredata/macrobase/sql/QueryEngine.java +++ b/sql/src/main/java/edu/stanford/futuredata/macrobase/sql/QueryEngine.java @@ -22,7 +22,6 @@ import edu.stanford.futuredata.macrobase.sql.tree.FunctionCall; import edu.stanford.futuredata.macrobase.sql.tree.Identifier; import edu.stanford.futuredata.macrobase.sql.tree.ImportCsv; -import edu.stanford.futuredata.macrobase.sql.tree.IsNotNullPredicate; import edu.stanford.futuredata.macrobase.sql.tree.Literal; import edu.stanford.futuredata.macrobase.sql.tree.LogicalBinaryExpression; import edu.stanford.futuredata.macrobase.sql.tree.LogicalBinaryExpression.Type; @@ -32,7 +31,6 @@ import edu.stanford.futuredata.macrobase.sql.tree.QueryBody; import edu.stanford.futuredata.macrobase.sql.tree.QuerySpecification; import edu.stanford.futuredata.macrobase.sql.tree.Relation; -import edu.stanford.futuredata.macrobase.sql.tree.Select; import edu.stanford.futuredata.macrobase.sql.tree.SelectItem; import edu.stanford.futuredata.macrobase.sql.tree.SingleColumn; import edu.stanford.futuredata.macrobase.sql.tree.SortItem; @@ -234,7 +232,7 @@ private List removeUDFsInSelect(List selectItems) { final SingleColumn col = (SingleColumn) item; if (col.getExpression() instanceof FunctionCall) { functionCalls.add(col); - it.remove(); + it.remove(); } } } @@ -280,8 +278,21 @@ private DataFrame evaluateSQLClauses(final QueryBody query, final DataFrame df) // clauses, and then a second with just the original projections. That should be correct // and give us better performance. - DataFrame resultDf = evaluateSelectClause(df, query.getSelect()); - resultDf = evaluateWhereClause(resultDf, query.getWhere()); + final List selectWithoutUdfs = Lists + .newArrayList(query.getSelect().getSelectItems()); + final List udfCols = removeUDFsInSelect(selectWithoutUdfs); + // selectWithoutUdfs has now been modified so that it no longer has UDFs + + // create shallow copy, so modifications don't persist on the original DataFrame + DataFrame resultDf = df.copy(); + final Map newColumns = evaluateUDFs(resultDf, udfCols); + + resultDf = evaluateWhereClause(df, query.getWhere()); + resultDf = evaluateSelectClause(resultDf, selectWithoutUdfs); + for (Map.Entry newColumn : newColumns.entrySet()) { + // add UDF columns to result + resultDf.addColumn((String) newColumn.getKey(), (double[]) newColumn.getValue()); + } resultDf = evaluateOrderByClause(resultDf, query.getOrderBy()); return evaluateLimitClause(resultDf, query.getLimit()); } @@ -330,52 +341,40 @@ private DataFrame getTable(String tableName) throws MacrobaseSQLException { } /** - * Evaluate Select clause of SQL query. If the clause is 'SELECT *' the same DataFrame is - * returned unchanged. UDFs in the Select clause are also evaluated here. TODO: add support for - * DISTINCT queries + * Evaluate only the UDFs of SQL query and return a Map of column names -> double arrays. + * If there are no UDFs (i.e. @param udfCols is empty), the Map is empty. * - * @param inputDf The DataFrame to apply the Select clause on - * @param select The Select clause to evaluate - * @return A new DataFrame with the result of the Select clause applied + * @param inputDf The DataFrame to evaluate the UDFs on + * @param udfCols The List of UDFs to evaluate + * @return The Map of new columns to be added */ - private DataFrame evaluateSelectClause(final DataFrame inputDf, final Select select) + private Map evaluateUDFs(final DataFrame inputDf, final List udfCols) throws MacrobaseException { - List items = Lists.newArrayList(select.getSelectItems()); - List udfCols = removeUDFsInSelect( - items); - // items has now been modified so that it no longer has UDFs - final DataFrame dfWithNoUdfs = evaluateSelectWithNoUDFs(inputDf, items); - // create shallow copy, so modifications don't - final DataFrame resultDf = dfWithNoUdfs.copy(); + final Map newColumns = new HashMap<>(); for (SingleColumn udfCol : udfCols) { final FunctionCall func = (FunctionCall) udfCol.getExpression(); - // for now, if UDFs is a.b.c.d(), ignore "a.b.c." + // for now, if UDF is a.b.c.d(), ignore "a.b.c." final String funcName = func.getName().getSuffix(); // for now, assume func.getArguments returns at least 1 argument, always grab the first final MBFunction mbFunction = MBFunction.getFunction(funcName, func.getArguments().stream().map(Expression::toString).findFirst().get()); - // column name for the UDF is either 1) the user-provided alias, or 2) the function name - // and arguments concatenated by "_" - final String colName = udfCol.getAlias().map(Identifier::toString).orElse( - funcName + "_" + Joiner.on("_") - .join( - func.getArguments().stream().map(Expression::toString).collect(toList()))); // modify resultDf in place, add column; mbFunction is evaluated on input DataFrame - resultDf.addColumn(colName, mbFunction.apply(inputDf)); + newColumns.put(udfCol.toString(), mbFunction.apply(inputDf)); } - return resultDf; + return newColumns; } /** * Evaluate Select clause of SQL query, but only once all UDFs from the clause have been - * removed. If the clause is 'SELECT *' the same DataFrame is returned unchanged. + * removed. If the clause is 'SELECT *' the same DataFrame is returned unchanged. TODO: add + * support for DISTINCT queries * * @param df The DataFrame to apply the Select clause on * @param items The list of individual columns included in the Select clause * @return A new DataFrame with the result of the Select clause applied */ - private DataFrame evaluateSelectWithNoUDFs(DataFrame df, List items) { + private DataFrame evaluateSelectClause(DataFrame df, List items) { if (items.size() == 1 && items.get(0) instanceof AllColumns) { // SELECT * -> relation is unchanged return df; diff --git a/sql/src/main/java/edu/stanford/futuredata/macrobase/sql/tree/SingleColumn.java b/sql/src/main/java/edu/stanford/futuredata/macrobase/sql/tree/SingleColumn.java index 3f8faebc9..8e1c39c2a 100644 --- a/sql/src/main/java/edu/stanford/futuredata/macrobase/sql/tree/SingleColumn.java +++ b/sql/src/main/java/edu/stanford/futuredata/macrobase/sql/tree/SingleColumn.java @@ -15,10 +15,12 @@ import static java.util.Objects.requireNonNull; +import com.google.common.base.Joiner; import com.google.common.collect.ImmutableList; import java.util.List; import java.util.Objects; import java.util.Optional; +import java.util.stream.Collector; public class SingleColumn extends SelectItem { @@ -79,11 +81,25 @@ public int hashCode() { @Override public String toString() { - if (alias.isPresent()) { - return expression.toString() + " " + alias.get(); + // column name for the UDF is either 1) the user-provided alias, or 2) the function name + // and arguments concatenated by "_" + return alias.map(Identifier::toString).orElseGet(() -> formatForCol(expression)); + } + + /** + * @return If the Expression is a Function Call (e.g., a UDF), concatenate the function name + * and the arguments with "_". Otherwise, return the output of toString() + */ + private String formatForCol(final Expression expr) { + if (expr instanceof FunctionCall) { + final FunctionCall func = (FunctionCall) expr; + // for now, if UDF is a.b.c.d(), ignore "a.b.c." + final String funcName = func.getName().getSuffix(); + return funcName + "_" + Joiner.on("_") + .join(func.getArguments().stream().map(Expression::toString).iterator()); } + return expr.toString(); - return expression.toString(); } @Override