Skip to content

Commit

Permalink
better build script, UDFs in WHERE clauses, column-wise printing for …
Browse files Browse the repository at this point in the history
…wide DataFrames, and output piped to less (and miscellaneous bugs)
  • Loading branch information
fabuzaid21 committed Feb 9, 2018
1 parent 16235f6 commit 79a967e
Show file tree
Hide file tree
Showing 11 changed files with 255 additions and 87 deletions.
25 changes: 24 additions & 1 deletion build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,27 @@

set -e

cd lib && mvn clean && mvn install && cd ../sql && mvn clean && mvn package -DskipTests
build_module () {
pushd $1
mvn clean && mvn package -DskipTests
popd
}

pushd lib/
mvn clean && mvn install -DskipTests
popd

if [[ $# -eq 0 ]]; then
build_module core sql
else
while [[ $# -gt 0 ]]
do
if [ -e "$1"/pom.xml ]; then
build_module $1
else
echo "$1 does not contain a module"
fi
shift # past argument
done
fi

Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
package edu.stanford.futuredata.macrobase.analysis.classify;

import com.google.common.base.Joiner;
import edu.stanford.futuredata.macrobase.datamodel.DataFrame;
import edu.stanford.futuredata.macrobase.operator.Transformer;
import edu.stanford.futuredata.macrobase.util.MacrobaseException;
import java.lang.reflect.InvocationTargetException;
import java.util.BitSet;
import java.util.List;

public abstract class Classifier implements Transformer {
Expand Down Expand Up @@ -37,6 +39,8 @@ public Classifier setOutputColumnName(String outputColumnName) {
return this;
}

public abstract BitSet getMask(final DataFrame input);

public static Classifier getClassifier(String classifierType, List<String> args)
throws MacrobaseException {
Class<? extends Classifier> clazz;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import edu.stanford.futuredata.macrobase.datamodel.DataFrame;
import edu.stanford.futuredata.macrobase.util.MacrobaseException;
import java.util.BitSet;
import java.util.List;
import org.apache.commons.math3.stat.descriptive.rank.Percentile;

Expand Down Expand Up @@ -34,7 +35,8 @@ public PercentileClassifier(String columnName) {
*/
public PercentileClassifier(List<String> attrs) throws MacrobaseException {
this(attrs.get(0));
percentile = Double.parseDouble(attrs.get(1));
percentile = 100 * (1 - Double
.parseDouble(attrs.get(1))); // TODO: this is stupid -- we need to standardize this
includeHigh =
(attrs.size() <= 2) || Boolean
.parseBoolean(attrs.get(2)); // 3rd arg if present else true
Expand Down Expand Up @@ -63,6 +65,25 @@ public void process(DataFrame input) {
output.addColumn(outputColumnName, resultColumn);
}

@Override
public BitSet getMask(DataFrame input) {
final double[] inputCol = input.getDoubleColumnByName(columnName);
final int numRows = inputCol.length;
lowCutoff = new Percentile().evaluate(inputCol, percentile);
highCutoff = new Percentile().evaluate(inputCol, 100.0 - percentile);
final BitSet mask = new BitSet(numRows);

for (int i = 0; i < numRows; i++) {
double curVal = inputCol[i];
if ((curVal > highCutoff && includeHigh)
|| (curVal < lowCutoff && includeLow)
) {
mask.set(i);
}
}
return mask;
}

@Override
public DataFrame getResults() {
return output;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,24 +3,25 @@
import edu.stanford.futuredata.macrobase.analysis.classify.stats.MBPredicate;
import edu.stanford.futuredata.macrobase.datamodel.DataFrame;
import edu.stanford.futuredata.macrobase.util.MacrobaseException;

import java.util.BitSet;
import java.util.List;
import java.util.function.DoublePredicate;
import java.util.function.Predicate;

/**
* PredicateClassifier classifies an outlier based on a predicate(e.g., equality, less than, greater than)
* and a hard-coded sentinel value. Unlike {@link PercentileClassifier}, outlier values are not determined based on a
* proportion of the values in the metric column. Instead, the outlier values are defined explicitly by the user in the
* conf.yaml file; for example:
* PredicateClassifier classifies an outlier based on a predicate(e.g., equality, less than, greater
* than) and a hard-coded sentinel value. Unlike {@link PercentileClassifier}, outlier values are
* not determined based on a proportion of the values in the metric column. Instead, the outlier
* values are defined explicitly by the user in the conf.yaml file; for example:
* <code>
* classifier: "raw_threshold"
* metric: "usage"
* predicate: "=="
* value: 1.0
* </code>
* This would instantiate a PredicateClassifier that classifies every value in the "usage" column equal to 1.0
* as an outlier. Currently, we support six different predicates: "==", "!=", "<", ">", "<=", and ">=".
* This would instantiate a PredicateClassifier that classifies every value in the "usage" column
* equal to 1.0 as an outlier. Currently, we support six different predicates:
* "==", "!=", "<", ">", "<=", and ">=".
*/
public class PredicateClassifier extends Classifier {

Expand All @@ -34,10 +35,10 @@ public class PredicateClassifier extends Classifier {
* @param columnName Column on which to classifier outliers
* @param predicateStr Predicate used for classification: "==", "!=", "<", ">", "<=", or ">="
* @param sentinel Sentinel value used when evaluating the predicate to determine outlier
* @throws MacrobaseException
*/
public PredicateClassifier(final String columnName, final String predicateStr, final double sentinel)
throws MacrobaseException {
public PredicateClassifier(final String columnName, final String predicateStr,
final double sentinel)
throws MacrobaseException {
super(columnName);
this.predicate = MBPredicate.getDoublePredicate(predicateStr, sentinel);
this.isStrPredicate = false;
Expand All @@ -48,10 +49,10 @@ public PredicateClassifier(final String columnName, final String predicateStr, f
* @param columnName Column on which to classifier outliers
* @param predicateStr Predicate used for classification: "==", "!=", "<", ">", "<=", or ">="
* @param sentinel Sentinel value used when evaluating the predicate to determine outlier
* @throws MacrobaseException
*/
public PredicateClassifier(final String columnName, final String predicateStr, final String sentinel)
throws MacrobaseException {
public PredicateClassifier(final String columnName, final String predicateStr,
final String sentinel)
throws MacrobaseException {
super(columnName);
this.strPredicate = MBPredicate.getStrPredicate(predicateStr, sentinel);
this.isStrPredicate = true;
Expand All @@ -61,8 +62,8 @@ public PredicateClassifier(final String columnName, final String predicateStr, f
* Alternate constructor that takes in List of Strings; used to instantiate Classifier (via
* reflection) specified in MacroBase SQL query
*
* @param attrs by convention, should be a List that has 3 values: [outlier_col_name,
* predicate type ("==","!=", etc.), sentinel (either String or double)]
* @param attrs by convention, should be a List that has 3 values: [outlier_col_name, predicate
* type ("==","!=", etc.), sentinel (either String or double)]
*/
public PredicateClassifier(final List<String> attrs) throws MacrobaseException {
super(attrs.get(0));
Expand All @@ -80,25 +81,54 @@ public PredicateClassifier(final List<String> attrs) throws MacrobaseException {
}
}

@Override
public BitSet getMask(final DataFrame input) {
if (isStrPredicate) {
final String[] metrics = input.getStringColumnByName(columnName);
return getMask(metrics, strPredicate);
} else {
final double[] metrics = input.getDoubleColumnByName(columnName);
return getMask(metrics, predicate);
}
}

private BitSet getMask(final double[] metrics, final DoublePredicate predicate) {
final int numRows = metrics.length;
final BitSet mask = new BitSet(numRows);
for (int i = 0; i < numRows; i++) {
if (predicate.test(metrics[i])) {
mask.set(i);
}
}
return mask;
}

private BitSet getMask(final String[] metrics, final Predicate<String> predicate) {
final int numRows = metrics.length;
final BitSet mask = new BitSet(numRows);
for (int i = 0; i < numRows; i++) {
if (predicate.test(metrics[i])) {
mask.set(i);
}
}
return mask;
}

/**
* Scan through the metric column, and evaluate the predicate on every value in the column. The ``input'' DataFrame
* remains unmodified; a copy is created and all modifications are made on the copy.
* @throws Exception
* Scan through the metric column, and evaluate the predicate on every value in the column. The
* ``input'' DataFrame remains unmodified; a copy is created and all modifications are made on
* the copy.
*/
@Override
public void process(DataFrame input) throws Exception {
if (isStrPredicate) {
processString(input);
}
else {
} else {
processDouble(input);
}
}


public void processDouble(DataFrame input) throws Exception {
private void processDouble(DataFrame input) throws Exception {
double[] metrics = input.getDoubleColumnByName(columnName);
int len = metrics.length;
output = input.copy();
Expand All @@ -114,8 +144,7 @@ public void processDouble(DataFrame input) throws Exception {
output.addColumn(outputColumnName, resultColumn);
}


public void processString(DataFrame input) throws Exception {
private void processString(DataFrame input) throws Exception {
String[] metrics = input.getStringColumnByName(columnName);
int len = metrics.length;
output = input.copy();
Expand All @@ -136,4 +165,5 @@ public void processString(DataFrame input) throws Exception {
public DataFrame getResults() {
return output;
}

}
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@
* initialized from a schema and a set of rows.
*/
public class DataFrame {
private Schema schema;

private static final int MAX_COLS_FOR_TABULAR_PRINT = 10;

private Schema schema;
private ArrayList<String[]> stringCols;
private ArrayList<double[]> doubleCols;
// external indices define a global ordering on columns, but internally each
Expand Down Expand Up @@ -142,38 +144,52 @@ public String toString() {
* | val_m1 | val_m2 | ... | val_mn |
* ------------------------------------------
* @param out PrintStream to write to STDOUT or file (default: STDOUT)
* @param maxNumToPrint maximum number of rows from the DataFrame to print (default: 15)
* @param maxNumToPrint maximum number of rows from the DataFrame to print (default: -1, i.e.,
* all rows)
*/
public void prettyPrint(final PrintStream out, final int maxNumToPrint) {
out.println(numRows + (numRows == 1 ? " row" : " rows"));
out.println();

final int maxColNameLength = schema.getColumnNames().stream()
.reduce("", (x, y) -> x.length() > y.length() ? x : y).length() + 4; // 2 extra spaces on both sides
final String schemaStr = "|" + Joiner.on("|").join(schema.getColumnNames().stream()
.map((x) -> StringUtils.center(String.valueOf(x), maxColNameLength)).collect(toList())) + "|";
final String dashes = Joiner.on("").join(Collections.nCopies(schemaStr.length(), "-"));
out.println(dashes);
out.println(schemaStr);
out.println(dashes);

if (numRows > maxNumToPrint) {
final int numToPrint = maxNumToPrint / 2;
for (Row r : getRows(0, numToPrint)) {
r.prettyPrint(out, maxColNameLength);
}
out.println();
out.println("...");
out.println();
for (Row r : getRows(numRows - numToPrint, numRows)) {
r.prettyPrint(out, maxColNameLength);
.reduce("", (x, y) -> x.length() > y.length() ? x : y).length();

if (schema.getNumColumns() > MAX_COLS_FOR_TABULAR_PRINT) {
// print each row so that each value is on a separate line
for (Row r : getRows()) {
r.prettyPrintColumnWise(out, maxColNameLength);
}
} else {
for (Row r : getRows()) {
r.prettyPrint(out, maxColNameLength);
// print DataFrame as a table
final int tableWidth =
maxColNameLength + 4; // 2 extra spaces on both sides of each column name and value
final List<String> colStrs = schema.getColumnNames().stream()
.map((x) -> StringUtils.center(String.valueOf(x), tableWidth)).collect(toList());
final String schemaStr = "|" + Joiner.on("|").join(colStrs) + "|";
final String dashes = Joiner.on("").join(Collections.nCopies(schemaStr.length(), "-"));
out.println(dashes);
out.println(schemaStr);
out.println(dashes);

if (maxNumToPrint > 0 && numRows > maxNumToPrint) {
final int numToPrint = maxNumToPrint / 2;
for (Row r : getRows(0, numToPrint)) {
r.prettyPrint(out, tableWidth);
}
out.println();
out.println("...");
out.println();
for (Row r : getRows(numRows - numToPrint, numRows)) {
r.prettyPrint(out, tableWidth);
}
} else {
for (Row r : getRows()) {
r.prettyPrint(out, tableWidth);
}
}
out.println(dashes);
out.println();
}
out.println(dashes);
out.println();
}

/**
Expand Down
Loading

0 comments on commit 79a967e

Please sign in to comment.