Use of org.apache.commons.math.stat.descriptive.moment.StandardDeviation in the streamsx.topology project by IBMStreams.
Class FindOutliers, method main.
/**
 * Builds a topology that generates random Gaussian values, keeps only the
 * outliers and prints them, then runs it in the embedded context and waits
 * for the submission to complete.
 *
 * @param args optional; {@code args[0]} is the outlier threshold expressed as
 *             a multiple of the standard deviation (defaults to 2.0)
 * @throws Exception if the threshold cannot be parsed or the topology
 *                   submission fails
 */
public static void main(String[] args) throws Exception {
    final double threshold = args.length == 0 ? 2.0 : Double.parseDouble(args[0]);
    Topology t = new Topology("StandardDeviationFilter");
    final Random rand = new Random();
    // Produce a stream of random double values with a normal
    // distribution, mean 0.0 and standard deviation 1.
    TStream<Double> values = t.limitedSource(new Supplier<Double>() {
        private static final long serialVersionUID = 1L;

        @Override
        public Double get() {
            return rand.nextGaussian();
        }
    }, 100000);
    /*
     * Filters the values based on calculating the mean and standard
     * deviation from the incoming data. Only outliers are present in the
     * output stream "outliers". An outlier is defined as a value that is
     * more than (threshold * standard deviation) away from the mean.
     *
     * This demonstrates an anonymous functional logic class that is
     * stateful. The two fields mean and sd maintain their values across
     * multiple invocations of the test method, that is for multiple tuples.
     *
     * Note both Mean & StandardDeviation classes are serializable.
     */
    TStream<Double> outliers = values.filter(new Predicate<Double>() {
        private static final long serialVersionUID = 1L;
        private final Mean mean = new Mean();
        private final StandardDeviation sd = new StandardDeviation();

        @Override
        public boolean test(Double tuple) {
            // The running statistics include the tuple currently being tested.
            mean.increment(tuple);
            sd.increment(tuple);
            double multipleSd = threshold * sd.getResult();
            // Measure the distance from the running mean itself. Comparing
            // |tuple| against |mean| + k*sd would miss outliers lying on the
            // opposite side of the mean from zero.
            double distanceFromMean = Math.abs(tuple - mean.getResult());
            return distanceFromMean > multipleSd;
        }
    });
    outliers.print();
    StreamsContextFactory.getEmbedded().submit(t).get();
}
Use of org.apache.commons.math.stat.descriptive.moment.StandardDeviation in the drill project by apache.
Class TestOrderedPartitionExchange, method twoBitTwoExchangeRun.
/**
 * Starts two drillbits and runs a physical plan with a Mock scan, project, OrderedPartitionExchange, Union Exchange,
 * and sort. The final sort is done first on the partition column, so all rows in partition 0 should come in the sort
 * order before any row in partition 1, etc.
 *
 * <p>For every returned batch this test asserts that "col1" is non-decreasing and that, among rows with equal "col1",
 * "col2" is non-increasing. It also asserts that 31000 rows are returned in total.
 *
 * <p>The number of rows in each run of equal "partition" values is collected, and the mean and standard deviation of
 * those run sizes are printed. The partitions are expected to be roughly equal in size (standard deviation below one
 * tenth of the mean), but that assertion is currently disabled, so the statistics are informational only.
 *
 * @throws Exception if the drillbits cannot be started or the query fails
 */
@Test
public void twoBitTwoExchangeRun() throws Exception {
    RemoteServiceSet serviceSet = RemoteServiceSet.getLocalServiceSet();
    try (Drillbit bit1 = new Drillbit(CONFIG, serviceSet);
         Drillbit bit2 = new Drillbit(CONFIG, serviceSet);
         DrillClient client = new DrillClient(CONFIG, serviceSet.getCoordinator())) {
        bit1.run();
        bit2.run();
        client.connect();
        List<QueryDataBatch> results = client.runQuery(org.apache.drill.exec.proto.UserBitShared.QueryType.PHYSICAL, Files.toString(FileUtils.getResourceAsFile("/sender/ordered_exchange.json"), Charsets.UTF_8));
        int count = 0;
        List<Integer> partitionRecordCounts = Lists.newArrayList();
        for (QueryDataBatch b : results) {
            try {
                if (b.getData() != null) {
                    int rows = b.getHeader().getRowCount();
                    count += rows;
                    DrillConfig config = DrillConfig.create();
                    RecordBatchLoader loader = new RecordBatchLoader(new BootStrapContext(config, ClassPathScanner.fromPrescan(config)).getAllocator());
                    try {
                        loader.load(b.getHeader().getDef(), b.getData());
                        BigIntVector vv1 = (BigIntVector) loader.getValueAccessorById(BigIntVector.class, loader.getValueVectorId(new SchemaPath("col1", ExpressionPosition.UNKNOWN)).getFieldIds()).getValueVector();
                        Float8Vector vv2 = (Float8Vector) loader.getValueAccessorById(Float8Vector.class, loader.getValueVectorId(new SchemaPath("col2", ExpressionPosition.UNKNOWN)).getFieldIds()).getValueVector();
                        IntVector pVector = (IntVector) loader.getValueAccessorById(IntVector.class, loader.getValueVectorId(new SchemaPath("partition", ExpressionPosition.UNKNOWN)).getFieldIds()).getValueVector();
                        long previous1 = Long.MIN_VALUE;
                        double previous2 = Double.MIN_VALUE;
                        int partPrevious = -1;
                        long current1 = Long.MIN_VALUE;
                        double current2 = Double.MIN_VALUE;
                        int partCurrent = -1;
                        int partitionRecordCount = 0;
                        for (int i = 0; i < rows; i++) {
                            previous1 = current1;
                            previous2 = current2;
                            partPrevious = partCurrent;
                            current1 = vv1.getAccessor().get(i);
                            current2 = vv2.getAccessor().get(i);
                            partCurrent = pVector.getAccessor().get(i);
                            Assert.assertTrue(current1 >= previous1);
                            if (current1 == previous1) {
                                Assert.assertTrue(current2 <= previous2);
                            }
                            if (partCurrent == partPrevious || partPrevious == -1) {
                                partitionRecordCount++;
                            } else {
                                // Partition boundary: record the finished run. The current row is the
                                // first row of the new partition, so the new run starts at 1, not 0.
                                partitionRecordCounts.add(partitionRecordCount);
                                partitionRecordCount = 1;
                            }
                        }
                        // Record the last (or only) run of this batch.
                        partitionRecordCounts.add(partitionRecordCount);
                    } finally {
                        // Clear the loader even when an assertion above fails.
                        loader.clear();
                    }
                }
            } finally {
                // Always release the batch, including when it carries no data or an assertion fails.
                b.release();
            }
        }
        double[] values = new double[partitionRecordCounts.size()];
        int i = 0;
        for (Integer rc : partitionRecordCounts) {
            values[i++] = rc.doubleValue();
        }
        StandardDeviation stdDev = new StandardDeviation();
        Mean mean = new Mean();
        double std = stdDev.evaluate(values);
        double m = mean.evaluate(values);
        System.out.println("mean: " + m + " std dev: " + std);
        // Disabled check that the partitions are roughly equal in size:
        //Assert.assertTrue(std < 0.1 * m);
        assertEquals(31000, count);
    }
}
Aggregations