Search in sources :

Example 1 with StandardDeviation

use of org.apache.commons.math.stat.descriptive.moment.StandardDeviation in project streamsx.topology by IBMStreams.

the class FindOutliers method main.

/**
 * Builds and runs a topology that prints the outliers of a stream of
 * random values.
 *
 * @param args optional; {@code args[0]} is the outlier threshold expressed
 *             as a multiple of the standard deviation (defaults to 2.0)
 * @throws Exception if the threshold cannot be parsed or the topology
 *                   submission fails
 */
public static void main(String[] args) throws Exception {
    final double threshold = args.length == 0 ? 2.0 : Double.parseDouble(args[0]);
    Topology t = new Topology("StandardDeviationFilter");
    final Random rand = new Random();
    // Produce a stream of random double values with a normal
    // distribution, mean 0.0 and standard deviation 1.
    TStream<Double> values = t.limitedSource(new Supplier<Double>() {

        private static final long serialVersionUID = 1L;

        @Override
        public Double get() {
            return rand.nextGaussian();
        }
    }, 100000);
    /*
     * Filters the values based on calculating the mean and standard
     * deviation from the incoming data. Only outliers are present in the
     * output stream "outliers". An outlier is defined as a value that is
     * more than (threshold * standard deviation) away from the mean.
     *
     * This demonstrates an anonymous functional logic class that is
     * stateful. The two fields mean and sd maintain their values across
     * multiple invocations of the test method, that is for multiple tuples.
     *
     * Note both Mean & StandardDeviation classes are serializable.
     */
    TStream<Double> outliers = values.filter(new Predicate<Double>() {

        private static final long serialVersionUID = 1L;

        private final Mean mean = new Mean();

        private final StandardDeviation sd = new StandardDeviation();

        @Override
        public boolean test(Double tuple) {
            // Fold the new value into the running statistics first, so the
            // decision is based on all data seen so far including this tuple.
            mean.increment(tuple);
            sd.increment(tuple);
            double multipleSd = threshold * sd.getResult();
            // Measure the distance from the running mean itself. Comparing
            // |tuple| against |mean| + k*sd would miss values lying on the
            // opposite side of zero from a non-zero mean.
            double distanceFromMean = Math.abs(tuple - mean.getResult());
            return distanceFromMean > multipleSd;
        }
    });
    outliers.print();
    StreamsContextFactory.getEmbedded().submit(t).get();
}
Also used : Mean(org.apache.commons.math.stat.descriptive.moment.Mean) Random(java.util.Random) StandardDeviation(org.apache.commons.math.stat.descriptive.moment.StandardDeviation) Topology(com.ibm.streamsx.topology.Topology)

Example 2 with StandardDeviation

use of org.apache.commons.math.stat.descriptive.moment.StandardDeviation in project drill by apache.

the class TestOrderedPartitionExchange method twoBitTwoExchangeRun.

/**
 * Starts two drillbits and runs a physical plan with a Mock scan, project, OrderedPartitionExchange, Union Exchange,
 * and sort. For every returned batch, verifies that col1 is non-decreasing and that, for equal col1 values, col2 is
 * non-increasing. Also tallies the number of rows per partition within each batch and prints the mean and standard
 * deviation of those sizes; the assertion that the standard deviation is less than one tenth of the mean is currently
 * disabled. Finally verifies that 31000 rows were returned in total.
 *
 * @throws Exception if the drillbits or client fail to start, or the query fails
 */
@Test
public void twoBitTwoExchangeRun() throws Exception {
    RemoteServiceSet serviceSet = RemoteServiceSet.getLocalServiceSet();
    try (Drillbit bit1 = new Drillbit(CONFIG, serviceSet);
        Drillbit bit2 = new Drillbit(CONFIG, serviceSet);
        DrillClient client = new DrillClient(CONFIG, serviceSet.getCoordinator())) {
        bit1.run();
        bit2.run();
        client.connect();
        List<QueryDataBatch> results = client.runQuery(org.apache.drill.exec.proto.UserBitShared.QueryType.PHYSICAL, Files.toString(FileUtils.getResourceAsFile("/sender/ordered_exchange.json"), Charsets.UTF_8));
        // Total number of rows seen across all batches.
        int count = 0;
        // Size of each run of consecutive rows sharing a partition id (tracked per batch).
        List<Integer> partitionRecordCounts = Lists.newArrayList();
        for (QueryDataBatch b : results) {
            if (b.getData() != null) {
                int rows = b.getHeader().getRowCount();
                count += rows;
                DrillConfig config = DrillConfig.create();
                RecordBatchLoader loader = new RecordBatchLoader(new BootStrapContext(config, ClassPathScanner.fromPrescan(config)).getAllocator());
                loader.load(b.getHeader().getDef(), b.getData());
                BigIntVector vv1 = (BigIntVector) loader.getValueAccessorById(BigIntVector.class, loader.getValueVectorId(new SchemaPath("col1", ExpressionPosition.UNKNOWN)).getFieldIds()).getValueVector();
                Float8Vector vv2 = (Float8Vector) loader.getValueAccessorById(Float8Vector.class, loader.getValueVectorId(new SchemaPath("col2", ExpressionPosition.UNKNOWN)).getFieldIds()).getValueVector();
                IntVector pVector = (IntVector) loader.getValueAccessorById(IntVector.class, loader.getValueVectorId(new SchemaPath("partition", ExpressionPosition.UNKNOWN)).getFieldIds()).getValueVector();
                long previous1 = Long.MIN_VALUE;
                // NOTE(review): Double.MIN_VALUE is the smallest positive double, not the most
                // negative one. It only matters if the first col1 value equals Long.MIN_VALUE;
                // confirm whether a different sentinel was intended.
                double previous2 = Double.MIN_VALUE;
                int partPrevious = -1;
                long current1 = Long.MIN_VALUE;
                double current2 = Double.MIN_VALUE;
                int partCurrent = -1;
                int partitionRecordCount = 0;
                for (int i = 0; i < rows; i++) {
                    previous1 = current1;
                    previous2 = current2;
                    partPrevious = partCurrent;
                    current1 = vv1.getAccessor().get(i);
                    current2 = vv2.getAccessor().get(i);
                    partCurrent = pVector.getAccessor().get(i);
                    // col1 must be non-decreasing; ties on col1 require col2 to be non-increasing.
                    Assert.assertTrue(current1 >= previous1);
                    if (current1 == previous1) {
                        Assert.assertTrue(current2 <= previous2);
                    }
                    if (partCurrent == partPrevious || partPrevious == -1) {
                        partitionRecordCount++;
                    } else {
                        // Partition changed: record the finished run and start the new run
                        // at 1 so the current row (first row of the new partition) is counted.
                        partitionRecordCounts.add(partitionRecordCount);
                        partitionRecordCount = 1;
                    }
                }
                // Record the last run of this batch.
                partitionRecordCounts.add(partitionRecordCount);
                loader.clear();
            }
            b.release();
        }
        // Convert the tallies to a primitive array for the statistics helpers.
        double[] values = new double[partitionRecordCounts.size()];
        int i = 0;
        for (Integer rc : partitionRecordCounts) {
            values[i++] = rc.doubleValue();
        }
        StandardDeviation stdDev = new StandardDeviation();
        Mean mean = new Mean();
        double std = stdDev.evaluate(values);
        double m = mean.evaluate(values);
        System.out.println("mean: " + m + " std dev: " + std);
        // Partition-balance check is disabled; the statistics above are printed for information only.
        //Assert.assertTrue(std < 0.1 * m);
        assertEquals(31000, count);
    }
}
Also used : Mean(org.apache.commons.math.stat.descriptive.moment.Mean) BigIntVector(org.apache.drill.exec.vector.BigIntVector) IntVector(org.apache.drill.exec.vector.IntVector) RecordBatchLoader(org.apache.drill.exec.record.RecordBatchLoader) Float8Vector(org.apache.drill.exec.vector.Float8Vector) BigIntVector(org.apache.drill.exec.vector.BigIntVector) QueryDataBatch(org.apache.drill.exec.rpc.user.QueryDataBatch) DrillConfig(org.apache.drill.common.config.DrillConfig) Drillbit(org.apache.drill.exec.server.Drillbit) SchemaPath(org.apache.drill.common.expression.SchemaPath) RemoteServiceSet(org.apache.drill.exec.server.RemoteServiceSet) BootStrapContext(org.apache.drill.exec.server.BootStrapContext) StandardDeviation(org.apache.commons.math.stat.descriptive.moment.StandardDeviation) DrillClient(org.apache.drill.exec.client.DrillClient) Test(org.junit.Test)

Aggregations

Mean (org.apache.commons.math.stat.descriptive.moment.Mean)2 StandardDeviation (org.apache.commons.math.stat.descriptive.moment.StandardDeviation)2 Topology (com.ibm.streamsx.topology.Topology)1 Random (java.util.Random)1 DrillConfig (org.apache.drill.common.config.DrillConfig)1 SchemaPath (org.apache.drill.common.expression.SchemaPath)1 DrillClient (org.apache.drill.exec.client.DrillClient)1 RecordBatchLoader (org.apache.drill.exec.record.RecordBatchLoader)1 QueryDataBatch (org.apache.drill.exec.rpc.user.QueryDataBatch)1 BootStrapContext (org.apache.drill.exec.server.BootStrapContext)1 Drillbit (org.apache.drill.exec.server.Drillbit)1 RemoteServiceSet (org.apache.drill.exec.server.RemoteServiceSet)1 BigIntVector (org.apache.drill.exec.vector.BigIntVector)1 Float8Vector (org.apache.drill.exec.vector.Float8Vector)1 IntVector (org.apache.drill.exec.vector.IntVector)1 Test (org.junit.Test)1