Search in sources :

Example 46 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class ReservoirSamplingTest method execTest.

/**
 * Runs {@code exec()} twice on the same UDF instance: once with n items
 * (fewer than k) and once after the same input bag has grown to 2n items.
 */
@Test
public void execTest() throws IOException {
    // mirrors the accumulate() test, since accumulate() handles both data paths
    final int k = 32;
    final long n = 24;
    final long doubled = 2 * n;
    final TupleFactory tupleFactory = TupleFactory.getInstance();
    final DataBag items = BagFactory.getInstance().newDefaultBag();

    // first batch: rows (idx:long, "-idx":chararray) for idx in [0, n)
    long idx = 0;
    while (idx < n) {
        final Tuple row = tupleFactory.newTuple(2);
        row.set(0, idx);
        row.set(1, Long.toString(-idx));
        items.add(row);
        ++idx;
    }

    // the input tuple wraps the bag itself, so later additions to the bag are visible through it
    final Tuple input = tupleFactory.newTuple(items);
    final ReservoirSampling sampler = new ReservoirSampling(Integer.toString(k));

    Tuple result = sampler.exec(input);
    assertEquals(result.size(), 3, "Incorrect output size");
    assertEquals(result.get(0), n, "Incorrect number of samples seen");
    assertEquals(result.get(1), k, "Incorrect value of k");
    DataBag samples = (DataBag) result.get(2);
    assertEquals(samples.size(), n);

    // second batch: append rows for idx in [n, 2n) to the same bag and run again
    for (long extra = n; extra < doubled; extra++) {
        final Tuple row = tupleFactory.newTuple(2);
        row.set(0, extra);
        row.set(1, Long.toString(-extra));
        items.add(row);
    }

    result = sampler.exec(input);
    assertEquals(result.get(0), doubled, "Incorrect number of samples seen");
    // k must be unchanged
    assertEquals(result.get(1), k, "Incorrect value of k");
    samples = (DataBag) result.get(2);
    assertEquals(samples.size(), Math.min(k, doubled));
}
Also used : DataBag(org.apache.pig.data.DataBag) TupleFactory(org.apache.pig.data.TupleFactory) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Example 47 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class ReservoirSamplingTest method accumulateTest.

/**
 * Drives the accumulate()/getValue()/cleanup() path directly: accumulates the
 * same n-item bag twice, then checks that the value is null after cleanup()
 * followed by a null accumulate.
 */
@Test
public void accumulateTest() throws IOException {
    // exec() is composed of accumulate(), getValue() and cleanup(), in that order
    // (AccumulateEvalFunc), but it also has a fast-return route, so this path is
    // tested separately
    final int k = 32;
    final long n = 24;
    final long twiceN = 2 * n;
    final TupleFactory tupleFactory = TupleFactory.getInstance();
    final DataBag items = BagFactory.getInstance().newDefaultBag();

    // rows of (idx:long, "-idx":chararray) for idx in [0, n)
    for (long idx = 0; idx < n; idx++) {
        final Tuple row = tupleFactory.newTuple(2);
        row.set(0, idx);
        row.set(1, Long.toString(-idx));
        items.add(row);
    }

    final Tuple input = tupleFactory.newTuple(items);
    final ReservoirSampling sampler = new ReservoirSampling(Integer.toString(k));

    // first pass
    sampler.accumulate(input);
    Tuple result = sampler.getValue();
    assertEquals(result.size(), 3, "Incorrect output size");
    assertEquals(result.get(0), n, "Incorrect number of samples seen");
    assertEquals(result.get(1), k, "Incorrect value of k");
    DataBag samples = (DataBag) result.get(2);
    assertEquals(samples.size(), n);

    // second pass: feed the identical input again
    sampler.accumulate(input);
    result = sampler.getValue();
    assertEquals(result.get(0), twiceN, "Incorrect number of samples seen");
    // k must be unchanged
    assertEquals(result.get(1), k, "Incorrect value of k");
    samples = (DataBag) result.get(2);
    assertEquals(samples.size(), Math.min(k, twiceN));

    // after cleanup and a degenerate (null) accumulate, no value is expected
    sampler.cleanup();
    sampler.accumulate(null);
    assertNull(sampler.getValue());
}
Also used : DataBag(org.apache.pig.data.DataBag) TupleFactory(org.apache.pig.data.TupleFactory) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Example 48 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class ReservoirSamplingTest method generateDataBag.

/**
 * Builds a bag of {@code numItems} two-field tuples of the form
 * {@code (val:int, "-val":chararray)}, where {@code val} runs from
 * {@code startIdx} to {@code startIdx + numItems - 1}.
 *
 * <p>If populating a tuple throws an {@link ExecException}, the calling test
 * is failed with the exception attached as the cause.
 *
 * @param numItems number of tuples to generate
 * @param startIdx value placed in the first tuple
 * @return a new default bag holding the generated tuples
 */
static DataBag generateDataBag(final long numItems, final int startIdx) {
    final DataBag output = BagFactory.getInstance().newDefaultBag();
    // the factory lookup does not depend on the loop variable, so do it once
    final TupleFactory tupleFactory = TupleFactory.getInstance();
    try {
        for (int i = 0; i < numItems; ++i) {
            final Tuple t = tupleFactory.newTuple(2);
            final int val = startIdx + i;
            t.set(0, val);
            t.set(1, Integer.toString(-val));
            output.add(t);
        }
    } catch (final ExecException e) {
        // pass the exception along so the failure report keeps its stack trace
        fail(e.getMessage(), e);
    }
    return output;
}
Also used : DataBag(org.apache.pig.data.DataBag) ExecException(org.apache.pig.backend.executionengine.ExecException) Tuple(org.apache.pig.data.Tuple)

Example 49 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class ReservoirSamplingTest method intermediateFinalExec.

/**
 * Feeds three pre-built reservoir tuples {@code (n:long, k:int, samples:bag)}
 * to {@code ReservoirSampling.IntermediateFinal} and checks the merged result:
 * total items seen, resulting k, sample count, and that every sampled value
 * lies in the range of values that were generated.
 */
@Test
public void intermediateFinalExec() throws IOException {
    final int maxK = 128;
    final EvalFunc<Tuple> rs = new ReservoirSampling.IntermediateFinal(Integer.toString(maxK));
    // need at least 3 conditions:
    // 1. n <= k <= maxK
    // 2. n <= k, k > maxK
    // 3. n > k
    final DataBag bagOfReservoirs = BagFactory.getInstance().newDefaultBag();

    // condition 1: n = 32, k = maxK; values [0, 32)
    Tuple t = TupleFactory.getInstance().newTuple(3);
    t.set(0, 32L);
    t.set(1, maxK);
    t.set(2, generateDataBag(32, 0));
    bagOfReservoirs.add(t);

    // condition 2: n = 64, k = 256 (> maxK); values [32, 96)
    t = TupleFactory.getInstance().newTuple(3);
    t.set(0, 64L);
    t.set(1, 256);
    t.set(2, generateDataBag(64, 32));
    bagOfReservoirs.add(t);

    // condition 3: n = 256 (> k), k = maxK; only maxK samples supplied, values [96, 96 + maxK)
    t = TupleFactory.getInstance().newTuple(3);
    t.set(0, 256L);
    t.set(1, maxK);
    t.set(2, generateDataBag(maxK, 96));
    bagOfReservoirs.add(t);

    final Tuple input = TupleFactory.getInstance().newTuple(1);
    input.set(0, bagOfReservoirs);
    final Tuple result = rs.exec(input);

    final long tgtN = 32 + 64 + 256;
    // only added maxK to last bag, so this is the exclusive upper bound on generated values
    final int tgtMaxVal = 32 + 64 + maxK;
    assertEquals(result.size(), 3, "Incorrect output size");
    assertEquals(result.get(0), tgtN, "Incorrect number of samples seen");
    assertEquals(result.get(1), maxK, "Incorrect value of k");
    assertEquals(((DataBag) result.get(2)).size(), maxK);

    // check that they're all in the target range [0, tgtMaxVal)
    for (Tuple sample : ((DataBag) result.get(2))) {
        final int val = (int) sample.get(0);
        if (val < 0 || val >= tgtMaxVal) {
            // upper bound is exclusive, so report the range as half-open
            fail("Found value (" + val + ") outside target range [0, " + tgtMaxVal + ")");
        }
    }
}
Also used : DataBag(org.apache.pig.data.DataBag) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Example 50 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class ReservoirUnionTest method accumulateTest.

/**
 * Accumulates three reservoir tuples {@code (n:long, k:int, samples:bag)} into
 * a {@code ReservoirUnion} across two {@code accumulate()} calls, then checks
 * the unioned result and that {@code getValue()} is null after
 * {@code cleanup()}.
 */
@Test
public void accumulateTest() {
    try {
        final long n = 20;
        final int k = 64;

        // three reservoirs of n items each, covering [0, n), [n, 2n) and [2n, 3n)
        final Tuple reservoir1 = TupleFactory.getInstance().newTuple(3);
        reservoir1.set(0, n);
        reservoir1.set(1, k);
        reservoir1.set(2, ReservoirSamplingTest.generateDataBag(n, 0));
        final Tuple reservoir2 = TupleFactory.getInstance().newTuple(3);
        reservoir2.set(0, n);
        reservoir2.set(1, k);
        reservoir2.set(2, ReservoirSamplingTest.generateDataBag(n, (int) n));
        final Tuple reservoir3 = TupleFactory.getInstance().newTuple(3);
        reservoir3.set(0, n);
        reservoir3.set(1, k);
        reservoir3.set(2, ReservoirSamplingTest.generateDataBag(n, (int) (2 * n)));

        // first input carries two reservoirs, second input carries the third
        final DataBag bag1 = BagFactory.getInstance().newDefaultBag();
        bag1.add(reservoir1);
        bag1.add(reservoir2);
        final Tuple input1 = TupleFactory.getInstance().newTuple(bag1);
        final DataBag bag2 = BagFactory.getInstance().newDefaultBag();
        bag2.add(reservoir3);
        final Tuple input2 = TupleFactory.getInstance().newTuple(bag2);

        final ReservoirUnion ru = new ReservoirUnion(Integer.toString(k));
        ru.accumulate(input1);
        ru.accumulate(input2);
        final Tuple result = ru.getValue();

        // assuming k >= 3n so all items still in reservoir, in order
        assertEquals(result.size(), 3, "Unexpected tuple size from UDF");
        assertEquals((long) result.get(0), 3 * n, "Incorrect total number of items seen");
        assertEquals((int) result.get(1), k, "Unexpected value of k");
        final DataBag outputSamples = (DataBag) result.get(2);
        assertEquals(outputSamples.size(), (long) result.get(0), "Output reservoir size does not match reported number of items");

        int i = 0;
        for (Tuple t : outputSamples) {
            // expected format: (i:int, -i:chararray)
            assertEquals((int) t.get(0), i);
            assertEquals((String) t.get(1), Integer.toString(-i));
            ++i;
        }

        ru.cleanup();
        assertNull(ru.getValue());
    } catch (final IOException e) {
        // attach the cause so the failure report shows what actually went wrong
        fail("Unexpected exception", e);
    }
}
Also used : DataBag(org.apache.pig.data.DataBag) IOException(java.io.IOException) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Aggregations

DataBag (org.apache.pig.data.DataBag): 266, Tuple (org.apache.pig.data.Tuple): 223, Test (org.testng.annotations.Test): 142, DataByteArray (org.apache.pig.data.DataByteArray): 103, IOException (java.io.IOException): 20, Estimate (com.yahoo.sketches.pig.theta.Estimate): 19, EvalFunc (org.apache.pig.EvalFunc): 16, HllSketch (com.yahoo.sketches.hll.HllSketch): 14, DoubleSummary (com.yahoo.sketches.tuple.DoubleSummary): 13, DoubleSummaryDeserializer (com.yahoo.sketches.tuple.DoubleSummaryDeserializer): 13, Test (org.junit.Test): 13, ArrayOfStringsSerDe (com.yahoo.sketches.ArrayOfStringsSerDe): 12, ArrayOfDoublesSketch (com.yahoo.sketches.tuple.ArrayOfDoublesSketch): 12, ExecException (org.apache.pig.backend.executionengine.ExecException): 12, ItemsSketch (com.yahoo.sketches.frequencies.ItemsSketch): 11, ArrayOfDoublesUpdatableSketchBuilder (com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketchBuilder): 11, Map (java.util.Map): 11, ArrayOfDoublesUpdatableSketch (com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketch): 10, ArrayList (java.util.ArrayList): 10, HashMap (java.util.HashMap): 10