Search in sources :

Example 51 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class ReservoirUnionTest method checkDegenerateInput.

/**
 * Exercises {@code ReservoirUnion.exec()} with degenerate inputs.
 *
 * <p>The first part expects a {@code null} result for a null input, for an empty tuple and for
 * a tuple whose only field is null. The second part expects an {@code ExecException} when the
 * input bag holds a reservoir tuple with only two fields.
 */
@Test
public void checkDegenerateInput() {
    // using default max k value
    final ReservoirUnion ru = new ReservoirUnion();
    Tuple inputTuple;
    try {
        // input == null
        assertNull(ru.exec(null));
        // input.size() < 1
        inputTuple = TupleFactory.getInstance().newTuple(0);
        assertNull(ru.exec(inputTuple));
        // input.isNull(0);
        inputTuple = TupleFactory.getInstance().newTuple(1);
        inputTuple.set(0, null);
        assertNull(ru.exec(inputTuple));
    } catch (final IOException e) {
        // attach the cause so an unexpected failure can be diagnosed from the test report
        fail("Unexpected exception", e);
    }
    try {
        // reservoir tuple with only 2 entries
        final Tuple reservoir = TupleFactory.getInstance().newTuple(2);
        reservoir.set(0, 256L);
        reservoir.set(1, 256);
        final DataBag reservoirBag = BagFactory.getInstance().newDefaultBag();
        reservoirBag.add(reservoir);
        inputTuple = TupleFactory.getInstance().newTuple(reservoirBag);
        ru.exec(inputTuple);
        fail("Did not catch expected ExecException");
    } catch (final ExecException e) {
        // expected
    } catch (final IOException e) {
        // any other IOException is a genuine failure; keep its stack trace
        fail("Unexpected exception", e);
    }
}
Also used : DataBag(org.apache.pig.data.DataBag) ExecException(org.apache.pig.backend.executionengine.ExecException) IOException(java.io.IOException) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Example 52 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class VarOptCommonAlgebraicTest method unionSketchesExec.

/**
 * Checks that {@code UnionSketchesAsTuple} and {@code UnionSketchesAsByteArray} produce the
 * same union as a {@code VarOptItemsUnion} updated directly with the same sketches.
 *
 * <p>Three sketches of ten items each are serialized into the input bag; the sketches returned
 * by both UDFs are then compared against the directly computed union result.
 */
@Test
public void unionSketchesExec() {
    // Only difference between UnionSketchesAsTuple and UnionSketchesAsByteArray is that one wraps
    // the resulting serialized sketch in a tuple. If the union result is still in exact mode, the
    // two sketches should be identical.
    final int numSketches = 3;
    // numSketches * numItemsPerSketch should be < k here
    final int numItemsPerSketch = 10;
    final int k = 100;
    final String kStr = Integer.toString(k);
    final VarOptCommonImpl.UnionSketchesAsTuple udfTuple = new VarOptCommonImpl.UnionSketchesAsTuple(kStr);
    final VarOptCommonImpl.UnionSketchesAsByteArray udfBA = new VarOptCommonImpl.UnionSketchesAsByteArray(kStr);
    char id = 'a';
    double wt = 1.0;
    final DataBag inputBag = BagFactory.getInstance().newDefaultBag();
    // reference union, updated with every sketch, to compare against at the end
    final VarOptItemsUnion<Tuple> union = VarOptItemsUnion.newInstance(k);
    final VarOptItemsSketch<Tuple> vis = VarOptItemsSketch.newInstance(k);
    try {
        for (int j = 0; j < numSketches; ++j) {
            // the same sketch object is reused; id and wt keep increasing across sketches
            vis.reset();
            for (int i = 0; i < numItemsPerSketch; ++i) {
                final Tuple t = TupleFactory.getInstance().newTuple(2);
                t.set(0, Character.toString(id));
                t.set(1, wt);
                vis.update(t, wt);
                ++id;
                wt += 1.0;
            }
            // serialized sketch goes into the UDF input bag, live sketch into the reference union
            final Tuple wrapper = TupleFactory.getInstance().newTuple(1);
            wrapper.set(0, new DataByteArray(vis.toByteArray(serDe_)));
            inputBag.add(wrapper);
            union.update(vis);
        }
    } catch (final ExecException e) {
        // attach the cause so the failing call can be identified from the report
        fail("Unexpected ExecException creating input data", e);
    }
    try {
        final Tuple inputTuple = TupleFactory.getInstance().newTuple(1);
        inputTuple.set(0, inputBag);
        // byte-array flavour: the result is the serialized sketch itself
        final DataByteArray outArray = udfBA.exec(inputTuple);
        final VarOptItemsSketch<Tuple> sketch1 = VarOptItemsSketch.heapify(Memory.wrap(outArray.get()), serDe_);
        // tuple flavour: the serialized sketch is field 0 of the returned tuple
        final Tuple outTuple = udfTuple.exec(inputTuple);
        final DataByteArray dba = (DataByteArray) outTuple.get(0);
        final VarOptItemsSketch<Tuple> sketch2 = VarOptItemsSketch.heapify(Memory.wrap(dba.get()), serDe_);
        final VarOptItemsSketch<Tuple> expectedResult = union.getResult();
        compareResults(sketch1, expectedResult);
        compareResults(sketch2, expectedResult);
    } catch (final IOException e) {
        fail("Unexpected IOException calling exec()", e);
    }
}
Also used : DataBag(org.apache.pig.data.DataBag) ExecException(org.apache.pig.backend.executionengine.ExecException) IOException(java.io.IOException) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Example 53 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class VarOptSamplingTest method algebraicFinal.

/**
 * Passes {@code VarOptSampling.Final} a bag containing a single serialized sketch that has
 * received no updates, and expects a non-null result bag of size zero.
 */
@Test
public void algebraicFinal() {
    final int k = 87;
    final int wtIdx = 2;
    final VarOptSampling.Final udf = new VarOptSampling.Final(Integer.toString(k), Integer.toString(wtIdx));
    final DataBag inputBag = BagFactory.getInstance().newDefaultBag();
    // freshly created sketch, serialized without any update() calls
    final VarOptItemsSketch<Tuple> vis = VarOptItemsSketch.newInstance(k);
    inputBag.add(TupleFactory.getInstance().newTuple(new DataByteArray(vis.toByteArray(serDe_))));
    final Tuple inputTuple = TupleFactory.getInstance().newTuple(inputBag);
    try {
        final DataBag result = udf.exec(inputTuple);
        assertNotNull(result);
        assertEquals(result.size(), 0);
    } catch (final IOException e) {
        // attach the cause so an unexpected failure can be diagnosed from the test report
        fail("Unexpected IOException", e);
    }
}
Also used : DataBag(org.apache.pig.data.DataBag) IOException(java.io.IOException) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Example 54 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class VarOptSamplingTest method standardAccumulate.

/**
 * Drives {@code VarOptSampling} through {@code accumulate()}, {@code getValue()} and
 * {@code cleanup()}.
 *
 * <p>The same bag of {@code k - 1} tuples is accumulated twice. The test expects
 * {@code getValue()} to be null before accumulating and after {@code cleanup()}, the result bag
 * to hold {@code k} entries, and the summed result weights to equal twice the input weight.
 */
@Test
public void standardAccumulate() {
    final int k = 10;
    // second constructor argument "0": presumably the index of the weight field -- confirm in VarOptSampling
    final VarOptSampling udf = new VarOptSampling(Integer.toString(k), "0");
    final DataBag inputBag = BagFactory.getInstance().newDefaultBag();
    double cumWeight = 0.0;
    try {
        // k - 1 input tuples of the form (i as double, i, -i)
        for (int i = 1; i < k; ++i) {
            final Tuple t = TupleFactory.getInstance().newTuple(3);
            t.set(0, 1.0 * i);
            t.set(1, i);
            t.set(2, -i);
            inputBag.add(t);
            cumWeight += i;
        }
        final Tuple inputTuple = TupleFactory.getInstance().newTuple(inputBag);
        assertNull(udf.getValue());
        udf.accumulate(inputTuple);
        udf.accumulate(inputTuple);
        // fetch the result before cleanup(); getValue() is expected to be null afterwards
        final DataBag result = udf.getValue();
        udf.cleanup();
        assertNull(udf.getValue());
        assertNotNull(result);
        assertEquals(result.size(), k);
        double cumResultWeight = 0.0;
        // each result entry is read as (weight, original 3-field tuple)
        for (Tuple weightAndtuple : result) {
            cumResultWeight += (double) weightAndtuple.get(0);
            final Tuple sample = (Tuple) weightAndtuple.get(1);
            assertEquals(sample.size(), 3);
            final int id = (int) sample.get(1);
            assertTrue(id > 0 && id < k);
        }
        // called accumulate() twice
        assertEquals(cumResultWeight, 2 * cumWeight, EPS);
    } catch (final IOException e) {
        // attach the cause so an unexpected failure can be diagnosed from the test report
        fail("Unexpected exception", e);
    }
}
Also used : DataBag(org.apache.pig.data.DataBag) IOException(java.io.IOException) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Example 55 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class VarOptUnionTest method checkExecution.

/**
 * Drives {@code VarOptUnion} through {@code accumulate()}, {@code getValue()} and
 * {@code cleanup()}, and compares the returned sketch with a {@code VarOptItemsUnion} that was
 * updated directly.
 *
 * <p>The input bag is accumulated twice, so the reference union is updated twice per sketch.
 */
@Test
public void checkExecution() {
    final int k = 5;
    final VarOptUnion udf = new VarOptUnion(Integer.toString(k));
    final DataBag inputBag = BagFactory.getInstance().newDefaultBag();
    final Tuple inputTuple = TupleFactory.getInstance().newTuple(1);
    try {
        final VarOptItemsSketch<Tuple> sketch = VarOptItemsSketch.newInstance(k);
        final VarOptItemsUnion<Tuple> union = VarOptItemsUnion.newInstance(k);
        // NOTE: with k = 5, k / 2 == 2 (integer division), so this loop body runs once (i == 1)
        for (int i = 1; i < k / 2; ++i) {
            sketch.reset();
            final Tuple t = TupleFactory.getInstance().newTuple(3);
            t.set(0, 1.0 * i);
            t.set(1, i);
            t.set(2, -i);
            sketch.update(t, 1.0 * i);
            // serialize sketch and wrap in Tuple, add to both bag and union
            final Tuple sketchWrapper = TupleFactory.getInstance().newTuple(1);
            final DataByteArray dba = new DataByteArray(sketch.toByteArray(new ArrayOfTuplesSerDe()));
            sketchWrapper.set(0, dba);
            inputBag.add(sketchWrapper);
            union.update(sketch);
            // calling accumulate() twice later
            union.update(sketch);
        }
        inputTuple.set(0, inputBag);
        assertNull(udf.getValue());
        udf.accumulate(inputTuple);
        udf.accumulate(inputTuple);
        // fetch the result before cleanup(); getValue() is expected to be null afterwards
        final DataByteArray outBytes = udf.getValue();
        udf.cleanup();
        assertNull(udf.getValue());
        final VarOptItemsSketch<Tuple> result = VarOptItemsSketch.heapify(Memory.wrap(outBytes.get()), new ArrayOfTuplesSerDe());
        assertNotNull(result);
        VarOptCommonAlgebraicTest.compareResults(result, union.getResult());
    } catch (final IOException e) {
        // attach the cause so an unexpected failure can be diagnosed from the test report
        fail("Unexpected exception", e);
    }
}
Also used : DataBag(org.apache.pig.data.DataBag) IOException(java.io.IOException) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Aggregations

DataBag (org.apache.pig.data.DataBag)266 Tuple (org.apache.pig.data.Tuple)223 Test (org.testng.annotations.Test)142 DataByteArray (org.apache.pig.data.DataByteArray)103 IOException (java.io.IOException)20 Estimate (com.yahoo.sketches.pig.theta.Estimate)19 EvalFunc (org.apache.pig.EvalFunc)16 HllSketch (com.yahoo.sketches.hll.HllSketch)14 DoubleSummary (com.yahoo.sketches.tuple.DoubleSummary)13 DoubleSummaryDeserializer (com.yahoo.sketches.tuple.DoubleSummaryDeserializer)13 Test (org.junit.Test)13 ArrayOfStringsSerDe (com.yahoo.sketches.ArrayOfStringsSerDe)12 ArrayOfDoublesSketch (com.yahoo.sketches.tuple.ArrayOfDoublesSketch)12 ExecException (org.apache.pig.backend.executionengine.ExecException)12 ItemsSketch (com.yahoo.sketches.frequencies.ItemsSketch)11 ArrayOfDoublesUpdatableSketchBuilder (com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketchBuilder)11 Map (java.util.Map)11 ArrayOfDoublesUpdatableSketch (com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketch)10 ArrayList (java.util.ArrayList)10 HashMap (java.util.HashMap)10