Search in sources :

Example 1 with VarOptItemsSamples

use of org.apache.datasketches.sampling.VarOptItemsSamples in project sketches-pig by DataSketches.

The method createDataBagFromSketch of the class VarOptCommonImpl.

// Builds a DataBag holding one (weight, item) tuple per sample in the given sketch.
// An ExecException raised while populating a tuple is rethrown as a RuntimeException
// that keeps the original exception as its cause.
static DataBag createDataBagFromSketch(final VarOptItemsSketch<Tuple> sketch) {
    final DataBag bag = BAG_FACTORY.newDefaultBag();
    try {
        for (final VarOptItemsSamples<Tuple>.WeightedSample sample : sketch.getSketchSamples()) {
            // field 0 carries the sample's weight, field 1 the sampled item itself
            final Tuple pair = TUPLE_FACTORY.newTuple(2);
            pair.set(0, sample.getWeight());
            pair.set(1, sample.getItem());
            bag.add(pair);
        }
    } catch (final ExecException e) {
        throw new RuntimeException("Pig error: " + e.getMessage(), e);
    }
    return bag;
}
Also used : DataBag(org.apache.pig.data.DataBag) ExecException(org.apache.pig.backend.executionengine.ExecException) VarOptItemsSamples(org.apache.datasketches.sampling.VarOptItemsSamples) Tuple(org.apache.pig.data.Tuple)

Example 2 with VarOptItemsSamples

use of org.apache.datasketches.sampling.VarOptItemsSamples in project sketches-pig by DataSketches.

The method rawTuplesToSketchTupleExec of the class VarOptCommonAlgebraicTest.

// exec: sketches generally in sampling mode
// Calls RawTuplesToSketchTuple.exec() with degenerate inputs (null, an empty tuple, a tuple
// holding null), expecting null each time, and then with a bag of k + 1 weighted tuples,
// checking the sketch deserialized from the returned byte array.
@SuppressWarnings("unused")
@Test
public void rawTuplesToSketchTupleExec() {
    final int k = 5;
    final int wtIdx = 1;
    final VarOptCommonImpl.RawTuplesToSketchTuple udf;
    udf = new VarOptCommonImpl.RawTuplesToSketchTuple(Integer.toString(k), Integer.toString(wtIdx));
    char id = 'a';
    double wt = 1.0;
    final DataBag inputBag = BagFactory.getInstance().newDefaultBag();
    try {
        // k + 1 tuples of (id, weight): ids start at "a", weights run from 1.0 to k + 1.0
        for (int i = 0; i < k + 1; ++i) {
            final Tuple t = TupleFactory.getInstance().newTuple(2);
            t.set(0, Character.toString(id));
            t.set(1, wt);
            inputBag.add(t);
            ++id;
            wt += 1.0;
        }
    } catch (final ExecException e) {
        // hand the exception to fail() so the root cause appears in the test report
        fail("Unexpected ExecException creating input data", e);
    }
    try {
        // degenerate input first
        Tuple result = udf.exec(null);
        assertNull(result);
        Tuple inputTuple = TupleFactory.getInstance().newTuple(0);
        result = udf.exec(inputTuple);
        assertNull(result);
        inputTuple = TupleFactory.getInstance().newTuple(1);
        inputTuple.set(0, null);
        result = udf.exec(inputTuple);
        assertNull(result);
        // now test real input
        inputTuple.set(0, inputBag);
        result = udf.exec(inputTuple);
        assertEquals(result.size(), 1);
        // the single output field is the serialized sketch; rebuild it to inspect its state
        final DataByteArray dba = (DataByteArray) result.get(0);
        final VarOptItemsSketch<Tuple> vis;
        vis = VarOptItemsSketch.heapify(Memory.wrap(dba.get()), serDe_);
        assertEquals(vis.getN(), k + 1);
        assertEquals(vis.getK(), k);
        // just validating the original weights are within the expected range
        for (final VarOptItemsSamples<Tuple>.WeightedSample ws : vis.getSketchSamples()) {
            final Tuple t = ws.getItem();
            assertTrue((double) t.get(wtIdx) >= 1.0);
            assertTrue((double) t.get(wtIdx) <= k + 1.0);
        }
    } catch (final IOException e) {
        // hand the exception to fail() so the root cause appears in the test report
        fail("Unexpected IOException calling exec()", e);
    }
}
Also used : DataBag(org.apache.pig.data.DataBag) ExecException(org.apache.pig.backend.executionengine.ExecException) VarOptItemsSamples(org.apache.datasketches.sampling.VarOptItemsSamples) IOException(java.io.IOException) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Aggregations

VarOptItemsSamples (org.apache.datasketches.sampling.VarOptItemsSamples): 2, ExecException (org.apache.pig.backend.executionengine.ExecException): 2, DataBag (org.apache.pig.data.DataBag): 2, Tuple (org.apache.pig.data.Tuple): 2, IOException (java.io.IOException): 1, DataByteArray (org.apache.pig.data.DataByteArray): 1, Test (org.testng.annotations.Test): 1