Search in sources :

Example 76 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class DataToStringsSketchTest method execNormalCase.

/**
 * Runs the UDF over a bag holding the single item "a" and checks that the
 * sketch recovered from the result is non-empty and reports exactly one item.
 */
@Test
public void execNormalCase() throws Exception {
    // Arrange: one-element bag wrapped in the input tuple.
    final DataBag inputBag = BAG_FACTORY.newDefaultBag();
    inputBag.add(TUPLE_FACTORY.newTuple("a"));
    final Tuple input = TUPLE_FACTORY.newTuple(inputBag);

    // Act: execute the UDF and decode the sketch from its output tuple.
    final EvalFunc<Tuple> udf = new DataToStringsSketch();
    final ItemsSketch<String> result = getSketch(udf.exec(input));

    // Assert: exactly one item has been seen.
    Assert.assertFalse(result.isEmpty());
    Assert.assertEquals(result.getN(), 1);
}
Also used : DataBag(org.apache.pig.data.DataBag) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Example 77 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class ReservoirUnion method accumulate.

// exec() could be overloaded for the easy cases, but each incoming reservoir's k
// still has to be compared against max k, with possible downsampling.
/**
 * Merges every reservoir found in the input bag into the running union.
 *
 * <p>A null tuple, an empty tuple, or a tuple whose first field is null is ignored.
 * Otherwise field 0 is cast to a bag; each of its tuples is read as
 * (n as long, k as int, samples as bag) and handed to the union, which is created
 * lazily on first use.
 *
 * <p>NOTE(review): a null entry inside the bag is not guarded here and would
 * surface as a NullPointerException rather than an ExecException.
 *
 * @param inputTuple a tuple whose first field is a bag of reservoir tuples
 * @throws IOException as an ExecException when an IndexOutOfBoundsException is
 *         raised while processing the reservoirs (e.g. a tuple that is too short)
 */
@Override
public void accumulate(final Tuple inputTuple) throws IOException {
    final boolean nothingToMerge =
            inputTuple == null || inputTuple.size() < 1 || inputTuple.isNull(0);
    if (nothingToMerge) {
        return;
    }

    final DataBag reservoirs = (DataBag) inputTuple.get(0);

    // Create the union only once there is a bag to merge into it.
    if (union_ == null) {
        union_ = ReservoirItemsUnion.newInstance(maxK_);
    }

    try {
        for (final Tuple reservoir : reservoirs) {
            // Fields are read in order: n, k, then the bag of samples.
            final long itemsSeen = (long) reservoir.get(0);
            final int reservoirK = (int) reservoir.get(1);
            final ArrayList<Tuple> items =
                    ReservoirSampling.dataBagToArrayList((DataBag) reservoir.get(2));
            union_.update(itemsSeen, reservoirK, items);
        }
    } catch (final IndexOutOfBoundsException e) {
        throw new ExecException("Cannot update union with given reservoir", e);
    }
}
Also used : DataBag(org.apache.pig.data.DataBag) ExecException(org.apache.pig.backend.executionengine.ExecException) Tuple(org.apache.pig.data.Tuple)

Example 78 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class ReservoirUnion method getValue.

/**
 * Packages the current union result as a Pig result tuple.
 *
 * @return the tuple built from the result sketch's n, k and sample bag, or
 *         {@code null} if no union has been created yet
 */
@Override
public Tuple getValue() {
    if (union_ == null) {
        return null;
    }

    final ReservoirItemsSketch<Tuple> merged = union_.getResult();
    final List<Tuple> rawSamples = SamplingPigUtil.getRawSamplesAsList(merged);

    // newDefaultBag(List<Tuple>) does *not* copy values
    final DataBag samples = BagFactory.getInstance().newDefaultBag(rawSamples);

    return ReservoirSampling.createResultTuple(merged.getN(), merged.getK(), samples);
}
Also used : DataBag(org.apache.pig.data.DataBag) Tuple(org.apache.pig.data.Tuple)

Example 79 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class VarOptUnion method accumulate.

// exec() could be overloaded for the easy cases, but the incoming sketch's k
// still has to be compared against max k, with possible downsampling.
/**
 * Merges every serialized sketch found in the input bag into the running union.
 *
 * <p>A null tuple, an empty tuple, or a tuple whose first field is null is ignored.
 * Otherwise field 0 is cast to a bag; the first field of each of its tuples is cast
 * to a DataByteArray, wrapped as Memory and passed to the union together with
 * SERDE. The union is created lazily on first use.
 *
 * @param inputTuple a tuple whose first field is a bag of single-field tuples,
 *        each holding a serialized sketch
 * @throws IOException by Pig
 */
@Override
public void accumulate(final Tuple inputTuple) throws IOException {
    final boolean hasBag =
            inputTuple != null && inputTuple.size() >= 1 && !inputTuple.isNull(0);
    if (!hasBag) {
        return;
    }

    final DataBag sketches = (DataBag) inputTuple.get(0);

    // Create the union only once there is a bag to merge into it.
    if (union_ == null) {
        union_ = VarOptItemsUnion.newInstance(maxK_);
    }

    for (final Tuple entry : sketches) {
        final byte[] serialized = ((DataByteArray) entry.get(0)).get();
        union_.update(Memory.wrap(serialized), SERDE);
    }
}
Also used : DataBag(org.apache.pig.data.DataBag) Memory(com.yahoo.memory.Memory) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple)

Example 80 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class DataToSketch method accumulate.

// ACCUMULATOR INTERFACE
/**
 * <i>Accumulator</i> counterpart of the standard <i>exec()</i> method. As with
 * <i>exec()</i>, the input is a bag of Datum Tuples; unlike <i>exec()</i>, the sketch is
 * not serialized at the end. The method may therefore be invoked repeatedly, each call
 * feeding another bag of Datum Tuples into the same union.
 *
 * <p>The union is created lazily on the first call, before the bag is extracted. A call
 * whose bag extracts to {@code null} leaves the union otherwise untouched.
 *
 * @param inputTuple A tuple containing a single bag, containing Datum Tuples.
 * @see #exec
 * @see "org.apache.pig.Accumulator.accumulate(org.apache.pig.data.Tuple)"
 * @throws IOException by Pig
 */
@Override
public void accumulate(final Tuple inputTuple) throws IOException { // throws is in API
    if (accumUnion_ == null) {
        accumUnion_ = DataToSketch.newUnion(nomEntries_, p_, seed_);
    }

    final DataBag data = extractBag(inputTuple);
    if (data != null) {
        updateUnion(data, accumUnion_);
    }
}
Also used : DataBag(org.apache.pig.data.DataBag)

Aggregations

DataBag (org.apache.pig.data.DataBag)266 Tuple (org.apache.pig.data.Tuple)223 Test (org.testng.annotations.Test)142 DataByteArray (org.apache.pig.data.DataByteArray)103 IOException (java.io.IOException)20 Estimate (com.yahoo.sketches.pig.theta.Estimate)19 EvalFunc (org.apache.pig.EvalFunc)16 HllSketch (com.yahoo.sketches.hll.HllSketch)14 DoubleSummary (com.yahoo.sketches.tuple.DoubleSummary)13 DoubleSummaryDeserializer (com.yahoo.sketches.tuple.DoubleSummaryDeserializer)13 Test (org.junit.Test)13 ArrayOfStringsSerDe (com.yahoo.sketches.ArrayOfStringsSerDe)12 ArrayOfDoublesSketch (com.yahoo.sketches.tuple.ArrayOfDoublesSketch)12 ExecException (org.apache.pig.backend.executionengine.ExecException)12 ItemsSketch (com.yahoo.sketches.frequencies.ItemsSketch)11 ArrayOfDoublesUpdatableSketchBuilder (com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketchBuilder)11 Map (java.util.Map)11 ArrayOfDoublesUpdatableSketch (com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketch)10 ArrayList (java.util.ArrayList)10 HashMap (java.util.HashMap)10