Search in sources :

Example 36 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

The exec method of the class DataToItemsSketch.

// @formatter:off
/**
 * Top-level exec function.
 * This method accepts an input Tuple containing a Bag of one or more inner <b>Datum Tuples</b>
 * and returns a single <b>Sketch</b> as a <b>Sketch Tuple</b>.
 *
 * <p>If a large number of calls is anticipated, leveraging either the <i>Algebraic</i> or
 * <i>Accumulator</i> interfaces is recommended. Pig normally handles this automatically.
 *
 * <p>Internally, this method presents the inner <b>Datum Tuples</b> to a new <b>Union</b>,
 * the result of which is returned as a <b>Sketch Tuple</b>.
 *
 * <p>Types below are in the form: Java data type: Pig DataType
 *
 * <p><b>Input Tuple</b>
 * <ul>
 *   <li>Tuple: TUPLE (Must contain only one field)
 *     <ul>
 *       <li>index 0: DataBag: BAG (May be null, or contain 0 or more Inner Tuples)
 *         <ul>
 *           <li>index 0: Tuple: TUPLE <b>Datum Tuple</b></li>
 *           <li>...</li>
 *           <li>index n-1: Tuple: TUPLE <b>Datum Tuple</b></li>
 *         </ul>
 *       </li>
 *     </ul>
 *   </li>
 * </ul>
 *
 * <b>Datum Tuple</b>
 * <ul>
 *   <li>Tuple: TUPLE (Must contain only one field)
 *     <ul>
 *       <li>index 0: T: some suitable Pig type convertible to T</li>
 *     </ul>
 *   </li>
 * </ul>
 *
 * <b>Sketch Tuple</b>
 * <ul>
 *   <li>Tuple: TUPLE (Contains exactly 1 field)
 *     <ul>
 *       <li>index 0: DataByteArray: BYTEARRAY = a serialized ItemsSketch object.</li>
 *     </ul>
 *   </li>
 * </ul>
 *
 * @param inputTuple A tuple containing a single bag, containing Datum Tuples.
 * @return Sketch Tuple. If inputTuple is null or empty, if its bag field is null, or if the
 * union yields no result sketch, returns a serialized empty sketch.
 * @see "org.apache.pig.EvalFunc.exec(org.apache.pig.data.Tuple)"
 * @throws IOException from Pig.
 */
// @formatter:on
// TOP LEVEL EXEC
@Override
public Tuple exec(final Tuple inputTuple) throws IOException {
    // The exec is a stateless function. It operates on the input and returns a result.
    if (inputTuple != null && inputTuple.size() > 0) {
        final DataBag bag = (DataBag) inputTuple.get(0);
        // Field 0 may be null; iterating a null bag would throw a NullPointerException,
        // so a null bag is treated like an empty one and falls through to the empty sketch.
        if (bag != null) {
            // Use the configured k_ when it is positive, otherwise the library default.
            final ItemsUnion<T> union = k_ > 0 ? ItemsUnion.getInstance(k_, comparator_) : ItemsUnion.getInstance(comparator_);
            for (final Tuple innerTuple : bag) {
                union.update(extractValue(innerTuple.get(0)));
            }
            // The union may hand back null (presumably when nothing was fed to it);
            // in that case fall through and return an empty sketch instead.
            final ItemsSketch<T> resultSketch = union.getResultAndReset();
            if (resultSketch != null) {
                return tupleFactory_.newTuple(new DataByteArray(resultSketch.toByteArray(serDe_)));
            }
        }
    }
    // return empty sketch
    final ItemsSketch<T> sketch = k_ > 0 ? ItemsSketch.getInstance(k_, comparator_) : ItemsSketch.getInstance(comparator_);
    return tupleFactory_.newTuple(new DataByteArray(sketch.toByteArray(serDe_)));
}
Also used : DataBag(org.apache.pig.data.DataBag) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple)

Example 37 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

The exec method of the class FrequentStringsSketchToEstimates.

/**
 * Turns a serialized frequent-strings sketch into a bag of per-item estimates.
 *
 * <p>Field 0 of the input is read as a DataByteArray holding the serialized sketch. Each
 * frequent item reported by the sketch (for the configured error type) becomes one 4-field
 * tuple: item, estimate, lower bound, upper bound.
 *
 * @param input tuple whose field 0 is the serialized sketch
 * @return a bag with one tuple per frequent item, or null if input is null or has no fields
 * @throws IOException from Pig.
 */
@Override
public DataBag exec(final Tuple input) throws IOException {
    final boolean hasInput = (input != null) && (input.size() != 0);
    if (!hasInput) {
        return null;
    }
    final byte[] sketchBytes = ((DataByteArray) input.get(0)).get();
    final ItemsSketch<String> sketch = ItemsSketch.getInstance(Memory.wrap(sketchBytes), new ArrayOfStringsSerDe());
    final ItemsSketch.Row<String>[] rows = sketch.getFrequentItems(errorType);
    final DataBag estimates = BagFactory.getInstance().newDefaultBag();
    for (final ItemsSketch.Row<String> row : rows) {
        // One output tuple per frequent item: (item, estimate, lower bound, upper bound).
        final Tuple entry = TupleFactory.getInstance().newTuple(4);
        entry.set(0, row.getItem());
        entry.set(1, row.getEstimate());
        entry.set(2, row.getLowerBound());
        entry.set(3, row.getUpperBound());
        estimates.add(entry);
    }
    return estimates;
}
Also used : ArrayOfStringsSerDe(com.yahoo.sketches.ArrayOfStringsSerDe) DataBag(org.apache.pig.data.DataBag) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple)

Example 38 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

The accumulate method of the class DataToFrequentItemsSketch.

/**
 * Feeds one input tuple into the sketch held in {@code accumSketch_}.
 *
 * <p>The sketch is created on first use with {@code sketchSize_}. The input tuple must have
 * exactly one field, which is passed to {@code updateSketch} as a DataBag.
 *
 * @param inputTuple tuple whose single field is the bag of items to add
 * @throws IOException from Pig.
 * @throws IllegalArgumentException if the input tuple does not have exactly one field
 */
@Override
public void accumulate(final Tuple inputTuple) throws IOException {
    if (isFirstCall_) {
        // this is to see in the log which way was used by Pig
        final Logger log = Logger.getLogger(getClass());
        log.info("accumulate was used");
        isFirstCall_ = false;
    }
    // Lazily create the running sketch; this happens before the input is validated.
    if (accumSketch_ == null) {
        accumSketch_ = new ItemsSketch<T>(sketchSize_);
    }
    final int fieldCount = inputTuple.size();
    if (fieldCount != 1) {
        throw new IllegalArgumentException("Input tuple must have 1 bag");
    }
    updateSketch((DataBag) inputTuple.get(0), accumSketch_);
}
Also used : DataBag(org.apache.pig.data.DataBag)

Example 39 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

The exec method of the class DataToFrequentItemsSketchAlgebraicIntermediateFinal.

/**
 * Combines the contents of the input bag into a single new sketch and returns it serialized
 * as a tuple.
 *
 * <p>Each tuple in the bag (field 0 of the input) is dispatched on the type of its own
 * field 0: a DataBag is inserted item by item through
 * {@code DataToFrequentItemsSketch.updateSketch}; a DataByteArray is deserialized as a sketch
 * and merged. Anything else, including a null field, is rejected.
 *
 * @param inputTuple tuple whose field 0 is a bag of tuples holding either bags of raw items
 * or serialized sketches
 * @return the combined sketch serialized by {@code Util.serializeSketchToTuple}
 * @throws IOException from Pig.
 * @throws IllegalArgumentException if a tuple in the bag holds neither a DataBag nor a
 * DataByteArray in field 0
 */
@Override
public Tuple exec(final Tuple inputTuple) throws IOException {
    if (isFirstCall_) {
        // this is to see in the log which way was used by Pig
        Logger.getLogger(getClass()).info("algebraic was used");
        isFirstCall_ = false;
    }
    final ItemsSketch<T> sketch = new ItemsSketch<T>(sketchSize_);
    final DataBag bag = (DataBag) inputTuple.get(0);
    for (final Tuple dataTuple : bag) {
        final Object item = dataTuple.get(0);
        if (item instanceof DataBag) {
            // this is a bag from the Initial function.
            // just insert each item of the tuple into the sketch
            DataToFrequentItemsSketch.updateSketch((DataBag) item, sketch);
        } else if (item instanceof DataByteArray) {
            // This is a sketch from a prior call to the
            // Intermediate function. merge it with the
            // current sketch.
            final ItemsSketch<T> incomingSketch = Util.deserializeSketchFromTuple(dataTuple, serDe_);
            sketch.merge(incomingSketch);
        } else {
            // we should never get here.
            // A null item also lands here (instanceof is false for null), so build the type
            // name null-safely; otherwise a NullPointerException would mask the real error.
            final String typeName = (item == null) ? "null" : item.getClass().getName();
            throw new IllegalArgumentException("InputTuple.Field0: Bag contains unrecognized types: " + typeName);
        }
    }
    return Util.serializeSketchToTuple(sketch, serDe_);
}
Also used : DataBag(org.apache.pig.data.DataBag) ItemsSketch(com.yahoo.sketches.frequencies.ItemsSketch) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple)

Example 40 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

The execTest method of the class ReservoirSamplingTest.

/**
 * Runs ReservoirSampling.exec() twice on a growing bag: first with fewer items than the
 * reservoir size (all items are expected back), then with more items than the reservoir
 * size (the sample count is expected to be capped at k).
 */
@Test
public void execTest() throws IOException {
    // copies tests for accumulate() since that handles both data paths
    final int k = 32;
    final long n = 24;
    final DataBag items = BagFactory.getInstance().newDefaultBag();
    final TupleFactory tupleFactory = TupleFactory.getInstance();

    // first phase: n items, fewer than k
    long nextValue = 0;
    while (nextValue < n) {
        final Tuple pair = tupleFactory.newTuple(2);
        pair.set(0, nextValue);
        pair.set(1, Long.toString(-nextValue));
        items.add(pair);
        ++nextValue;
    }
    final Tuple input = tupleFactory.newTuple(items);
    final ReservoirSampling sampler = new ReservoirSampling(Integer.toString(k));
    Tuple output = sampler.exec(input);
    assertEquals(output.size(), 3, "Incorrect output size");
    assertEquals(output.get(0), n, "Incorrect number of samples seen");
    assertEquals(output.get(1), k, "Incorrect value of k");
    final DataBag firstSamples = (DataBag) output.get(2);
    assertEquals(firstSamples.size(), n);

    // second phase: add another n to the same bag (2n total, more than k) and repeat
    final long total = 2 * n;
    while (nextValue < total) {
        final Tuple pair = tupleFactory.newTuple(2);
        pair.set(0, nextValue);
        pair.set(1, Long.toString(-nextValue));
        items.add(pair);
        ++nextValue;
    }
    output = sampler.exec(input);
    assertEquals(output.get(0), total, "Incorrect number of samples seen");
    // unchanged
    assertEquals(output.get(1), k, "Incorrect value of k");
    final DataBag secondSamples = (DataBag) output.get(2);
    assertEquals(secondSamples.size(), Math.min(k, total));
}
Also used : DataBag(org.apache.pig.data.DataBag) TupleFactory(org.apache.pig.data.TupleFactory) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Aggregations

DataBag (org.apache.pig.data.DataBag)266 Tuple (org.apache.pig.data.Tuple)223 Test (org.testng.annotations.Test)142 DataByteArray (org.apache.pig.data.DataByteArray)103 IOException (java.io.IOException)20 Estimate (com.yahoo.sketches.pig.theta.Estimate)19 EvalFunc (org.apache.pig.EvalFunc)16 HllSketch (com.yahoo.sketches.hll.HllSketch)14 DoubleSummary (com.yahoo.sketches.tuple.DoubleSummary)13 DoubleSummaryDeserializer (com.yahoo.sketches.tuple.DoubleSummaryDeserializer)13 Test (org.junit.Test)13 ArrayOfStringsSerDe (com.yahoo.sketches.ArrayOfStringsSerDe)12 ArrayOfDoublesSketch (com.yahoo.sketches.tuple.ArrayOfDoublesSketch)12 ExecException (org.apache.pig.backend.executionengine.ExecException)12 ItemsSketch (com.yahoo.sketches.frequencies.ItemsSketch)11 ArrayOfDoublesUpdatableSketchBuilder (com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketchBuilder)11 Map (java.util.Map)11 ArrayOfDoublesUpdatableSketch (com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketch)10 ArrayList (java.util.ArrayList)10 HashMap (java.util.HashMap)10