Search in sources :

Example 96 with DataBag

Use of org.apache.pig.data.DataBag in the sketches-pig project by DataSketches.

From the class DataToArrayOfDoublesSketchBase, method exec:

/**
 * Builds a new sketch from the bag held in the input tuple and returns it in serialized form.
 *
 * @param inputTuple tuple expected to carry exactly one field, a bag of input data
 * @return a tuple wrapping the bytes of the compacted sketch, or {@code null} when the
 *     input tuple is {@code null} or has no fields
 * @throws IOException declared by the overridden method
 * @throws IllegalArgumentException if the input tuple has more than one field
 */
@Override
public Tuple exec(final Tuple inputTuple) throws IOException {
    if (isFirstCall_) {
        // one-time log entry that records which code path Pig selected
        Logger.getLogger(getClass()).info("exec is used");
        isFirstCall_ = false;
    }
    if (inputTuple == null) {
        return null;
    }
    final int fieldCount = inputTuple.size();
    if (fieldCount == 0) {
        return null;
    }
    if (fieldCount != 1) {
        throw new IllegalArgumentException("Input tuple must have 1 bag");
    }
    // configure the sketch from this instance's size, sampling and value-count settings
    final ArrayOfDoublesUpdatableSketch updatable = new ArrayOfDoublesUpdatableSketchBuilder()
            .setNominalEntries(sketchSize_)
            .setSamplingProbability(samplingProbability_)
            .setNumberOfValues(numValues_)
            .build();
    final DataBag inputBag = (DataBag) inputTuple.get(0);
    updateSketch(inputBag, updatable, numValues_);
    final byte[] serialized = updatable.compact().toByteArray();
    return Util.tupleFactory.newTuple(new DataByteArray(serialized));
}
Also used : ArrayOfDoublesUpdatableSketchBuilder(com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketchBuilder) ArrayOfDoublesUpdatableSketch(com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketch) DataBag(org.apache.pig.data.DataBag) DataByteArray(org.apache.pig.data.DataByteArray)

Example 97 with DataBag

Use of org.apache.pig.data.DataBag in the sketches-pig project by DataSketches.

From the class DataToSketch, method accumulate:

/**
 * Feeds one batch of input into the sketch that is accumulated across calls.
 *
 * <p>The accumulated sketch is created lazily on the first call. A {@code null} input tuple
 * is treated as "nothing to accumulate" and skipped rather than failing with a
 * NullPointerException.
 *
 * @param inputTuple tuple expected to carry exactly one field, a bag of input data;
 *     {@code null} is ignored
 * @throws IOException declared by the overridden method
 * @throws IllegalArgumentException if a non-null input tuple does not have exactly one field
 */
@Override
public void accumulate(final Tuple inputTuple) throws IOException {
    if (isFirstCall_) {
        // this is to see in the log which way was used by Pig
        Logger.getLogger(getClass()).info("accumulate is used");
        isFirstCall_ = false;
    }
    if (accumSketch_ == null) {
        accumSketch_ = sketchBuilder_.build();
    }
    // a null tuple carries no data; skip it instead of dereferencing it below
    if (inputTuple == null) {
        return;
    }
    if (inputTuple.size() != 1) {
        throw new IllegalArgumentException("Input tuple must have 1 bag");
    }
    final DataBag bag = (DataBag) inputTuple.get(0);
    updateSketch(bag, accumSketch_);
}
Also used : DataBag(org.apache.pig.data.DataBag)

Example 98 with DataBag

Use of org.apache.pig.data.DataBag in the sketches-pig project by DataSketches.

From the class DataToSketch, method exec:

/**
 * Builds a new sketch from the bag held in the input tuple and returns it in serialized form.
 *
 * @param inputTuple tuple expected to carry exactly one field, a bag of input data
 * @return a tuple wrapping the bytes of the compacted sketch, or {@code null} when the
 *     input tuple is {@code null} or has no fields
 * @throws IOException declared by the overridden method
 * @throws IllegalArgumentException if the input tuple has more than one field
 */
@Override
public Tuple exec(final Tuple inputTuple) throws IOException {
    if (isFirstCall_) {
        // one-time log entry that records which code path Pig selected
        Logger.getLogger(getClass()).info("exec is used");
        isFirstCall_ = false;
    }
    if (inputTuple == null) {
        return null;
    }
    final int fieldCount = inputTuple.size();
    if (fieldCount == 0) {
        return null;
    }
    if (fieldCount != 1) {
        throw new IllegalArgumentException("Input tuple must have 1 bag");
    }
    final UpdatableSketch<U, S> updatable = sketchBuilder_.build();
    final DataBag inputBag = (DataBag) inputTuple.get(0);
    updateSketch(inputBag, updatable);
    final byte[] serialized = updatable.compact().toByteArray();
    return Util.tupleFactory.newTuple(new DataByteArray(serialized));
}
Also used : DEFAULT_NOMINAL_ENTRIES(com.yahoo.sketches.Util.DEFAULT_NOMINAL_ENTRIES) DataBag(org.apache.pig.data.DataBag) DataByteArray(org.apache.pig.data.DataByteArray)

Example 99 with DataBag

Use of org.apache.pig.data.DataBag in the sketches-pig project by DataSketches.

From the class DataToSketchAlgebraicIntermediateFinal, method exec:

/**
 * Merges everything found in the input bag into one union and returns the serialized result.
 *
 * <p>Each tuple in the bag is inspected by its first field: a nested bag is turned into a
 * new sketch before being merged, while a byte array is deserialized as a sketch and merged
 * directly. Any other content, including a {@code null} first field, is rejected.
 *
 * @param inputTuple tuple whose first field is the bag to process
 * @return a tuple wrapping the bytes of the union result
 * @throws IOException declared by the overridden method
 * @throws IllegalArgumentException if the bag is {@code null}, or if a tuple in the bag holds
 *     something other than a bag or a byte array in its first field
 */
@Override
public Tuple exec(final Tuple inputTuple) throws IOException {
    if (isFirstCall_) {
        // this is to see in the log which way was used by Pig
        Logger.getLogger(getClass()).info("algebraic is used");
        isFirstCall_ = false;
    }
    final Union<S> union = new Union<S>(sketchSize_, summarySetOps_);
    final DataBag bag = (DataBag) inputTuple.get(0);
    if (bag == null) {
        throw new IllegalArgumentException("InputTuple.Field0: Bag may not be null");
    }
    for (final Tuple dataTuple : bag) {
        final Object item = dataTuple.get(0);
        if (item instanceof DataBag) {
            // this is a bag from the Initial function.
            // just insert each item of the tuple into the sketch
            final UpdatableSketch<U, S> sketch = sketchBuilder_.build();
            DataToSketch.updateSketch((DataBag) item, sketch);
            union.update(sketch);
        } else if (item instanceof DataByteArray) {
            // This is a sketch from a prior call to the
            // Intermediate function. merge it with the
            // current sketch.
            final Sketch<S> incomingSketch = Util.deserializeSketchFromTuple(dataTuple, summaryDeserializer_);
            union.update(incomingSketch);
        } else {
            // we should never get here.
            // A null item also lands here (instanceof is false for null), so the type name
            // must be derived null-safely or the intended exception is masked by an NPE.
            final String typeName = (item == null) ? "null" : item.getClass().getName();
            throw new IllegalArgumentException("InputTuple.Field0: Bag contains unrecognized types: " + typeName);
        }
    }
    return Util.tupleFactory.newTuple(new DataByteArray(union.getResult().toByteArray()));
}
Also used : DEFAULT_NOMINAL_ENTRIES(com.yahoo.sketches.Util.DEFAULT_NOMINAL_ENTRIES) DataBag(org.apache.pig.data.DataBag) Sketch(com.yahoo.sketches.tuple.Sketch) UpdatableSketch(com.yahoo.sketches.tuple.UpdatableSketch) DataByteArray(org.apache.pig.data.DataByteArray) Union(com.yahoo.sketches.tuple.Union) Tuple(org.apache.pig.data.Tuple)

Example 100 with DataBag

Use of org.apache.pig.data.DataBag in the sketches-pig project by DataSketches.

From the class UnionDoublesSketch, method exec:

// @formatter:off
/**
 * Top-level exec function.
 * Takes an input Tuple holding a Bag of inner <b>Sketch Tuples</b>, merges them, and hands
 * back the merged <b>Sketch</b> wrapped in a <b>Sketch Tuple</b>.
 *
 * <p>When many calls are expected, prefer the <i>Algebraic</i> or <i>Accumulator</i>
 * interfaces. Pig normally selects these automatically.
 *
 * <p>Internally, a new <b>Union</b> is built for every call and the inner
 * <b>Sketch Tuples</b> are presented to it. The union's result is what gets returned.
 *
 * <p>Types below are written as: Java data type: Pig DataType
 *
 * <p><b>Input Tuple</b>
 * <ul>
 *   <li>Tuple: TUPLE (Must contain only one field)
 *     <ul>
 *       <li>index 0: DataBag: BAG (May contain 0 or more Inner Tuples)
 *         <ul>
 *           <li>index 0: Tuple: TUPLE <b>Sketch Tuple</b></li>
 *           <li>...</li>
 *           <li>index n-1: Tuple: TUPLE <b>Sketch Tuple</b></li>
 *         </ul>
 *       </li>
 *     </ul>
 *   </li>
 * </ul>
 *
 * <b>Sketch Tuple</b>
 * <ul>
 *   <li>Tuple: TUPLE (Contains exactly 1 field)
 *     <ul>
 *       <li>index 0: DataByteArray: BYTEARRAY = The serialization of a Sketch object.</li>
 *     </ul>
 *   </li>
 * </ul>
 *
 * @param inputTuple A tuple containing a single bag, containing Sketch Tuples.
 * @return Sketch Tuple. An empty sketch is returned when inputTuple is null or empty, or
 *     when the union yields no result.
 * @see "org.apache.pig.EvalFunc.exec(org.apache.pig.data.Tuple)"
 */
// @formatter:on
// TOP LEVEL EXEC
@Override
public Tuple exec(final Tuple inputTuple) throws IOException {
    // Stateless: everything is derived from the input of this one call.
    final boolean hasInput = (inputTuple != null) && (inputTuple.size() > 0);
    DoublesSketch merged = null;
    if (hasInput) {
        final DoublesUnion merger = unionBuilder_.build();
        final DataBag sketchBag = (DataBag) inputTuple.get(0);
        updateUnion(sketchBag, merger);
        merged = merger.getResultAndReset();
    }
    final byte[] serialized;
    if (merged != null) {
        serialized = merged.toByteArray(true);
    } else {
        // nothing was merged: fall back to the result of a freshly built union (empty sketch)
        serialized = unionBuilder_.build().getResult().toByteArray(true);
    }
    return tupleFactory_.newTuple(new DataByteArray(serialized));
}
Also used : DoublesSketch(com.yahoo.sketches.quantiles.DoublesSketch) DoublesUnion(com.yahoo.sketches.quantiles.DoublesUnion) DataBag(org.apache.pig.data.DataBag) DataByteArray(org.apache.pig.data.DataByteArray)

Aggregations

DataBag (org.apache.pig.data.DataBag) 266, Tuple (org.apache.pig.data.Tuple) 223, Test (org.testng.annotations.Test) 142, DataByteArray (org.apache.pig.data.DataByteArray) 103, IOException (java.io.IOException) 20, Estimate (com.yahoo.sketches.pig.theta.Estimate) 19, EvalFunc (org.apache.pig.EvalFunc) 16, HllSketch (com.yahoo.sketches.hll.HllSketch) 14, DoubleSummary (com.yahoo.sketches.tuple.DoubleSummary) 13, DoubleSummaryDeserializer (com.yahoo.sketches.tuple.DoubleSummaryDeserializer) 13, Test (org.junit.Test) 13, ArrayOfStringsSerDe (com.yahoo.sketches.ArrayOfStringsSerDe) 12, ArrayOfDoublesSketch (com.yahoo.sketches.tuple.ArrayOfDoublesSketch) 12, ExecException (org.apache.pig.backend.executionengine.ExecException) 12, ItemsSketch (com.yahoo.sketches.frequencies.ItemsSketch) 11, ArrayOfDoublesUpdatableSketchBuilder (com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketchBuilder) 11, Map (java.util.Map) 11, ArrayOfDoublesUpdatableSketch (com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketch) 10, ArrayList (java.util.ArrayList) 10, HashMap (java.util.HashMap) 10