Search in sources:

Example 81 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class DataToSketch method exec.

// @formatter:off
/**
 ***********************************************************************************************
 * Top-level exec function.
 * Takes an input Tuple holding a Bag of inner <b>Datum Tuples</b> and produces one
 * <b>Sketch</b>, wrapped in a <b>Sketch Tuple</b>.
 *
 * <p>When many invocations are expected, prefer the <i>Algebraic</i> or <i>Accumulator</i>
 * interfaces. Pig normally selects these automatically.
 *
 * <p>Internally, every inner <b>Datum Tuple</b> of the bag is fed into a freshly created
 * union, and the union's result is returned as a <b>Sketch Tuple</b>.
 *
 * <p><b>Input Tuple</b>
 * <ul>
 *   <li>Tuple: TUPLE (Must contain only one field)
 *     <ul>
 *       <li>index 0: DataBag: BAG (May contain 0 or more Inner Tuples)
 *         <ul>
 *           <li>index 0: Tuple: TUPLE <b>Datum Tuple</b></li>
 *           <li>...</li>
 *           <li>index n-1: Tuple: TUPLE <b>Datum Tuple</b></li>
 *         </ul>
 *       </li>
 *     </ul>
 *   </li>
 * </ul>
 *
 * <b>Datum Tuple</b>
 * <ul>
 *   <li>Tuple: TUPLE (Must contain only one field)
 *     <ul>
 *       <li>index 0: Java data type : Pig DataType: may be any one of:
 *         <ul>
 *           <li>Byte: BYTE</li>
 *           <li>Integer: INTEGER</li>
 *           <li>Long: LONG</li>
 *           <li>Float: FLOAT</li>
 *           <li>Double: DOUBLE</li>
 *           <li>String: CHARARRAY</li>
 *           <li>DataByteArray: BYTEARRAY</li>
 *         </ul>
 *       </li>
 *     </ul>
 *   </li>
 * </ul>
 *
 * <b>Sketch Tuple</b>
 * <ul>
 *   <li>Tuple: TUPLE (Contains exactly 1 field)
 *     <ul>
 *       <li>index 0: DataByteArray: BYTEARRAY = The serialization of a Sketch object.</li>
 *     </ul>
 *   </li>
 * </ul>
 *
 * @param inputTuple A tuple containing a single bag, containing Datum Tuples.
 * @return Sketch Tuple. When no bag can be extracted from inputTuple (it is null or empty),
 * the pre-built empty sketch tuple (8 bytes) is returned.
 * @see "org.apache.pig.EvalFunc.exec(org.apache.pig.data.Tuple)"
 * @throws IOException from Pig.
 */
// @formatter:on
// TOP LEVEL EXEC
@Override
public Tuple exec(final Tuple inputTuple) throws IOException {
    // The throws clause is dictated by the API.
    // exec is a stateless function: it works on its input, returns a result,
    // and may only call static functions.
    final Union accumulator = newUnion(nomEntries_, p_, seed_);
    final DataBag data = extractBag(inputTuple);
    if (data != null) {
        // Feed every element of the bag into the union, then emit its
        // compact result (getResult is called with true and a null destination).
        updateUnion(data, accumulator);
        return compactOrderedSketchToTuple(accumulator.getResult(true, null));
    }
    // No bag to process: hand back the empty sketch tuple (configured with parent).
    return emptyCompactOrderedSketchTuple_;
}
Also used : CompactSketch(com.yahoo.sketches.theta.CompactSketch) DataBag(org.apache.pig.data.DataBag) Union(com.yahoo.sketches.theta.Union)

Example 82 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class DataToFrequentStringsSketchTest method algebraicIntermediateFinalWrongType.

/**
 * Passing a bag whose tuple holds a Double to the IntermediateFinal stage is expected
 * to raise an IllegalArgumentException.
 */
@Test(expectedExceptions = IllegalArgumentException.class)
public void algebraicIntermediateFinalWrongType() throws Exception {
    final EvalFunc<Tuple> intermediateFinal = new DataToFrequentStringsSketch.IntermediateFinal("8");
    // The bag is required to hold tuples of bags or data byte arrays;
    // a tuple wrapping a double deliberately violates that.
    final Tuple invalidElement = TupleFactory.getInstance().newTuple(1.0);
    final DataBag invalidBag = BagFactory.getInstance().newDefaultBag();
    invalidBag.add(invalidElement);
    final Tuple wrappedBag = TupleFactory.getInstance().newTuple(invalidBag);
    intermediateFinal.exec(wrappedBag);
}
Also used : DataBag(org.apache.pig.data.DataBag) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Example 83 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class DataToFrequentStringsSketchTest method exec.

/**
 * Runs the UDF over a bag mixing single-field tuples and (item, count) tuples, then
 * deserializes the result and checks the per-item estimates.
 */
@Test
public void exec() throws Exception {
    final EvalFunc<Tuple> udf = new DataToFrequentStringsSketch("8");

    // Expected totals asserted below: "a" = 1 + 2 and "b" = 5 + 1.
    final Tuple[] rows = {
        PigUtil.objectsToTuple("a"),
        PigUtil.objectsToTuple("b", 5L),
        PigUtil.objectsToTuple("a", 2L),
        PigUtil.objectsToTuple("b")
    };
    final DataBag items = BagFactory.getInstance().newDefaultBag();
    for (final Tuple row : rows) {
        items.add(row);
    }

    final Tuple output = udf.exec(PigUtil.objectsToTuple(items));
    Assert.assertNotNull(output);
    Assert.assertEquals(output.size(), 1);

    final DataByteArray serialized = (DataByteArray) output.get(0);
    Assert.assertTrue(serialized.size() > 0);

    // Rebuild the sketch from the serialized bytes before inspecting it.
    final ItemsSketch<String> restored =
        ItemsSketch.getInstance(Memory.wrap(serialized.get()), new ArrayOfStringsSerDe());
    Assert.assertEquals(restored.getNumActiveItems(), 2);
    Assert.assertEquals(restored.getEstimate("a"), 3);
    Assert.assertEquals(restored.getEstimate("b"), 6);
}
Also used : ArrayOfStringsSerDe(com.yahoo.sketches.ArrayOfStringsSerDe) DataBag(org.apache.pig.data.DataBag) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Example 84 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class DataToFrequentStringsSketchTest method execWrongSizeOfInnerTuple.

/**
 * An inner tuple with no fields at all is expected to make exec throw an
 * IllegalArgumentException.
 */
@Test(expectedExceptions = IllegalArgumentException.class)
public void execWrongSizeOfInnerTuple() throws Exception {
    final EvalFunc<Tuple> udf = new DataToFrequentStringsSketch("8");
    final DataBag items = BagFactory.getInstance().newDefaultBag();
    // Zero-field inner tuple: the wrong size this test is about.
    final Tuple emptyRow = PigUtil.objectsToTuple();
    items.add(emptyRow);
    udf.exec(PigUtil.objectsToTuple(items));
}
Also used : DataBag(org.apache.pig.data.DataBag) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Example 85 with DataBag

use of org.apache.pig.data.DataBag in project sketches-pig by DataSketches.

the class DataToFrequentStringsSketchTest method accumulator.

/**
 * Feeds two batches through the Accumulator interface, checks the combined estimates,
 * then verifies that cleanup() leaves an empty sketch behind.
 */
@Test
public void accumulator() throws Exception {
    final Accumulator<Tuple> sketchAccumulator = new DataToFrequentStringsSketch("8");

    // First batch: a single occurrence of "a".
    final Tuple firstInput = TupleFactory.getInstance().newTuple(1);
    final DataBag firstBatch = BagFactory.getInstance().newDefaultBag();
    firstBatch.add(PigUtil.objectsToTuple("a"));
    firstInput.set(0, firstBatch);
    sketchAccumulator.accumulate(firstInput);

    // Second batch: brings the expected totals to "a" = 3 and "b" = 6.
    final Tuple secondInput = TupleFactory.getInstance().newTuple(1);
    final DataBag secondBatch = BagFactory.getInstance().newDefaultBag();
    secondBatch.add(PigUtil.objectsToTuple("b"));
    secondBatch.add(PigUtil.objectsToTuple("a", 2L));
    secondBatch.add(PigUtil.objectsToTuple("b", 5L));
    secondInput.set(0, secondBatch);
    sketchAccumulator.accumulate(secondInput);

    final Tuple accumulated = sketchAccumulator.getValue();
    Assert.assertNotNull(accumulated);
    Assert.assertEquals(accumulated.size(), 1);
    final DataByteArray accumulatedBytes = (DataByteArray) accumulated.get(0);
    Assert.assertTrue(accumulatedBytes.size() > 0);
    final ItemsSketch<String> populated =
        ItemsSketch.getInstance(Memory.wrap(accumulatedBytes.get()), new ArrayOfStringsSerDe());
    Assert.assertEquals(populated.getNumActiveItems(), 2);
    Assert.assertEquals(populated.getEstimate("a"), 3);
    Assert.assertEquals(populated.getEstimate("b"), 6);

    // Once cleanup() has run, getValue() must yield an empty sketch.
    sketchAccumulator.cleanup();
    final Tuple afterCleanup = sketchAccumulator.getValue();
    Assert.assertNotNull(afterCleanup);
    Assert.assertEquals(afterCleanup.size(), 1);
    final DataByteArray clearedBytes = (DataByteArray) afterCleanup.get(0);
    Assert.assertTrue(clearedBytes.size() > 0);
    final ItemsSketch<String> cleared =
        ItemsSketch.getInstance(Memory.wrap(clearedBytes.get()), new ArrayOfStringsSerDe());
    Assert.assertTrue(cleared.isEmpty());
    Assert.assertEquals(cleared.getNumActiveItems(), 0);
}
Also used : ArrayOfStringsSerDe(com.yahoo.sketches.ArrayOfStringsSerDe) DataBag(org.apache.pig.data.DataBag) DataByteArray(org.apache.pig.data.DataByteArray) Tuple(org.apache.pig.data.Tuple) Test(org.testng.annotations.Test)

Aggregations

DataBag (org.apache.pig.data.DataBag)266 Tuple (org.apache.pig.data.Tuple)223 Test (org.testng.annotations.Test)142 DataByteArray (org.apache.pig.data.DataByteArray)103 IOException (java.io.IOException)20 Estimate (com.yahoo.sketches.pig.theta.Estimate)19 EvalFunc (org.apache.pig.EvalFunc)16 HllSketch (com.yahoo.sketches.hll.HllSketch)14 DoubleSummary (com.yahoo.sketches.tuple.DoubleSummary)13 DoubleSummaryDeserializer (com.yahoo.sketches.tuple.DoubleSummaryDeserializer)13 Test (org.junit.Test)13 ArrayOfStringsSerDe (com.yahoo.sketches.ArrayOfStringsSerDe)12 ArrayOfDoublesSketch (com.yahoo.sketches.tuple.ArrayOfDoublesSketch)12 ExecException (org.apache.pig.backend.executionengine.ExecException)12 ItemsSketch (com.yahoo.sketches.frequencies.ItemsSketch)11 ArrayOfDoublesUpdatableSketchBuilder (com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketchBuilder)11 Map (java.util.Map)11 ArrayOfDoublesUpdatableSketch (com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketch)10 ArrayList (java.util.ArrayList)10 HashMap (java.util.HashMap)10