Search in sources :

Example 1 with DataBag

use of org.apache.pig.data.DataBag in project elephant-bird by twitter.

the class Fixtures method buildPersonTuple.

/**
 * Builds the Pig tuple fixture for a person: name, id, email and a bag of phone-number tuples.
 *
 * @return a four-field tuple whose last field is a bag holding three phone-number tuples
 * @throws ExecException if setting a field on the tuple fails
 */
public static Tuple buildPersonTuple() throws ExecException {
    // The first phone number is deliberately created with a null type.
    Tuple untypedPhone = makePhoneNumberTuple("415-999-9999", null);
    Tuple mobilePhone = makePhoneNumberTuple("415-666-6666", "MOBILE");
    Tuple workPhone = makePhoneNumberTuple("415-333-3333", "WORK");
    DataBag phones = new NonSpillableDataBag(Lists.newArrayList(untypedPhone, mobilePhone, workPhone));

    Tuple person = tf_.newTuple(4);
    person.set(0, "Elephant Bird");
    person.set(1, 123);
    person.set(2, "elephant@bird.com");
    person.set(3, phones);
    return person;
}
Also used : NonSpillableDataBag(org.apache.pig.data.NonSpillableDataBag) DataBag(org.apache.pig.data.DataBag) Tuple(org.apache.pig.data.Tuple)

Example 2 with DataBag

use of org.apache.pig.data.DataBag in project elephant-bird by twitter.

the class TestInvoker method testArrayConversion.

/**
 * Checks that a bag argument is handed to the invoked static method as an array,
 * for both a {@code double[]} and a {@code string[]} parameter.
 */
@Test
public void testArrayConversion() throws SecurityException, ClassNotFoundException, NoSuchMethodException, IOException {
    // Invoke a method declared with a double[] parameter using a bag of doubles.
    InvokeForInt avgInvoker = new InvokeForInt(TestInvoker.class.getName() + ".avg", "double[]");
    DataBag doubleBag = newSimpleBag(1.0, 2.0, 3.0);
    Tuple doubleArgs = tf_.newTuple(doubleBag);
    assertEquals(Integer.valueOf(2), avgInvoker.exec(doubleArgs));

    // Invoke a method declared with a string[] parameter using a bag of strings.
    InvokeForString concatInvoker = new InvokeForString(TestInvoker.class.getName() + ".concatStringArray", "string[]");
    DataBag stringBag = newSimpleBag("foo", "bar", "baz");
    Tuple stringArgs = tf_.newTuple(stringBag);
    assertEquals("foobarbaz", concatInvoker.exec(stringArgs));
}
Also used : InvokeForString(com.twitter.elephantbird.pig.piggybank.InvokeForString) DataBag(org.apache.pig.data.DataBag) NonSpillableDataBag(org.apache.pig.data.NonSpillableDataBag) InvokeForInt(com.twitter.elephantbird.pig.piggybank.InvokeForInt) Test(org.junit.Test)

Example 3 with DataBag

use of org.apache.pig.data.DataBag in project elephant-bird by twitter.

the class ProtobufToPig method messageToTuple.

/**
 * Translate a nested message to a tuple.  If the field is repeated, it walks the list and adds each to a bag.
 * Otherwise, it just adds the given one.
 * @param fieldDescriptor the descriptor object for the given field.
 * @param fieldValue the object representing the value of this field, possibly null.
 * @return the object representing fieldValue in Pig -- either a bag or a tuple --
 *         or null when fieldValue itself is null.
 */
protected Object messageToTuple(FieldDescriptor fieldDescriptor, Object fieldValue) {
    if (fieldValue == null) {
        // protobufs unofficially ensures values are not null. just in case:
        return null;
    }
    assert fieldDescriptor.getType() == FieldDescriptor.Type.MESSAGE : "messageToTuple called with field of type " + fieldDescriptor.getType();

    if (!fieldDescriptor.isRepeated()) {
        return new ProtobufTuple((Message) fieldValue);
    }

    // The protobuf contract is that if the field is repeated, then the object returned is actually a List
    // of the underlying datatype, which in this case is a nested message.
    // fieldValue is known to be non-null here (see the guard above), so no empty-list fallback is needed.
    @SuppressWarnings("unchecked")
    List<Message> messageList = (List<Message>) fieldValue;
    DataBag bag = new NonSpillableDataBag(messageList.size());
    for (Message m : messageList) {
        bag.add(new ProtobufTuple(m));
    }
    return bag;
}
Also used : DataBag(org.apache.pig.data.DataBag) NonSpillableDataBag(org.apache.pig.data.NonSpillableDataBag) Message(com.google.protobuf.Message) List(java.util.List)

Example 4 with DataBag

use of org.apache.pig.data.DataBag in project elephant-bird by twitter.

the class TestProtoToPig method testLazyProtoToPig.

/**
 * Compares the lazily evaluated {@link ProtobufTuple} for the Person fixture, and a projection of it
 * built from {@code evenFields(...)}, against the hand-built tuple from {@code Fixtures.buildPersonTuple()}.
 */
@Test
public void testLazyProtoToPig() throws ExecException {
    Person person = Fixtures.buildPersonProto();
    Tuple lazyTuple = new ProtobufTuple(person);
    Tuple expectedTuple = Fixtures.buildPersonTuple();
    List<FieldDescriptor> descriptors = person.getDescriptorForType().getFields();
    TypeRef<Person> personRef = PigUtil.getProtobufTypeRef(Person.class.getName());
    ProjectedProtobufTupleFactory<Person> projectedFactory = new ProjectedProtobufTupleFactory<Person>(personRef, evenFields(descriptors));
    Tuple projectedTuple = projectedFactory.newTuple(person);

    // Counts only the fields that were actually compared (bag-valued fields are not counted).
    int compared = 0;
    for (FieldDescriptor fd : descriptors) {
        int pos = fd.getIndex();
        Object expected = expectedTuple.get(pos);
        // Bag-valued fields are skipped: per the original note, the protobuf conversion
        // gives us non-null fields, which are not equal to the null fields in the fixture.
        if (!(expected instanceof DataBag)) {
            assertEquals(lazyTuple.get(pos), expected);
            if (compared % 2 == 0) {
                // Every other compared field is looked up in the projected tuple at half its index.
                assertEquals(projectedTuple.get(pos / 2), expected);
            }
            compared++;
        }
    }
}
Also used : DataBag(org.apache.pig.data.DataBag) ProtobufTuple(com.twitter.elephantbird.pig.util.ProtobufTuple) Person(com.twitter.data.proto.tutorial.AddressBookProtos.Person) Tuple(org.apache.pig.data.Tuple) FieldDescriptor(com.google.protobuf.Descriptors.FieldDescriptor) Test(org.junit.Test)

Example 5 with DataBag

use of org.apache.pig.data.DataBag in project elephant-bird by twitter.

the class VectorWritableConverter method convertSparseVectorDataToVector.

/**
 * Converts sparse vector data held in a Pig tuple into a Mahout {@link Vector}.
 *
 * <p>A two-field tuple is read as (cardinality, entries); any other tuple is read as (entries)
 * and then requires the converter's own {@code cardinality} to be set. When the converter's
 * {@code cardinality} is set it always takes precedence over the one carried by the input.
 * Entries whose index falls outside {@code [0, size)} are counted and skipped.
 *
 * @param value tuple carrying the bag of (index, value) entries, optionally preceded by a cardinality
 * @return a dense or random-access sparse vector, wrapped as a sequential-access sparse vector
 *         when {@code sequential} is set
 * @throws IOException declared for failures while reading the tuple or validating an entry
 */
private Vector convertSparseVectorDataToVector(Tuple value) throws IOException {
    // determine output vector size and fetch bag containing entries from input
    final int vectorSize;
    final DataBag entryBag;
    if (value.size() == 2) {
        // the input always has its cardinality read (and unboxed), even when the instance overrides it
        int inputCardinality = (Integer) value.get(0);
        vectorSize = (cardinality != null) ? cardinality : inputCardinality;
        entryBag = (DataBag) value.get(1);
    } else {
        Preconditions.checkNotNull(cardinality, "Cardinality is undefined");
        vectorSize = cardinality;
        entryBag = (DataBag) value.get(0);
    }

    // create vector, allowing conversion of sparse input vector data to dense output vector
    // TODO(Andy Schlaikjer): Test for OOM before it happens (dense case)
    Vector result = dense
        ? new DenseVector(vectorSize)
        // more efficient to build sparse vector with this impl
        : new RandomAccessSparseVector(vectorSize);

    // populate vector
    for (Tuple entry : entryBag) {
        validateSparseVectorEntryData(entry);
        int index = (Integer) entry.get(0);
        boolean inBounds = index >= 0 && index < vectorSize;
        if (inBounds) {
            result.setQuick(index, ((Number) entry.get(1)).doubleValue());
        } else {
            // out-of-range indices are counted rather than treated as errors
            counterHelper.incrCounter(Counter.INDEX_OUT_OF_BOUNDS, 1);
        }
    }

    // convert to (sparse) sequential vector if requested
    return sequential ? new SequentialAccessSparseVector(result) : result;
}
Also used : RandomAccessSparseVector(org.apache.mahout.math.RandomAccessSparseVector) DataBag(org.apache.pig.data.DataBag) SequentialAccessSparseVector(org.apache.mahout.math.SequentialAccessSparseVector) DenseVector(org.apache.mahout.math.DenseVector) Vector(org.apache.mahout.math.Vector) Tuple(org.apache.pig.data.Tuple)

Aggregations

DataBag (org.apache.pig.data.DataBag) 266, Tuple (org.apache.pig.data.Tuple) 223, Test (org.testng.annotations.Test) 142, DataByteArray (org.apache.pig.data.DataByteArray) 103, IOException (java.io.IOException) 20, Estimate (com.yahoo.sketches.pig.theta.Estimate) 19, EvalFunc (org.apache.pig.EvalFunc) 16, HllSketch (com.yahoo.sketches.hll.HllSketch) 14, DoubleSummary (com.yahoo.sketches.tuple.DoubleSummary) 13, DoubleSummaryDeserializer (com.yahoo.sketches.tuple.DoubleSummaryDeserializer) 13, Test (org.junit.Test) 13, ArrayOfStringsSerDe (com.yahoo.sketches.ArrayOfStringsSerDe) 12, ArrayOfDoublesSketch (com.yahoo.sketches.tuple.ArrayOfDoublesSketch) 12, ExecException (org.apache.pig.backend.executionengine.ExecException) 12, ItemsSketch (com.yahoo.sketches.frequencies.ItemsSketch) 11, ArrayOfDoublesUpdatableSketchBuilder (com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketchBuilder) 11, Map (java.util.Map) 11, ArrayOfDoublesUpdatableSketch (com.yahoo.sketches.tuple.ArrayOfDoublesUpdatableSketch) 10, ArrayList (java.util.ArrayList) 10, HashMap (java.util.HashMap) 10