Search in sources:

Example 16 with DataBag

use of org.apache.pig.data.DataBag in project varaha by thedatachef.

From the class LDATopics, method exec:

/**
 * Runs LDA topic discovery over a bag of documents.
 *
 * @param input tuple of (numTopics:int, documents:bag of (doc_id, text))
 * @return a bag of (topicNum, {(word, weight)}) tuples, one per discovered
 *         topic, or null when the input is missing or has null fields
 * @throws IOException on Pig tuple access errors
 */
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 2 || input.isNull(0) || input.isNull(1))
        return null;
    // Number of topics to discover
    Integer numTopics = (Integer) input.get(0);
    // Documents, {(doc_id, text)}
    DataBag documents = (DataBag) input.get(1);
    DataBag result = BagFactory.getInstance().newDefaultBag();
    InstanceList instances = new InstanceList(pipe);
    // Add the input databag as source data and run it through the pipe built
    // by the constructor.
    instances.addThruPipe(new DataBagSourceIterator(documents));
    // Create a model with numTopics, alpha_t = 0.01, beta_w = 0.01
    // Note that the first parameter is passed as the sum over topics, while
    // the second is the parameter for a single dimension of the Dirichlet prior.
    ParallelTopicModel model = new ParallelTopicModel(numTopics, 1.0, 0.01);
    model.addInstances(instances);
    // Important, since this is being run in the reduce, just use one thread
    model.setNumThreads(1);
    model.setTopicDisplay(0, 0);
    model.setNumIterations(2000);
    model.estimate();
    // Convert the results into comprehensible topics.
    // NOTE(review): the originals `instances.getDataAlphabet()` and
    // `model.getData()` were computed but never used, so they were removed;
    // terms are resolved through model.alphabet below.
    for (int topicNum = 0; topicNum < model.getNumTopics(); topicNum++) {
        TreeSet<IDSorter> sortedWords = model.getSortedWords().get(topicNum);
        Iterator<IDSorter> iterator = sortedWords.iterator();
        DataBag topic = BagFactory.getInstance().newDefaultBag();
        // Take at most numKeywords of the highest-weighted words for this topic
        // and add them to the databag used to represent it.
        while (iterator.hasNext() && topic.size() < numKeywords) {
            IDSorter info = iterator.next();
            Tuple weightedWord = TupleFactory.getInstance().newTuple(2);
        // get the actual term text
            String wordToken = model.alphabet.lookupObject(info.getID()).toString();
            weightedWord.set(0, wordToken);
            // the raw weight of the term
            weightedWord.set(1, info.getWeight());
            topic.add(weightedWord);
        }
        Tuple topicTuple = TupleFactory.getInstance().newTuple(2);
        topicTuple.set(0, topicNum);
        topicTuple.set(1, topic);
        result.add(topicTuple);
    }
    return result;
}
Also used : Alphabet(cc.mallet.types.Alphabet) DataBag(org.apache.pig.data.DataBag) ParallelTopicModel(cc.mallet.topics.ParallelTopicModel) IDSorter(cc.mallet.types.IDSorter) TopicAssignment(cc.mallet.topics.TopicAssignment) InstanceList(cc.mallet.types.InstanceList) Tuple(org.apache.pig.data.Tuple)

Example 17 with DataBag

use of org.apache.pig.data.DataBag in project hive by apache.

From the class HCatTypeCheck, method check:

/**
 * Validates that {@code o} matches the Java class registered for the given
 * Pig {@code DataType} byte, recursing into maps, bags, and tuples.
 * A null value passes trivially and renders as the empty string.
 *
 * @return the string rendering of {@code o} ("" for null)
 * @throws IOException if a nested tuple field cannot be read
 */
private String check(Byte type, Object o) throws IOException {
    if (o == null) {
        return "";
    }
    // Top-level class mismatch: report and fall through to the rendering.
    if (!check(typeMap.get(type), o)) {
        die(typeMap.get(type).getName(), o);
        return o.toString();
    }
    if (type.equals(DataType.MAP)) {
        check((Map<String, String>) o);
    } else if (type.equals(DataType.BAG)) {
        // Each bag tuple is expected to carry a map in field 0.
        for (Tuple element : (DataBag) o) {
            check((Map<String, String>) element.get(0));
        }
    } else if (type.equals(DataType.TUPLE)) {
        Tuple t = (Tuple) o;
        boolean wellFormed = check(Integer.class, t.get(0))
                && check(String.class, t.get(1))
                && check(Double.class, t.get(2));
        if (!wellFormed) {
            die("t:tuple(num:int,str:string,dbl:double)", t);
        }
    }
    return o.toString();
}
Also used : DataBag(org.apache.pig.data.DataBag) Map(java.util.Map) HashMap(java.util.HashMap) Tuple(org.apache.pig.data.Tuple)

Example 18 with DataBag

use of org.apache.pig.data.DataBag in project pygmalion by jeromatron.

From the class FromCassandraBag, method exec:

/**
 * Projects columns out of a Cassandra bag by name.
 *
 * @param input tuple of (column_selector, cassandra_bag); a selector ending
 *              in the greedy operator matches every column name with that
 *              prefix and yields a bag, otherwise the first exact-name
 *              match yields that column's value
 * @return one output field per selector, or null if either argument is null
 * @throws IOException if fewer than two arguments are supplied
 */
public Tuple exec(Tuple input) throws IOException {
    // Size must be two (column_selector,cassandra_bag)
    if (input == null || input.size() < 2)
        throw new IOException("Invalid input. Please pass in both a list of column names and the columns themselves.");
    if (input.isNull(0) || input.isNull(1))
        return null;
    String selectorSpec = input.get(0).toString();
    DataBag columns = (DataBag) input.get(1);
    String[] selectors = DELIM_PATTERN.split(selectorSpec);
    Tuple output = TupleFactory.getInstance().newTuple(selectors.length);
    int slot = 0;
    for (String selector : selectors) {
        if (selector.endsWith(GREEDY_OPERATOR)) {
            // Prefix selection: gather every column whose name starts with
            // the selector (minus the trailing operator) into a bag.
            String prefix = selector.substring(0, selector.length() - 1);
            DataBag matches = BagFactory.getInstance().newDefaultBag();
            for (Tuple column : columns) {
                if (column.get(0).toString().startsWith(prefix)) {
                    matches.add(column);
                }
            }
            // Sometimes this bag will have no columns in it; this _is_ the
            // desired behavior.
            output.set(slot, matches);
        } else {
            // Exact selection: the first column whose name equals the
            // selector contributes its value; otherwise the slot stays null.
            for (Tuple column : columns) {
                if (column.get(0).toString().equals(selector)) {
                    output.set(slot, column.get(1));
                    break;
                }
            }
        }
        slot++;
    }
    return output;
}
Also used : DataBag(org.apache.pig.data.DataBag) IOException(java.io.IOException) Tuple(org.apache.pig.data.Tuple)

Example 19 with DataBag

use of org.apache.pig.data.DataBag in project pygmalion by jeromatron.

From the class RangeBasedStringConcatTest, method testAllConcat:

@Test
public void testAllConcat() throws Exception {
    RangeBasedStringConcat rbsc = new RangeBasedStringConcat("ALL", " ");
    // A flat tuple of fields concatenates them all with the separator.
    Tuple input = new DefaultTuple();
    for (int idx = 0; idx < fields.length; idx++) {
        input.append(fields[idx]);
    }
    assertEquals("a b c d e f g h i", rbsc.exec(input));
    // Elements of a nested tuple are flattened into the concatenation.
    Tuple nested = new DefaultTuple();
    nested.append("j");
    nested.append("k");
    input.append(nested);
    assertEquals("a b c d e f g h i j k", rbsc.exec(input));
    // A bag nested inside the tuple is flattened as well.
    Tuple bagged = new DefaultTuple();
    bagged.append("l");
    bagged.append("m");
    DataBag bag = new DefaultDataBag();
    bag.add(bagged);
    nested.append(bag);
    assertEquals("a b c d e f g h i j k l m", rbsc.exec(input));
}
Also used : DataBag(org.apache.pig.data.DataBag) DefaultDataBag(org.apache.pig.data.DefaultDataBag) DefaultTuple(org.apache.pig.data.DefaultTuple) RangeBasedStringConcat(org.pygmalion.udf.RangeBasedStringConcat) DefaultDataBag(org.apache.pig.data.DefaultDataBag) DefaultTuple(org.apache.pig.data.DefaultTuple) Tuple(org.apache.pig.data.Tuple) Test(org.junit.Test)

Example 20 with DataBag

use of org.apache.pig.data.DataBag in project pygmalion by jeromatron.

From the class ToCassandraBagTest, method test:

@Test
public void test() throws Exception {
    ToCassandraBag tcb = new ToCassandraBag();
    UDFContext context = UDFContext.getUDFContext();
    Properties properties = context.getUDFProperties(ToCassandraBag.class);
    // Happy path: the schema names every field and the input supplies one
    // value per field.
    Tuple input = new DefaultTuple();
    StringBuilder schema = new StringBuilder();
    for (int i = 0; i < fields.length; i++) {
        schema.append(fields[i]);
        input.append("foo" + i);
        if (i < fields.length - 1) {
            schema.append(',');
        }
    }
    properties.setProperty(ToCassandraBag.UDFCONTEXT_SCHEMA_KEY + ".default_context", schema.toString());
    Tuple tuple = tcb.exec(input);
    assertNotNull("Tuple is null", tuple);
    assertEquals(2, tuple.size());
    // first is the key, rest is a set of columns
    assertTrue(tuple.get(0) instanceof String);
    assertTrue(tuple.get(1) instanceof DataBag);
    // Bad input: a null key must be rejected with an IOException.
    input = new DefaultTuple();
    input.append(null);
    input.append("foo");
    try {
        tcb.exec(input);
        assertTrue(false);
    } catch (IOException e) {
        // expected
    }
    // Bad input: more values than schema fields must also be rejected.
    input = new DefaultTuple();
    schema.setLength(0);
    for (int i = 0; i < fields.length - 1; i++) {
        schema.append(fields[i]);
        input.append("foo" + i);
        if (i < fields.length - 1) {
            schema.append(',');
        }
    }
    properties.setProperty(ToCassandraBag.UDFCONTEXT_SCHEMA_KEY + ".default_context", schema.toString());
    input.append("foo extra");
    try {
        tcb.exec(input);
        assertTrue(false);
    } catch (IOException e) {
        // expected
    }
}
Also used : DataBag(org.apache.pig.data.DataBag) ToCassandraBag(org.pygmalion.udf.ToCassandraBag) DefaultTuple(org.apache.pig.data.DefaultTuple) UDFContext(org.apache.pig.impl.util.UDFContext) IOException(java.io.IOException) Properties(java.util.Properties) DefaultTuple(org.apache.pig.data.DefaultTuple) Tuple(org.apache.pig.data.Tuple) Test(org.junit.Test)

Aggregations

DataBag (org.apache.pig.data.DataBag)32 Tuple (org.apache.pig.data.Tuple)27 Test (org.junit.Test)10 Map (java.util.Map)7 IOException (java.io.IOException)6 HashMap (java.util.HashMap)6 BasicBSONObject (org.bson.BasicBSONObject)6 ArrayList (java.util.ArrayList)5 BasicDBList (com.mongodb.BasicDBList)3 BasicDBObject (com.mongodb.BasicDBObject)3 List (java.util.List)3 Properties (java.util.Properties)3 DefaultDataBag (org.apache.pig.data.DefaultDataBag)3 UDFContext (org.apache.pig.impl.util.UDFContext)3 DateTime (org.joda.time.DateTime)3 HCatFieldSchema (org.apache.hive.hcatalog.data.schema.HCatFieldSchema)2 ResourceSchema (org.apache.pig.ResourceSchema)2 ResourceFieldSchema (org.apache.pig.ResourceSchema.ResourceFieldSchema)2 DefaultTuple (org.apache.pig.data.DefaultTuple)2 ParallelTopicModel (cc.mallet.topics.ParallelTopicModel)1