Use of org.apache.pig.data.DataBag in project varaha by thedatachef.
The class LDATopics, method exec.
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 2 || input.isNull(0) || input.isNull(1))
        return null;
    // Number of topics to discover
    Integer numTopics = (Integer) input.get(0);
    // Documents, {(doc_id, text)}
    DataBag documents = (DataBag) input.get(1);
    DataBag result = BagFactory.getInstance().newDefaultBag();
    InstanceList instances = new InstanceList(pipe);
    // Add the input databag as source data and run it through the pipe built
    // by the constructor.
    instances.addThruPipe(new DataBagSourceIterator(documents));
    // Create a model with numTopics topics. Note that the first parameter is the
    // alpha prior summed over topics (so alpha_t = 1.0/numTopics), while the
    // second is beta_w, the parameter for a single dimension of the Dirichlet prior.
    ParallelTopicModel model = new ParallelTopicModel(numTopics, 1.0, 0.01);
    model.addInstances(instances);
    // Important: since this is being run in the reduce, just use one thread
    model.setNumThreads(1);
    model.setTopicDisplay(0, 0);
    model.setNumIterations(2000);
    model.estimate();
    // Get the results
    Alphabet dataAlphabet = instances.getDataAlphabet();
    // Per-token topic assignments (not used below, but available)
    ArrayList<TopicAssignment> assignments = model.getData();
    // Convert the results into comprehensible topics
    for (int topicNum = 0; topicNum < model.getNumTopics(); topicNum++) {
        TreeSet<IDSorter> sortedWords = model.getSortedWords().get(topicNum);
        Iterator<IDSorter> iterator = sortedWords.iterator();
        DataBag topic = BagFactory.getInstance().newDefaultBag();
        // Add the numKeywords most heavily weighted words
        // to the databag used to represent this topic
        while (iterator.hasNext() && topic.size() < numKeywords) {
            IDSorter info = iterator.next();
            Tuple weightedWord = TupleFactory.getInstance().newTuple(2);
            // get the actual term text
            String wordToken = dataAlphabet.lookupObject(info.getID()).toString();
            weightedWord.set(0, wordToken);
            // the raw weight of the term
            weightedWord.set(1, info.getWeight());
            topic.add(weightedWord);
        }
        Tuple topicTuple = TupleFactory.getInstance().newTuple(2);
        topicTuple.set(0, topicNum);
        topicTuple.set(1, topic);
        result.add(topicTuple);
    }
    return result;
}
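For context, here is a minimal driver sketch (not part of the project) showing the shape of the input this UDF expects and the output it returns. The no-argument LDATopics constructor is an assumption for illustration; check the class for its actual constructor and pipe configuration.

import java.util.Arrays;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;

public class LDATopicsDriver {
    public static void main(String[] args) throws Exception {
        TupleFactory tf = TupleFactory.getInstance();
        // Each document is a (doc_id, text) tuple, matching the schema noted above.
        DataBag documents = BagFactory.getInstance().newDefaultBag();
        documents.add(tf.newTuple(Arrays.asList("doc1", "the quick brown fox jumps")));
        documents.add(tf.newTuple(Arrays.asList("doc2", "lazy dogs sleep all day")));
        Tuple input = tf.newTuple(2);
        input.set(0, 2); // number of topics to discover
        input.set(1, documents);
        LDATopics udf = new LDATopics(); // assumed constructor; verify against the class
        DataBag topics = udf.exec(input);
        // Each output tuple is (topic_num, {(word, weight), ...})
        for (Tuple topic : topics) {
            System.out.println(topic);
        }
    }
}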
Use of org.apache.pig.data.DataBag in project hive by apache.
The class HCatTypeCheck, method check.
private String check(Byte type, Object o) throws IOException {
    if (o == null) {
        return "";
    }
    if (check(typeMap.get(type), o)) {
        if (type.equals(DataType.MAP)) {
            Map<String, String> m = (Map<String, String>) o;
            check(m);
        } else if (type.equals(DataType.BAG)) {
            DataBag bg = (DataBag) o;
            for (Tuple tuple : bg) {
                Map<String, String> m = (Map<String, String>) tuple.get(0);
                check(m);
            }
        } else if (type.equals(DataType.TUPLE)) {
            Tuple t = (Tuple) o;
            if (!check(Integer.class, t.get(0)) || !check(String.class, t.get(1)) || !check(Double.class, t.get(2))) {
                die("t:tuple(num:int,str:string,dbl:double)", t);
            }
        }
    } else {
        die(typeMap.get(type).getName(), o);
    }
    return o.toString();
}
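The overloads check(Class, Object) and check(Map) called above live elsewhere in HCatTypeCheck; a hedged sketch of a plausible shape, for reading convenience only (the project's actual helpers may differ):

// Sketch only: verify a value is an instance of the expected class.
private boolean check(Class<?> expected, Object o) {
    return o == null || expected.isAssignableFrom(o.getClass());
}

// Sketch only: verify every key and value in the map is a String.
private void check(Map<String, String> m) throws IOException {
    for (Map.Entry<String, String> e : m.entrySet()) {
        if (!check(String.class, e.getKey()) || !check(String.class, e.getValue())) {
            die("map<string,string>", m);
        }
    }
}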
Use of org.apache.pig.data.DataBag in project pygmalion by jeromatron.
The class FromCassandraBag, method exec.
public Tuple exec(Tuple input) throws IOException {
    // Size must be two (column_selector, cassandra_bag)
    if (input == null || input.size() < 2)
        throw new IOException("Invalid input. Please pass in both a list of column names and the columns themselves.");
    if (input.isNull(0) || input.isNull(1))
        return null;
    String columnSelector = input.get(0).toString();
    DataBag cassandraBag = (DataBag) input.get(1);
    String[] selections = DELIM_PATTERN.split(columnSelector);
    Tuple output = TupleFactory.getInstance().newTuple(selections.length);
    for (int i = 0; i < selections.length; i++) {
        String selection = selections[i];
        if (selection.endsWith(GREEDY_OPERATOR)) {
            String namePrefix = selection.substring(0, selection.length() - 1);
            DataBag columnsBag = BagFactory.getInstance().newDefaultBag();
            // Gather all columns whose names start with the prefix
            // and add them to the 'columnsBag'
            for (Tuple cassandraColumn : cassandraBag) {
                String name = cassandraColumn.get(0).toString();
                if (name.startsWith(namePrefix)) {
                    columnsBag.add(cassandraColumn);
                }
            }
            // Sometimes this bag will have no columns in it; this _is_ the desired behavior.
            output.set(i, columnsBag);
        } else {
            // Otherwise, emit the value of the single column whose name
            // exactly matches the selection string.
            for (Tuple cassandraColumn : cassandraBag) {
                String name = cassandraColumn.get(0).toString();
                if (name.equals(selection)) {
                    output.set(i, cassandraColumn.get(1));
                    break;
                }
            }
        }
    }
    return output;
}
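A short usage sketch (assumptions: a no-argument constructor, and "*" as the greedy operator, inferred from the single-character strip above). The bag holds (name, value) column tuples; an exact selection emits the matching column's value, and a greedy selection collects every column sharing the prefix into a sub-bag.

TupleFactory tf = TupleFactory.getInstance();
DataBag columns = BagFactory.getInstance().newDefaultBag();
columns.add(tf.newTuple(Arrays.asList("name_first", "jerome")));
columns.add(tf.newTuple(Arrays.asList("name_last", "banks")));
columns.add(tf.newTuple(Arrays.asList("age", "33")));
Tuple input = tf.newTuple(2);
input.set(0, "age,name_*"); // one exact match plus one greedy prefix match
input.set(1, columns);
FromCassandraBag udf = new FromCassandraBag(); // assumed constructor
Tuple out = udf.exec(input);
// out.get(0) is "33"; out.get(1) is a bag holding both name_* column tuples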
Use of org.apache.pig.data.DataBag in project pygmalion by jeromatron.
The class RangeBasedStringConcatTest, method testAllConcat.
@Test
public void testAllConcat() throws Exception {
    RangeBasedStringConcat rbsc = new RangeBasedStringConcat("ALL", " ");
    Tuple input = new DefaultTuple();
    for (int i = 0; i < fields.length; i++) {
        input.append(fields[i]);
    }
    String result = rbsc.exec(input);
    assertEquals("a b c d e f g h i", result);
    Tuple innerTuple = new DefaultTuple();
    innerTuple.append("j");
    innerTuple.append("k");
    input.append(innerTuple);
    result = rbsc.exec(input);
    assertEquals("a b c d e f g h i j k", result);
    DataBag db = new DefaultDataBag();
    Tuple dbTuple = new DefaultTuple();
    dbTuple.append("l");
    dbTuple.append("m");
    db.add(dbTuple);
    innerTuple.append(db);
    result = rbsc.exec(input);
    assertEquals("a b c d e f g h i j k l m", result);
}
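The test exercises recursive flattening: strings inside nested tuples and bags join the concatenation in order. A hedged sketch of the kind of recursion that yields this behavior (the project's implementation may well differ):

// Sketch only: recursively append atoms, walking into tuples and bags.
private void concat(StringBuilder sb, Object field, String delim) {
    if (field instanceof Tuple) {
        for (Object o : ((Tuple) field).getAll()) {
            concat(sb, o, delim);
        }
    } else if (field instanceof DataBag) {
        for (Tuple t : (DataBag) field) {
            concat(sb, t, delim);
        }
    } else {
        if (sb.length() > 0) {
            sb.append(delim);
        }
        sb.append(field);
    }
}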
Use of org.apache.pig.data.DataBag in project pygmalion by jeromatron.
The class ToCassandraBagTest, method test.
@Test
public void test() throws Exception {
    ToCassandraBag tcb = new ToCassandraBag();
    UDFContext context = UDFContext.getUDFContext();
    Properties properties = context.getUDFProperties(ToCassandraBag.class);
    Tuple input = new DefaultTuple();
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < fields.length; i++) {
        builder.append(fields[i]);
        input.append("foo" + i);
        if (i < fields.length - 1) {
            builder.append(',');
        }
    }
    properties.setProperty(ToCassandraBag.UDFCONTEXT_SCHEMA_KEY + ".default_context", builder.toString());
    Tuple tuple = tcb.exec(input);
    assertNotNull("Tuple is null", tuple);
    assertEquals(2, tuple.size());
    // first is the key, rest is a set of columns
    Object one = tuple.get(0);
    assertTrue(one instanceof String);
    Object two = tuple.get(1);
    assertTrue(two instanceof DataBag);
    // Bad input: a null key should be rejected
    input = new DefaultTuple();
    input.append(null);
    input.append("foo");
    try {
        tcb.exec(input);
        fail("Expected an IOException for a null key");
    } catch (IOException e) {
        // expected
    }
    // More input fields than the schema declares should also be rejected
    input = new DefaultTuple();
    builder.setLength(0);
    for (int i = 0; i < fields.length - 1; i++) {
        builder.append(fields[i]);
        input.append("foo" + i);
        if (i < fields.length - 1) {
            builder.append(',');
        }
    }
    properties.setProperty(ToCassandraBag.UDFCONTEXT_SCHEMA_KEY + ".default_context", builder.toString());
    input.append("foo extra");
    try {
        tcb.exec(input);
        fail("Expected an IOException for more fields than schema columns");
    } catch (IOException e) {
        // expected
    }
}
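For completeness, a hedged sketch of inspecting the (key, columns) output from the successful exec call above; the exact layout of each column tuple is an assumption here, inferred from the (name, value) convention used by FromCassandraBag:

String key = (String) tuple.get(0); // the first input field becomes the row key
DataBag columnsOut = (DataBag) tuple.get(1);
for (Tuple column : columnsOut) {
    // Presumed layout: (schema_field_name, value), e.g. (fields[1], "foo1")
    System.out.println(column.get(0) + " -> " + column.get(1));
}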