Use of cc.mallet.types.InstanceList in the project varaha by thedatachef.
From the class LDATopics, method exec.
/**
 * Fits an LDA topic model to a bag of documents and returns the top keywords
 * of every discovered topic.
 *
 * <p>The input tuple is read as {@code (numTopics, documents)}: field 0 is cast
 * to {@code Integer} (number of topics to discover) and field 1 is cast to
 * {@code DataBag} (the documents, described by the original author as
 * {@code {(doc_id, text)}}).
 *
 * <p>The returned bag holds one tuple per topic, shaped
 * {@code (topicNum, {(word, weight)})}. Each inner bag holds at most
 * {@code numKeywords} entries, taken in the iteration order of the sorted word
 * set MALLET returns for that topic.
 *
 * <p>{@code pipe} and {@code numKeywords} are defined outside this method
 * (presumably instance fields set up by the constructor; confirm against the
 * enclosing class).
 *
 * @param input tuple whose first two fields are the topic count and the
 *              document bag
 * @return a bag of {@code (topicNum, {(word, weight)})} tuples, or
 *         {@code null} when {@code input} is null, has fewer than two fields,
 *         or either of its first two fields is null
 * @throws IOException if reading from or writing to a Pig tuple fails
 */
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 2 || input.isNull(0) || input.isNull(1)) {
        return null;
    }

    // Number of topics to discover
    Integer numTopics = (Integer) input.get(0);
    // Documents, {(doc_id, text)}
    DataBag documents = (DataBag) input.get(1);

    BagFactory bagFactory = BagFactory.getInstance();
    TupleFactory tupleFactory = TupleFactory.getInstance();
    DataBag result = bagFactory.newDefaultBag();

    // Add the input databag as source data and run it through the pipe built
    // by the constructor.
    InstanceList instances = new InstanceList(pipe);
    instances.addThruPipe(new DataBagSourceIterator(documents));

    // Create a model with numTopics topics, alphaSum = 1.0 and beta_w = 0.01.
    // Note that the first hyperparameter is passed as the SUM over topics, so
    // the per-topic alpha_t is 1.0 / numTopics (0.01 only when numTopics is
    // 100), while the second is the parameter for a single dimension of the
    // Dirichlet prior.
    ParallelTopicModel model = new ParallelTopicModel(numTopics, 1.0, 0.01);
    model.addInstances(instances);

    // Important, since this is being run in the reduce, just use one thread
    model.setNumThreads(1);
    model.setTopicDisplay(0, 0);
    model.setNumIterations(2000);
    model.estimate();

    // Ask the model for the sorted words of all topics once, up front:
    // getSortedWords() builds the sets for every topic on each call, so
    // calling it inside the loop below would repeat that work per topic.
    ArrayList<TreeSet<IDSorter>> sortedWordsByTopic = model.getSortedWords();

    // Convert the results into comprehensible topics
    for (int topicNum = 0; topicNum < model.getNumTopics(); topicNum++) {
        TreeSet<IDSorter> sortedWords = sortedWordsByTopic.get(topicNum);
        Iterator<IDSorter> iterator = sortedWords.iterator();
        DataBag topic = bagFactory.newDefaultBag();

        // Add up to numKeywords (word, weight) tuples, in sorted order,
        // to the databag used to represent this topic
        while (iterator.hasNext() && topic.size() < numKeywords) {
            IDSorter info = iterator.next();
            Tuple weightedWord = tupleFactory.newTuple(2);
            // get the actual term text
            String wordToken = model.alphabet.lookupObject(info.getID()).toString();
            weightedWord.set(0, wordToken);
            // the raw weight of the term
            weightedWord.set(1, info.getWeight());
            topic.add(weightedWord);
        }

        Tuple topicTuple = tupleFactory.newTuple(2);
        topicTuple.set(0, topicNum);
        topicTuple.set(1, topic);
        result.add(topicTuple);
    }
    return result;
}
Aggregations