Search in sources :

Example 1 with InstanceList

Use of cc.mallet.types.InstanceList in the project varaha by thedatachef.

The method exec of the class LDATopics.

/**
 * Fits an LDA topic model to a bag of documents and returns the top keywords
 * of every discovered topic.
 *
 * @param input a tuple whose field 0 is the number of topics (Integer) and
 *              whose field 1 is a bag of documents, {(doc_id, text)}
 * @return a bag of (topicNum, {(word, weight)}) tuples, one per topic, each
 *         holding at most {@code numKeywords} words; {@code null} when the
 *         input is null, has fewer than two fields, or either of the first
 *         two fields is null
 * @throws IOException declared for the tuple/bag accessors used here
 */
public DataBag exec(Tuple input) throws IOException {
    if (input == null || input.size() < 2 || input.isNull(0) || input.isNull(1))
        return null;
    // Number of topics to discover
    Integer numTopics = (Integer) input.get(0);
    // Documents, {(doc_id, text)}
    DataBag documents = (DataBag) input.get(1);
    DataBag result = BagFactory.getInstance().newDefaultBag();
    InstanceList instances = new InstanceList(pipe);
    // Add the input databag as source data and run it through the pipe built
    // by the constructor.
    instances.addThruPipe(new DataBagSourceIterator(documents));
    // Create a model with numTopics, alphaSum = 1.0, beta_w = 0.01.
    // Note that the first prior is passed as the sum over topics (so the
    // per-topic alpha_t is 1.0 / numTopics), while the second is the
    // parameter for a single dimension of the Dirichlet prior.
    ParallelTopicModel model = new ParallelTopicModel(numTopics, 1.0, 0.01);
    model.addInstances(instances);
    // Important, since this is being run in the reduce, just use one thread
    model.setNumThreads(1);
    // NOTE(review): presumably turns off periodic topic printing - confirm
    model.setTopicDisplay(0, 0);
    model.setNumIterations(2000);
    model.estimate();
    // Fetch the per-topic sorted words once: getSortedWords() covers every
    // topic, so calling it inside the loop would redo that work per topic.
    ArrayList<TreeSet<IDSorter>> sortedWordsByTopic = model.getSortedWords();
    // Convert the results into comprehensible topics
    for (int topicNum = 0; topicNum < model.getNumTopics(); topicNum++) {
        TreeSet<IDSorter> sortedWords = sortedWordsByTopic.get(topicNum);
        Iterator<IDSorter> iterator = sortedWords.iterator();
        DataBag topic = BagFactory.getInstance().newDefaultBag();
        // Add words, in the sorted set's iteration order, up to numKeywords,
        // to the databag used to represent this topic
        while (iterator.hasNext() && topic.size() < numKeywords) {
            IDSorter info = iterator.next();
            Tuple weightedWord = TupleFactory.getInstance().newTuple(2);
            // get the actual term text
            String wordToken = model.alphabet.lookupObject(info.getID()).toString();
            weightedWord.set(0, wordToken);
            // the raw weight of the term
            weightedWord.set(1, info.getWeight());
            topic.add(weightedWord);
        }
        // Emit (topicNum, {(word, weight)}) for this topic
        Tuple topicTuple = TupleFactory.getInstance().newTuple(2);
        topicTuple.set(0, topicNum);
        topicTuple.set(1, topic);
        result.add(topicTuple);
    }
    return result;
}
Also used : Alphabet(cc.mallet.types.Alphabet) DataBag(org.apache.pig.data.DataBag) ParallelTopicModel(cc.mallet.topics.ParallelTopicModel) IDSorter(cc.mallet.types.IDSorter) TopicAssignment(cc.mallet.topics.TopicAssignment) InstanceList(cc.mallet.types.InstanceList) Tuple(org.apache.pig.data.Tuple)

Aggregations

ParallelTopicModel (cc.mallet.topics.ParallelTopicModel): 1, TopicAssignment (cc.mallet.topics.TopicAssignment): 1, Alphabet (cc.mallet.types.Alphabet): 1, IDSorter (cc.mallet.types.IDSorter): 1, InstanceList (cc.mallet.types.InstanceList): 1, DataBag (org.apache.pig.data.DataBag): 1, Tuple (org.apache.pig.data.Tuple): 1