Search in sources :

Example 1 with MyPriorityQueue

use of structures.MyPriorityQueue in project IR_Base by Linda-sunshine.

the class PageRank method constructSparseGraph.

/**
 * Fills {@code m_transition} with a row-normalized transition matrix over the
 * given documents.
 *
 * For every document i, all other documents are offered to a
 * {@code MyPriorityQueue} created with capacity {@code m_topK}; each entry the
 * queue yields on iteration becomes a transition i -> j whose weight is
 * exp(dot product of i and j) divided by the sum of those exponentials for row i.
 *
 * @param collection documents that form the nodes of the graph
 */
private void constructSparseGraph(ArrayList<_Doc> collection) {
    m_N = collection.size();
    // sparse storage: only the entries yielded by the queue are written per row
    m_transition = new SparseDoubleMatrix2D(m_N, m_N);
    // a single queue is reused for all rows and emptied after each one
    MyPriorityQueue<_RankItem> nearest = new MyPriorityQueue<_RankItem>(m_topK);
    for (int src = 0; src < collection.size(); src++) {
        _Doc source = collection.get(src);
        // rank every other document by its dot product with the source
        for (int dst = 0; dst < collection.size(); dst++) {
            if (src == dst)
                continue;
            nearest.add(new _RankItem(dst, Utils.dotProduct(source, collection.get(dst))));
        }
        // first pass: exponentiate the similarities in place and accumulate the normalizer
        double normalizer = 0;
        for (_RankItem neighbor : nearest) {
            neighbor.m_value = Math.exp(neighbor.m_value);
            normalizer += neighbor.m_value;
        }
        // second pass: write the normalized weight for the edge src -> neighbor
        for (_RankItem neighbor : nearest) {
            m_transition.setQuick(src, neighbor.m_index, neighbor.m_value / normalizer);
        }
        nearest.clear();
    }
}
Also used : structures._RankItem(structures._RankItem) MyPriorityQueue(structures.MyPriorityQueue) structures._Doc(structures._Doc) SparseDoubleMatrix2D(cern.colt.matrix.tdouble.impl.SparseDoubleMatrix2D)

Example 2 with MyPriorityQueue

use of structures.MyPriorityQueue in project IR_Base by Linda-sunshine.

the class L2RMetricLearning method createTrainingCorpus.

/**
 * Builds the learning-to-rank training corpus: every training document is tried
 * as a query, its neighbors are collected, and ranking pairs are generated so
 * that the weight of all pairs of samples can be learned.
 *
 * For each candidate query the neighbors are the entries yielded by a
 * {@code MyPriorityQueue} of capacity {@code m_topK} filled with the cached
 * similarities, plus randomly injected documents until the neighbor list
 * reaches {@code (1 + m_noiseRatio) * m_topK} entries. That target is capped at
 * the number of other training documents, because the injection loop could
 * otherwise never terminate on a small training set.
 *
 * A candidate is skipped when its neighbors are all of the same relevance, or
 * when it has label 1 and {@code negQ < 1.1 * posQ}.
 *
 * @return the total number of ranking pairs created over all accepted queries
 */
protected int createTrainingCorpus() {
    // pre-compute the similarity between labeled documents
    calcLabeledSimilarities();
    MyPriorityQueue<_RankItem> simRanker = new MyPriorityQueue<_RankItem>(m_topK);
    ArrayList<_Doc> neighbors = new ArrayList<_Doc>();
    _Query q;
    _Doc di, dj;
    int posQ = 0, negQ = 0, pairSize = 0;
    int relevant = 0, irrelevant = 0;
    for (int i = 0; i < m_trainSet.size(); i++) {
        // candidate query document
        di = m_trainSet.get(i);
        relevant = 0;
        irrelevant = 0;
        // using content similarity to construct initial ranking
        for (int j = 0; j < m_trainSet.size(); j++) {
            if (i == j)
                continue;
            simRanker.add(new _RankItem(j, m_LabeledCache[getIndex(i, j)]));
        }
        // find the top K similar documents by default similarity measure
        for (_RankItem it : simRanker) {
            dj = m_trainSet.get(it.m_index);
            neighbors.add(dj);
            if (di.getYLabel() == dj.getYLabel())
                relevant++;
            else
                irrelevant++;
        }
        // A query can never have more neighbors than there are other documents;
        // without this cap the loop below spins forever once every other
        // document is already a neighbor.
        // NOTE(review): assumes neighbors.contains() distinguishes distinct
        // training documents - confirm _Doc does not override equals().
        double neighborBudget = Math.min((1.0 + m_noiseRatio) * m_topK, m_trainSet.size() - 1);
        // inject some random neighbors
        int j = 0;
        while (neighbors.size() < neighborBudget) {
            if (i != j) {
                dj = m_trainSet.get(j);
                if (Math.random() < 0.02 && !neighbors.contains(dj)) {
                    neighbors.add(dj);
                    if (di.getYLabel() == dj.getYLabel())
                        relevant++;
                    else
                        irrelevant++;
                }
            }
            // until we use up all the random budget
            j = (j + 1) % m_trainSet.size();
        }
        // a usable query needs both relevant and irrelevant neighbors, and
        // label-1 queries are only accepted while negQ >= 1.1 * posQ
        boolean accepted = relevant != 0 && irrelevant != 0
                && !(di.getYLabel() == 1 && negQ < 1.1 * posQ);
        if (accepted) {
            if (di.getYLabel() == 1)
                posQ++;
            else
                negQ++;
            // accept the query
            q = new _Query();
            m_queries.add(q);
            // construct features for the most similar documents with respect to the query di
            for (_Doc d : neighbors)
                q.addQUPair(new _QUPair(d.getYLabel() == di.getYLabel() ? 1 : 0, genRankingFV(di, d)));
            pairSize += q.createRankingPairs();
        }
        // clear the cache for next query
        simRanker.clear();
        neighbors.clear();
    }
    // normalize the features by z-score
    normalize();
    System.out.format("Generate %d(%d:%d) ranking pairs for L2R model training...\n", pairSize, posQ, negQ);
    return pairSize;
}
Also used : structures._Query(structures._Query) structures._QUPair(structures._QUPair) structures._RankItem(structures._RankItem) MyPriorityQueue(structures.MyPriorityQueue) structures._Doc(structures._Doc) ArrayList(java.util.ArrayList)

Example 3 with MyPriorityQueue

use of structures.MyPriorityQueue in project IR_Base by Linda-sunshine.

the class DCMLDA4AC_test method printWordTopicDistribution.

/**
 * Writes the per-topic word distribution of one document to
 * {@code <document name>.txt} inside the given folder.
 *
 * For each topic one line is written: the topic index and the document's
 * {@code m_topics} value for it, followed by every word/probability entry
 * yielded by a {@code MyPriorityQueue} of capacity {@code k} that was offered
 * the whole vocabulary (presumably the k most probable words - confirm against
 * MyPriorityQueue). Probabilities are exponentiated first when
 * {@code m_logSpace} is set.
 *
 * A missing or unwritable target file is reported via a stack trace and the
 * method returns without writing anything.
 *
 * @param d document to print; must be a {@code _ParentDoc4DCM}
 * @param wordTopicDistributionFolder folder that receives the output file
 * @param k capacity of the per-topic ranking queue
 */
protected void printWordTopicDistribution(_Doc d, File wordTopicDistributionFolder, int k) {
    _ParentDoc4DCM pDoc = (_ParentDoc4DCM) d;
    String wordTopicDistributionFile = pDoc.getName() + ".txt";
    PrintWriter pw = null;
    try {
        pw = new PrintWriter(new File(wordTopicDistributionFolder, wordTopicDistributionFile));
        for (int i = 0; i < number_of_topics; i++) {
            MyPriorityQueue<_RankItem> fVector = new MyPriorityQueue<_RankItem>(k);
            for (int v = 0; v < vocabulary_size; v++) {
                String featureName = m_corpus.getFeature(v);
                double wordProb = pDoc.m_wordTopic_prob[i][v];
                fVector.add(new _RankItem(featureName, wordProb));
            }
            pw.format("Topic %d(%.5f):\t", i, d.m_topics[i]);
            for (_RankItem it : fVector)
                pw.format("%s(%.5f)\t", it.m_name, m_logSpace ? Math.exp(it.m_value) : it.m_value);
            pw.write("\n");
        }
        pw.flush();
    } catch (FileNotFoundException e) {
        e.printStackTrace();
    } finally {
        // release the file handle even if a runtime exception escapes the loop
        if (pw != null)
            pw.close();
    }
}
Also used : structures._RankItem(structures._RankItem) MyPriorityQueue(structures.MyPriorityQueue) structures._ParentDoc4DCM(structures._ParentDoc4DCM) FileNotFoundException(java.io.FileNotFoundException) File(java.io.File) PrintWriter(java.io.PrintWriter)

Example 4 with MyPriorityQueue

use of structures.MyPriorityQueue in project IR_Base by Linda-sunshine.

the class DCMLDA4AC_test method printTopBeta.

/**
 * Writes, for every row of {@code m_beta}, one line to the given file: the
 * topic index and its {@code m_sstat} value, followed by every word/value
 * entry yielded by a {@code MyPriorityQueue} of capacity {@code k} that was
 * offered the whole vocabulary (presumably the k highest-valued words -
 * confirm against MyPriorityQueue). Values are exponentiated first when
 * {@code m_logSpace} is set.
 *
 * The method is best-effort: failures are reported on stderr and never
 * propagated to the caller.
 *
 * @param k capacity of the per-topic ranking queue
 * @param topWordPath path of the output file
 */
public void printTopBeta(int k, String topWordPath) {
    PrintWriter topWordWriter = null;
    try {
        topWordWriter = new PrintWriter(new File(topWordPath));
        for (int i = 0; i < m_beta.length; i++) {
            MyPriorityQueue<_RankItem> fVector = new MyPriorityQueue<_RankItem>(k);
            for (int j = 0; j < vocabulary_size; j++)
                fVector.add(new _RankItem(m_corpus.getFeature(j), m_beta[i][j]));
            topWordWriter.format("Topic %d(%.5f):\t", i, m_sstat[i]);
            for (_RankItem it : fVector)
                topWordWriter.format("%s(%.5f)\t", it.m_name, m_logSpace ? Math.exp(it.m_value) : it.m_value);
            topWordWriter.write("\n");
        }
    } catch (FileNotFoundException ex) {
        System.err.print("File Not Found");
    } catch (Exception ex) {
        // still swallowed to stay best-effort, but no longer mislabeled as a missing file
        System.err.println("Failed to write top words to " + topWordPath + ": " + ex);
    } finally {
        // release the file handle even when writing fails part-way through
        if (topWordWriter != null)
            topWordWriter.close();
    }
}
Also used : structures._RankItem(structures._RankItem) MyPriorityQueue(structures.MyPriorityQueue) File(java.io.File) FileNotFoundException(java.io.FileNotFoundException) PrintWriter(java.io.PrintWriter)

Example 5 with MyPriorityQueue

use of structures.MyPriorityQueue in project IR_Base by Linda-sunshine.

the class DCMLDA4AC_test method printTopWordsDistribution.

/**
 * Recomputes the corpus-level topic proportions in {@code m_sstat} and prints
 * the ranked words of every topic both to the given file and to stdout.
 *
 * {@code m_sstat} is reset, filled with the sum of each training document's
 * {@code m_topics} (exponentiated when {@code m_logSpace} is set) and then
 * L1-normalized. For every row of {@code m_topic_word_prob} one line is
 * emitted: the topic index and its proportion, followed by every
 * word/probability entry yielded by a {@code MyPriorityQueue} of capacity
 * {@code topK} that was offered the whole vocabulary (presumably the topK most
 * probable words - confirm against MyPriorityQueue).
 *
 * Output is best-effort: failures are reported on stderr and never propagated
 * to the caller.
 *
 * @param topK capacity of the per-topic ranking queue
 * @param topWordFile path of the output file
 */
protected void printTopWordsDistribution(int topK, String topWordFile) {
    Arrays.fill(m_sstat, 0);
    System.out.println("print top words");
    for (_Doc d : m_trainSet) {
        for (int i = 0; i < number_of_topics; i++)
            m_sstat[i] += m_logSpace ? Math.exp(d.m_topics[i]) : d.m_topics[i];
    }
    Utils.L1Normalization(m_sstat);
    PrintWriter betaOut = null;
    try {
        System.out.println("top word file");
        betaOut = new PrintWriter(new File(topWordFile));
        for (int i = 0; i < m_topic_word_prob.length; i++) {
            MyPriorityQueue<_RankItem> fVector = new MyPriorityQueue<_RankItem>(topK);
            for (int j = 0; j < vocabulary_size; j++)
                fVector.add(new _RankItem(m_corpus.getFeature(j), m_topic_word_prob[i][j]));
            betaOut.format("Topic %d(%.3f):\t", i, m_sstat[i]);
            for (_RankItem it : fVector) {
                betaOut.format("%s(%.3f)\t", it.m_name, m_logSpace ? Math.exp(it.m_value) : it.m_value);
                System.out.format("%s(%.3f)\t", it.m_name, m_logSpace ? Math.exp(it.m_value) : it.m_value);
            }
            betaOut.println();
            System.out.println();
        }
        betaOut.flush();
    } catch (FileNotFoundException ex) {
        System.err.print("File Not Found");
    } catch (Exception ex) {
        // still swallowed to stay best-effort, but no longer mislabeled as a missing file
        System.err.println("Failed to write top words to " + topWordFile + ": " + ex);
    } finally {
        // release the file handle even when writing fails part-way through
        if (betaOut != null)
            betaOut.close();
    }
}
Also used : structures._RankItem(structures._RankItem) structures._Doc(structures._Doc) MyPriorityQueue(structures.MyPriorityQueue) File(java.io.File) FileNotFoundException(java.io.FileNotFoundException) PrintWriter(java.io.PrintWriter)

Aggregations

MyPriorityQueue (structures.MyPriorityQueue)39 structures._RankItem (structures._RankItem)39 File (java.io.File)27 PrintWriter (java.io.PrintWriter)27 structures._Doc (structures._Doc)25 FileNotFoundException (java.io.FileNotFoundException)20 structures._ParentDoc4DCM (structures._ParentDoc4DCM)3 IOException (java.io.IOException)2 structures._Doc4DCMLDA (structures._Doc4DCMLDA)2 structures._Review (structures._Review)2 SparseDoubleMatrix2D (cern.colt.matrix.tdouble.impl.SparseDoubleMatrix2D)1 ArrayList (java.util.ArrayList)1 structures._HDPThetaStar (structures._HDPThetaStar)1 structures._Node (structures._Node)1 structures._QUPair (structures._QUPair)1 structures._Query (structures._Query)1 structures._SparseFeature (structures._SparseFeature)1 structures._Stn (structures._Stn)1 structures._thetaStar (structures._thetaStar)1