Search in sources :

Example 51 with PriorityQueue

Use of java.util.PriorityQueue in project Anserini by castorini.

From the class ExtractRm3Stopwords, method main:

/**
 * Scans every term of one field in a Lucene index and writes the {@code topK}
 * terms with the highest document frequency to an output file, one per line.
 *
 * <p>A bounded min-heap of size {@code topK} is maintained while iterating, so
 * the head of the heap is always the weakest candidate seen so far. Candidates
 * are ordered by document frequency, with ties broken by the term text. The
 * heap is drained with {@code poll()}, so the output lists terms from lowest
 * to highest rank among the retained ones.
 *
 * @param args command-line arguments parsed into {@code Args} by args4j; on a
 *             parse error the usage is printed to {@code System.err} and the
 *             method returns without touching the index
 * @throws IllegalArgumentException if the requested field has no terms in the index
 * @throws Exception on any I/O failure while reading the index or writing the output
 */
public static void main(String[] args) throws Exception {
    Args myArgs = new Args();
    CmdLineParser parser = new CmdLineParser(myArgs, ParserProperties.defaults().withUsageWidth(90));
    try {
        parser.parseArgument(args);
    } catch (CmdLineException e) {
        System.err.println(e.getMessage());
        parser.printUsage(System.err);
        System.err.println("Example: ExtractRm3Stopwords" + parser.printExample(OptionHandlerFilter.REQUIRED));
        return;
    }
    // Orders by document frequency first, then by term text, so the heap head
    // is always the least frequent retained term.
    Comparator<Pair> comp = new Comparator<Pair>() {

        @Override
        public int compare(Pair p1, Pair p2) {
            if (p1.value == p2.value) {
                return p1.key.compareTo(p2.key);
            } else
                return (p1.value < p2.value) ? -1 : 1;
        }
    };
    PriorityQueue<Pair> queue = new PriorityQueue<Pair>(myArgs.topK, comp);
    // try-with-resources guarantees the index is released even if iteration fails.
    try (Directory dir = FSDirectory.open(Paths.get(myArgs.index));
        IndexReader reader = DirectoryReader.open(dir)) {
        LOG.info("Starting to iterate through all terms...");
        // getTerms returns null (instead of throwing) when the index has no
        // fields or the requested field is absent.
        Terms terms = MultiFields.getTerms(reader, myArgs.field);
        if (terms == null) {
            throw new IllegalArgumentException("Field '" + myArgs.field + "' has no terms in index " + myArgs.index);
        }
        TermsEnum termsEnum = terms.iterator();
        BytesRef text = null;
        int cnt = 0;
        while ((text = termsEnum.next()) != null) {
            String term = text.utf8ToString();
            if (term.length() == 0)
                continue;
            // The enum is already positioned on this term, so its docFreq is
            // available without a second dictionary lookup through the reader.
            Pair p = new Pair(term, termsEnum.docFreq());
            if (queue.size() < myArgs.topK) {
                queue.add(p);
            } else {
                // Heap is full: replace the weakest entry only if this one beats it.
                if (comp.compare(p, queue.peek()) > 0) {
                    queue.poll();
                    queue.add(p);
                }
            }
            cnt++;
            if (cnt % 1000000 == 0) {
                LOG.info("At term " + term);
            }
        }
    }
    // Terms were decoded from UTF-8, so write them back as UTF-8 rather than
    // in the platform default charset.
    try (PrintStream out = new PrintStream(new FileOutputStream(new File(myArgs.output)), false, "UTF-8")) {
        Pair pair;
        while ((pair = queue.poll()) != null) {
            out.println(pair.key);
        }
    }
    LOG.info("Done!");
}
Also used : PrintStream(java.io.PrintStream) CmdLineParser(org.kohsuke.args4j.CmdLineParser) Terms(org.apache.lucene.index.Terms) Term(org.apache.lucene.index.Term) PriorityQueue(java.util.PriorityQueue) Comparator(java.util.Comparator) TermsEnum(org.apache.lucene.index.TermsEnum) FileOutputStream(java.io.FileOutputStream) IndexReader(org.apache.lucene.index.IndexReader) File(java.io.File) CmdLineException(org.kohsuke.args4j.CmdLineException) BytesRef(org.apache.lucene.util.BytesRef) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory)

Aggregations

PriorityQueue (java.util.PriorityQueue)51 ArrayList (java.util.ArrayList)16 List (java.util.List)10 Map (java.util.Map)9 HashMap (java.util.HashMap)7 LinkedList (java.util.LinkedList)5 File (java.io.File)4 IOException (java.io.IOException)4 Entry (java.util.Map.Entry)4 Random (java.util.Random)4 BytesRef (org.apache.lucene.util.BytesRef)4 AbstractMapTable (com.ctriposs.sdb.table.AbstractMapTable)3 ScoredObject (edu.stanford.nlp.util.ScoredObject)3 Comparator (java.util.Comparator)3 Set (java.util.Set)3 FCMapTable (com.ctriposs.sdb.table.FCMapTable)2 HashMapTable (com.ctriposs.sdb.table.HashMapTable)2 IMapEntry (com.ctriposs.sdb.table.IMapEntry)2 MMFMapTable (com.ctriposs.sdb.table.MMFMapTable)2 Type (com.facebook.presto.spi.type.Type)2