Use of java.util.PriorityQueue in project Anserini by castorini.
The class ExtractRm3Stopwords, method main.
/**
 * Collects the {@code topK} terms with the highest document frequency in one field of a Lucene
 * index and writes them to the output file, one term per line.
 *
 * <p>Terms are written in ascending order of document frequency; ties are broken by the natural
 * ordering of the term itself. Empty terms are skipped.
 *
 * @param args command-line arguments, parsed into {@code Args} (index, field, topK, output)
 * @throws IllegalArgumentException if {@code topK} is smaller than 1 or the field has no terms
 * @throws Exception if the index cannot be read or the output file cannot be written
 */
public static void main(String[] args) throws Exception {
  Args myArgs = new Args();
  CmdLineParser parser = new CmdLineParser(myArgs, ParserProperties.defaults().withUsageWidth(90));
  try {
    parser.parseArgument(args);
  } catch (CmdLineException e) {
    System.err.println(e.getMessage());
    parser.printUsage(System.err);
    System.err.println("Example: ExtractRm3Stopwords" + parser.printExample(OptionHandlerFilter.REQUIRED));
    return;
  }

  // Fail early with a readable message; PriorityQueue rejects capacities below 1 anyway.
  if (myArgs.topK < 1) {
    throw new IllegalArgumentException("topK must be at least 1, got " + myArgs.topK);
  }

  // Orders by document frequency, then by term. Only '<' and '>' are used on the values so the
  // comparison stays numeric even if Pair.value is a boxed type ('==' would compare references).
  Comparator<Pair> comp = new Comparator<Pair>() {
    @Override
    public int compare(Pair p1, Pair p2) {
      if (p1.value < p2.value) {
        return -1;
      }
      if (p1.value > p2.value) {
        return 1;
      }
      return p1.key.compareTo(p2.key);
    }
  };

  // Min-heap bounded to topK entries: the head is always the weakest candidate kept so far.
  PriorityQueue<Pair> queue = new PriorityQueue<Pair>(myArgs.topK, comp);

  // try-with-resources guarantees the reader and directory are released even on failure.
  try (Directory dir = FSDirectory.open(Paths.get(myArgs.index));
       IndexReader reader = DirectoryReader.open(dir)) {
    LOG.info("Starting to iterate through all terms...");

    Terms terms = MultiFields.getFields(reader).terms(myArgs.field);
    if (terms == null) {
      throw new IllegalArgumentException("Field '" + myArgs.field + "' has no terms in index " + myArgs.index);
    }

    TermsEnum termsEnum = terms.iterator();
    BytesRef text;
    int cnt = 0;
    while ((text = termsEnum.next()) != null) {
      String term = text.utf8ToString();
      if (term.length() == 0) {
        continue;
      }

      // The enum is already positioned on this term, so read its docFreq directly instead of
      // seeking the term dictionary again through reader.docFreq(new Term(...)).
      Pair p = new Pair(term, termsEnum.docFreq());
      if (queue.size() < myArgs.topK) {
        queue.add(p);
      } else if (comp.compare(p, queue.peek()) > 0) {
        // Candidate beats the current weakest entry: evict it and keep the candidate.
        queue.poll();
        queue.add(p);
      }

      cnt++;
      if (cnt % 1000000 == 0) {
        LOG.info("At term " + term);
      }
    }
  }

  // Terms were decoded as UTF-8, so write them back as UTF-8 rather than the platform default.
  try (PrintStream out = new PrintStream(new FileOutputStream(new File(myArgs.output)), false, "UTF-8")) {
    Pair pair;
    while ((pair = queue.poll()) != null) {
      out.println(pair.key);
    }
    // PrintStream swallows I/O errors; surface them instead of reporting success.
    if (out.checkError()) {
      throw new java.io.IOException("Failed to write " + myArgs.output);
    }
  }

  LOG.info("Done!");
}
Aggregations