Use of org.apache.lucene.analysis.core.WhitespaceAnalyzer in project Anserini by castorini.
The class IndexW2V, method indexEmbeddings:
public void indexEmbeddings() throws IOException, InterruptedException {
  LOG.info("Starting indexer...");
  long startTime = System.currentTimeMillis();

  // Index with a WhitespaceAnalyzer so terms are kept verbatim (split on whitespace only).
  final WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
  final IndexWriterConfig config = new IndexWriterConfig(analyzer);
  final IndexWriter writer = new IndexWriter(directory, config);

  BufferedReader bRdr = new BufferedReader(new FileReader(args.input));
  String line = null;
  bRdr.readLine(); // Skip the first (header) line of the embeddings file.

  Document document = new Document();
  ByteArrayOutputStream byteStream = new ByteArrayOutputStream();
  int cnt = 0;
  while ((line = bRdr.readLine()) != null) {
    // Each line has the form "term<TAB>v1 v2 ... vn".
    String[] termEmbedding = line.trim().split("\t");
    document.add(new StringField(LuceneDocumentGenerator.FIELD_ID, termEmbedding[0], Field.Store.NO));

    // Serialize the vector as a concatenation of 4-byte big-endian floats.
    String[] parts = termEmbedding[1].split(" ");
    for (int i = 0; i < parts.length; ++i) {
      byteStream.write(ByteBuffer.allocate(4).putFloat(Float.parseFloat(parts[i])).array());
    }
    document.add(new StoredField(FIELD_BODY, byteStream.toByteArray()));
    byteStream.flush();
    byteStream.reset();

    writer.addDocument(document);
    document.clear(); // Reuse the Document instance for the next term.

    cnt++;
    if (cnt % 100000 == 0) {
      LOG.info(cnt + " terms indexed");
    }
  }
  bRdr.close();
  LOG.info(String.format("Total of %d terms added", cnt));

  try {
    writer.commit();
    writer.forceMerge(1);
  } finally {
    try {
      writer.close();
    } catch (IOException e) {
      LOG.error(e);
    }
  }

  LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
}
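Because each vector is stored as consecutive 4-byte floats in ByteBuffer's default big-endian order, reading an embedding back out of the index only requires wrapping the stored bytes in a ByteBuffer again. A minimal decoding sketch, assuming a BytesRef obtained via document.getBinaryValue(FIELD_BODY) on a retrieved document (the helper name decodeEmbedding is hypothetical):

// Hypothetical helper: decode a vector stored as consecutive 4-byte big-endian floats.
static float[] decodeEmbedding(BytesRef bytes) {
  ByteBuffer buffer = ByteBuffer.wrap(bytes.bytes, bytes.offset, bytes.length);
  float[] vector = new float[bytes.length / 4];
  for (int i = 0; i < vector.length; ++i) {
    vector[i] = buffer.getFloat(); // Matches ByteBuffer.putFloat used at index time.
  }
  return vector;
}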
Use of org.apache.lucene.analysis.core.WhitespaceAnalyzer in project Anserini by castorini.
The class Rm3Reranker, method rerank:
@Override
public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
  Preconditions.checkState(docs.documents.length == docs.scores.length);

  IndexSearcher searcher = context.getIndexSearcher();
  IndexReader reader = searcher.getIndexReader();

  // Build the original query model, estimate the relevance model from the
  // initial results, and interpolate the two.
  FeatureVector qfv = FeatureVector.fromTerms(
      AnalyzerUtils.tokenize(analyzer, context.getQueryText())).scaleToUnitL1Norm();
  FeatureVector rm = estimateRelevanceModel(docs, reader);
  LOG.info("Relevance model estimated.");
  rm = FeatureVector.interpolate(qfv, rm, originalQueryWeight);

  // Emit the interpolated model as a boosted query string: "term^weight term^weight ...".
  StringBuilder builder = new StringBuilder();
  Iterator<String> terms = rm.iterator();
  while (terms.hasNext()) {
    String term = terms.next();
    double prob = rm.getFeatureWeight(term);
    builder.append(term).append('^').append(prob).append(' ');
  }
  String queryText = builder.toString().trim();

  // Parse with a WhitespaceAnalyzer so the already-analyzed expansion terms are not re-analyzed.
  QueryParser p = new QueryParser(field, new WhitespaceAnalyzer());
  Query nq = null;
  try {
    nq = p.parse(queryText);
  } catch (ParseException e) {
    e.printStackTrace();
    return docs;
  }

  LOG.info("Running new query: " + nq);

  TopDocs rs = null;
  try {
    if (context.getFilter() == null) {
      rs = searcher.search(nq, 1000);
    } else {
      // Wrap the expanded query with the context filter, if one is present.
      BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
      bqBuilder.add(context.getFilter(), BooleanClause.Occur.FILTER);
      bqBuilder.add(nq, BooleanClause.Occur.MUST);
      Query q = bqBuilder.build();
      rs = searcher.search(q, 1000);
    }
  } catch (IOException e) {
    e.printStackTrace();
    return docs;
  }

  return ScoredDocuments.fromTopDocs(rs, searcher);
}
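The boosted query string built above relies on the classic QueryParser's "^" syntax, which attaches a per-term boost, while the WhitespaceAnalyzer guarantees each "term^weight" token passes through intact. A standalone sketch of just this parsing step; the field name "contents" and the terms and weights are made-up examples:

// Standalone illustration of the "term^weight" boost syntax used above.
// The field name "contents" and the weights are hypothetical examples.
static Query parseBoostedQuery() throws ParseException {
  QueryParser parser = new QueryParser("contents", new WhitespaceAnalyzer());
  // Parses into a BooleanQuery of boosted TermQuery clauses (default operator is OR).
  return parser.parse("obama^0.35 president^0.22 election^0.10");
}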