Search in sources :

Example 36 with WhitespaceAnalyzer

use of org.apache.lucene.analysis.core.WhitespaceAnalyzer in project Anserini by castorini.

The class IndexW2V, method indexEmbeddings.

/**
 * Indexes term embeddings from the input file ({@code args.input}) into the Lucene index
 * opened on {@code directory}.
 *
 * <p>The first line of the input is read and discarded (presumably a header line — confirm
 * against the input format). Every following line is expected to be
 * {@code term<TAB>f1 f2 ... fn}: the term is indexed as a non-stored {@code StringField}
 * and the floats are packed, four bytes each, into a single stored binary field.
 * Lines that do not contain both a term and a vector are skipped with a warning instead of
 * aborting the whole run.
 *
 * <p>On success the index is committed and force-merged to a single segment. If anything
 * fails, uncommitted changes are rolled back. The input reader is always closed, and the
 * writer (with its index lock) is always released.
 *
 * @throws IOException if the input cannot be read or the index cannot be written
 * @throws InterruptedException declared for interface compatibility; not thrown by this body
 */
public void indexEmbeddings() throws IOException, InterruptedException {
    LOG.info("Starting indexer...");
    long startTime = System.currentTimeMillis();
    final WhitespaceAnalyzer analyzer = new WhitespaceAnalyzer();
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    final IndexWriter writer = new IndexWriter(directory, config);
    boolean success = false;
    try (BufferedReader bRdr = new BufferedReader(new FileReader(args.input))) {
        // Discard the first line of the file.
        bRdr.readLine();
        // The same Document instance is reused for every term and cleared after each add.
        Document document = new Document();
        int cnt = 0;
        String line;
        while ((line = bRdr.readLine()) != null) {
            String[] termEmbedding = line.trim().split("\t");
            if (termEmbedding.length < 2) {
                // Blank or malformed line (e.g. a trailing empty line): nothing to index.
                LOG.warn("Skipping malformed embedding line: " + line);
                continue;
            }
            document.add(new StringField(LuceneDocumentGenerator.FIELD_ID, termEmbedding[0], Field.Store.NO));
            String[] parts = termEmbedding[1].split(" ");
            // One buffer per vector; putFloat appends 4 bytes per component, giving the same
            // byte layout as writing each float through its own 4-byte buffer.
            ByteBuffer vector = ByteBuffer.allocate(Float.BYTES * parts.length);
            for (String part : parts) {
                vector.putFloat(Float.parseFloat(part));
            }
            document.add(new StoredField(FIELD_BODY, vector.array()));
            writer.addDocument(document);
            document.clear();
            cnt++;
            if (cnt % 100000 == 0) {
                LOG.info(cnt + " terms indexed");
            }
        }
        LOG.info(String.format("Total of %s terms added", cnt));
        writer.commit();
        writer.forceMerge(1);
        success = true;
    } finally {
        try {
            if (success) {
                writer.close();
            } else {
                // Drop whatever was not committed and release the writer.
                writer.rollback();
            }
        } catch (IOException e) {
            // Logged rather than rethrown so a failure here cannot mask the original exception.
            LOG.error(e);
        }
    }
    LOG.info("Total elapsed time: " + (System.currentTimeMillis() - startTime) + "ms");
}
Also used : WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) ByteArrayOutputStream(org.apache.commons.io.output.ByteArrayOutputStream) IOException(java.io.IOException) Document(org.apache.lucene.document.Document) StoredField(org.apache.lucene.document.StoredField) IndexWriter(org.apache.lucene.index.IndexWriter) StringField(org.apache.lucene.document.StringField) BufferedReader(java.io.BufferedReader) FileReader(java.io.FileReader) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 37 with WhitespaceAnalyzer

use of org.apache.lucene.analysis.core.WhitespaceAnalyzer in project Anserini by castorini.

The class Rm3Reranker, method rerank.

/**
 * Re-ranks by estimating a relevance model from {@code docs}, interpolating it with the
 * original query's term vector, and running the resulting weighted-term query (top 1000
 * hits) against the index.
 *
 * <p>The expanded query is built as a {@code term^weight} string and parsed with a
 * {@link QueryParser} over a {@link WhitespaceAnalyzer}. If the context supplies a filter,
 * it is applied as a FILTER clause alongside the expanded query.
 *
 * @param docs    the initial ranking; its {@code documents} and {@code scores} arrays must
 *                have the same length
 * @param context supplies the index searcher, the query text and an optional filter
 * @return the results of the expanded query, or {@code docs} unchanged if the expanded
 *         query cannot be parsed or the search fails
 */
@Override
public ScoredDocuments rerank(ScoredDocuments docs, RerankerContext context) {
    Preconditions.checkState(docs.documents.length == docs.scores.length);
    IndexSearcher searcher = context.getIndexSearcher();
    IndexReader reader = searcher.getIndexReader();
    FeatureVector qfv = FeatureVector.fromTerms(AnalyzerUtils.tokenize(analyzer, context.getQueryText())).scaleToUnitL1Norm();
    FeatureVector rm = estimateRelevanceModel(docs, reader);
    LOG.info("Relevance model estimated.");
    rm = FeatureVector.interpolate(qfv, rm, originalQueryWeight);

    StringBuilder builder = new StringBuilder();
    Iterator<String> terms = rm.iterator();
    while (terms.hasNext()) {
        String term = terms.next();
        double prob = rm.getFeatureWeight(term);
        // Double.toString switches to scientific notation for small weights (e.g. "1.0E-5"),
        // which the classic query parser does not read as a boost: the exponent would leak
        // into the query as an extra term. Render finite weights in plain decimal form;
        // non-finite values keep their original textual form.
        String boost = (Double.isNaN(prob) || Double.isInfinite(prob))
                ? String.valueOf(prob)
                : java.math.BigDecimal.valueOf(prob).toPlainString();
        // Escape query-syntax characters so the term is always parsed as a literal term.
        builder.append(QueryParser.escape(term)).append('^').append(boost).append(' ');
    }
    String queryText = builder.toString().trim();

    QueryParser p = new QueryParser(field, new WhitespaceAnalyzer());
    Query nq;
    try {
        nq = p.parse(queryText);
    } catch (ParseException e) {
        // Fall back to the incoming ranking when the expanded query is unusable.
        LOG.error("Failed to parse expanded query: " + queryText, e);
        return docs;
    }
    LOG.info("Running new query: " + nq);

    Query finalQuery = nq;
    Query filter = context.getFilter();
    if (filter != null) {
        BooleanQuery.Builder bqBuilder = new BooleanQuery.Builder();
        bqBuilder.add(filter, BooleanClause.Occur.FILTER);
        bqBuilder.add(nq, BooleanClause.Occur.MUST);
        finalQuery = bqBuilder.build();
    }

    TopDocs rs;
    try {
        rs = searcher.search(finalQuery, 1000);
    } catch (IOException e) {
        // Fall back to the incoming ranking when the search itself fails.
        LOG.error("Search with expanded query failed", e);
        return docs;
    }
    return ScoredDocuments.fromTopDocs(rs, searcher);
}
Also used : FeatureVector(io.anserini.util.FeatureVector) WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) IOException(java.io.IOException) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) IndexReader(org.apache.lucene.index.IndexReader) ParseException(org.apache.lucene.queryparser.classic.ParseException)

Aggregations

WhitespaceAnalyzer (org.apache.lucene.analysis.core.WhitespaceAnalyzer)37 IndexWriter (org.apache.lucene.index.IndexWriter)17 IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)17 Document (org.apache.lucene.document.Document)16 Analyzer (org.apache.lucene.analysis.Analyzer)9 Test (org.junit.Test)9 NamedList (org.apache.solr.common.util.NamedList)8 ArrayList (java.util.ArrayList)7 Token (org.apache.lucene.analysis.Token)7 TextField (org.apache.lucene.document.TextField)7 IndexSearcher (org.apache.lucene.search.IndexSearcher)6 IOException (java.io.IOException)5 HashMap (java.util.HashMap)5 Field (org.apache.lucene.document.Field)5 DirectoryTaxonomyWriter (org.apache.lucene.facet.taxonomy.directory.DirectoryTaxonomyWriter)5 DirectoryReader (org.apache.lucene.index.DirectoryReader)5 TokenStream (org.apache.lucene.analysis.TokenStream)4 PerFieldAnalyzerWrapper (org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper)4 LongPoint (org.apache.lucene.document.LongPoint)4 BooleanQuery (org.apache.lucene.search.BooleanQuery)4