Search in sources :

Example 46 with Similarity

use of org.apache.lucene.search.similarities.Similarity in project lucene-solr by apache.

the class TestNorms method buildIndex.

// TODO: create a testNormsNotPresent ourselves by adding/deleting/merging docs
/**
 * Builds the test index in {@code dir}.
 *
 * <p>Indexes {@code atLeast(100)} documents taken from {@link LineFileDocs}. Each document
 * temporarily receives a stored {@code BYTE_TEST_FIELD} text field whose value is a random
 * number in {@code [1, 255]} repeated that many times, separated by single spaces. The index
 * is written with a {@code MySimProvider} similarity, committed occasionally while indexing
 * and once more at the end.
 *
 * @param dir directory that receives the index
 * @throws IOException if indexing, committing or closing fails
 */
public void buildIndex(Directory dir) throws IOException {
    Random random = random();
    MockAnalyzer analyzer = new MockAnalyzer(random());
    analyzer.setMaxTokenLength(TestUtil.nextInt(random(), 1, IndexWriter.MAX_TERM_LENGTH));
    IndexWriterConfig config = newIndexWriterConfig(analyzer);
    Similarity provider = new MySimProvider();
    config.setSimilarity(provider);
    // try-with-resources guarantees the writer and the line-file reader are released even when
    // indexing fails part-way. Declaration order matches the original construction order so the
    // shared Random is consumed in the same sequence.
    try (RandomIndexWriter writer = new RandomIndexWriter(random, dir, config);
            LineFileDocs docs = new LineFileDocs(random)) {
        int num = atLeast(100);
        for (int i = 0; i < num; i++) {
            Document doc = docs.nextDoc();
            int boost = TestUtil.nextInt(random, 1, 255);
            // The field holds `boost` copies of the token `boost`.
            String value = IntStream.range(0, boost).mapToObj(k -> Integer.toString(boost)).collect(Collectors.joining(" "));
            Field f = new TextField(BYTE_TEST_FIELD, value, Field.Store.YES);
            doc.add(f);
            writer.addDocument(doc);
            // Strip the extra field again right after indexing so the Document handed out by
            // LineFileDocs does not keep it.
            doc.removeField(BYTE_TEST_FIELD);
            if (rarely()) {
                writer.commit();
            }
        }
        writer.commit();
    }
}
Also used : IntStream(java.util.stream.IntStream) ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity) SuppressCodecs(org.apache.lucene.util.LuceneTestCase.SuppressCodecs) Slow(org.apache.lucene.util.LuceneTestCase.Slow) TestUtil(org.apache.lucene.util.TestUtil) IOException(java.io.IOException) Random(java.util.Random) PerFieldSimilarityWrapper(org.apache.lucene.search.similarities.PerFieldSimilarityWrapper) Collectors(java.util.stream.Collectors) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Document(org.apache.lucene.document.Document) CollectionStatistics(org.apache.lucene.search.CollectionStatistics) LineFileDocs(org.apache.lucene.util.LineFileDocs) Field(org.apache.lucene.document.Field) Similarity(org.apache.lucene.search.similarities.Similarity) Directory(org.apache.lucene.store.Directory) Store(org.apache.lucene.document.Field.Store) LuceneTestCase(org.apache.lucene.util.LuceneTestCase) TextField(org.apache.lucene.document.TextField) TermStatistics(org.apache.lucene.search.TermStatistics) ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity) Similarity(org.apache.lucene.search.similarities.Similarity) Document(org.apache.lucene.document.Document) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) Random(java.util.Random) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) TextField(org.apache.lucene.document.TextField) LineFileDocs(org.apache.lucene.util.LineFileDocs)

Example 47 with Similarity

use of org.apache.lucene.search.similarities.Similarity in project lucene-solr by apache.

the class TestDiversifiedTopDocsCollector method setUp.

/**
 * Indexes every row of {@code hitsOfThe60s} (tab-separated columns) as one document, records
 * the parsed rows in {@code parsedRecords}, and opens the reader/searcher used by the tests.
 * The searcher's similarity is wrapped in a {@code DocValueSimilarity} over the
 * {@code weeksAtNumberOne} doc-values field.
 */
@Override
public void setUp() throws Exception {
    super.setUp();
    dir = newDirectory();
    RandomIndexWriter indexWriter = new RandomIndexWriter(random(), dir);

    // A single Document and its Field instances are reused for every row; only the field
    // values change between addDocument calls.
    Document template = new Document();
    Field year = newTextField("year", "", Field.Store.NO);
    SortedDocValuesField artistDocValuesField = new SortedDocValuesField("artist", new BytesRef(""));
    Field weeksDocValues = new FloatDocValuesField("weeksAtNumberOne", 0.0F);
    Field weeksStored = new StoredField("weeks", 0.0F);
    Field id = newStringField("id", "", Field.Store.YES);
    Field song = newTextField("song", "", Field.Store.NO);
    Field artistName = newTextField("artistName", "", Field.Store.NO);
    template.add(id);
    template.add(weeksDocValues);
    template.add(artistName);
    template.add(song);
    template.add(weeksStored);
    template.add(year);
    template.add(artistDocValuesField);

    parsedRecords.clear();
    int rowIndex = 0;
    for (String row : hitsOfThe60s) {
        String[] columns = row.split("\t");
        Record hit = new Record(Integer.toString(rowIndex), columns[0], columns[1], columns[2], Float.parseFloat(columns[3]));
        parsedRecords.put(hit.id, hit);

        // Load this row's values into the shared fields.
        id.setStringValue(hit.id);
        weeksDocValues.setFloatValue(hit.weeks);
        artistName.setStringValue(hit.artist);
        song.setStringValue(hit.song);
        weeksStored.setFloatValue(hit.weeks);
        year.setStringValue(hit.year);
        artistDocValuesField.setBytesValue(new BytesRef(hit.artist));
        indexWriter.addDocument(template);

        boolean commitNow = rowIndex % 10 == 0;
        if (commitNow) {
            // Committing every tenth row makes the test index span multiple segments.
            indexWriter.commit();
        }
        rowIndex++;
    }

    reader = indexWriter.getReader();
    indexWriter.close();
    searcher = newSearcher(reader);
    artistDocValues = MultiDocValues.getSortedValues(reader, "artist");
    // Every search ranks by song popularity: wrap the searcher's own similarity.
    searcher.setSimilarity(new DocValueSimilarity(searcher.getSimilarity(true), "weeksAtNumberOne"));
}
Also used : Similarity(org.apache.lucene.search.similarities.Similarity) FloatDocValuesField(org.apache.lucene.document.FloatDocValuesField) Document(org.apache.lucene.document.Document) FloatDocValuesField(org.apache.lucene.document.FloatDocValuesField) StoredField(org.apache.lucene.document.StoredField) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) Field(org.apache.lucene.document.Field) StoredField(org.apache.lucene.document.StoredField) SortedDocValuesField(org.apache.lucene.document.SortedDocValuesField) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) BytesRef(org.apache.lucene.util.BytesRef)

Example 48 with Similarity

use of org.apache.lucene.search.similarities.Similarity in project Anserini by castorini.

the class RetrieveSentences method search.

/**
 * Runs each topic as a bag-of-words query over {@code FIELD_BODY} and collects the scores of
 * the returned documents.
 *
 * <p>Scoring uses {@code BM25Similarity(0.9f, 0.4f)} and query text is analyzed with an
 * {@link EnglishAnalyzer}. Results are keyed by the document's {@code FIELD_ID} value in
 * encounter order; when the same document id is returned for more than one topic, the score
 * from the later topic replaces the earlier one.
 *
 * @param topics  query strings keyed by topic id; only the values are used
 * @param numHits maximum number of hits requested per topic
 * @return document id to score, in the order hits were encountered
 * @throws IOException    if the index cannot be searched
 * @throws ParseException never thrown by this implementation; kept in the signature so
 *                        existing callers continue to compile
 */
public Map<String, Float> search(SortedMap<Integer, String> topics, int numHits) throws IOException, ParseException {
    IndexSearcher searcher = new IndexSearcher(reader);
    //using BM25 scoring model
    Similarity similarity = new BM25Similarity(0.9f, 0.4f);
    searcher.setSimilarity(similarity);
    Map<String, Float> scoredDocs = new LinkedHashMap<>();
    // The analyzer is Closeable; release it once all topics have been processed.
    try (EnglishAnalyzer ea = new EnglishAnalyzer()) {
        for (String queryString : topics.values()) {
            Query query = AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);
            TopDocs rs = searcher.search(query, numHits);
            ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
            for (int i = 0; i < docs.documents.length; i++) {
                scoredDocs.put(docs.documents[i].getField(FIELD_ID).stringValue(), docs.scores[i]);
            }
        }
    }
    return scoredDocs;
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) Similarity(org.apache.lucene.search.similarities.Similarity) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) Query(org.apache.lucene.search.Query) ScoredDocuments(io.anserini.rerank.ScoredDocuments) EnglishAnalyzer(org.apache.lucene.analysis.en.EnglishAnalyzer) ScoreDoc(org.apache.lucene.search.ScoreDoc) TopDocs(org.apache.lucene.search.TopDocs) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity)

Aggregations

Similarity (org.apache.lucene.search.similarities.Similarity)48 BM25Similarity (org.apache.lucene.search.similarities.BM25Similarity)15 ClassicSimilarity (org.apache.lucene.search.similarities.ClassicSimilarity)15 Directory (org.apache.lucene.store.Directory)9 PerFieldSimilarityWrapper (org.apache.lucene.search.similarities.PerFieldSimilarityWrapper)8 SweetSpotSimilarity (org.apache.lucene.misc.SweetSpotSimilarity)7 IOException (java.io.IOException)6 Document (org.apache.lucene.document.Document)5 Term (org.apache.lucene.index.Term)5 IndexSearcher (org.apache.lucene.search.IndexSearcher)5 Collectors (java.util.stream.Collectors)4 IntStream (java.util.stream.IntStream)4 Field (org.apache.lucene.document.Field)4 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)4 NormValueSource (org.apache.lucene.queries.function.valuesource.NormValueSource)4 BytesRef (org.apache.lucene.util.BytesRef)4 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)3 Store (org.apache.lucene.document.Field.Store)3 IndexWriter (org.apache.lucene.index.IndexWriter)3 ConstValueSource (org.apache.lucene.queries.function.valuesource.ConstValueSource)3