Use of org.apache.lucene.search.similarities.BM25Similarity in the elasticsearch project by elastic:
the setSimilarity method of the BlendedTermQueryTests class.
/**
 * Randomly assigns either a BM25 or classic TF-IDF similarity to the given
 * searcher, so tests exercise both scoring models across runs.
 *
 * @param searcher the searcher to configure; mutated in place
 * @return the same searcher instance, for call chaining
 */
public IndexSearcher setSimilarity(IndexSearcher searcher) {
    Similarity chosen;
    if (random().nextBoolean()) {
        chosen = new BM25Similarity();
    } else {
        chosen = new ClassicSimilarity();
    }
    searcher.setSimilarity(chosen);
    return searcher;
}
Use of org.apache.lucene.search.similarities.BM25Similarity in the lucene-solr project by apache:
the testFreezeAPI method of the TestMemoryIndex class.
@Test
public void testFreezeAPI() {
    MemoryIndex index = new MemoryIndex();
    index.addField("f1", "some text", analyzer);

    // Both an all-docs query and a term query should match the single document.
    assertThat(index.search(new MatchAllDocsQuery()), not(is(0.0f)));
    assertThat(index.search(new TermQuery(new Term("f1", "some"))), not(is(0.0f)));

    // Fields may still be added after a search has been executed.
    index.addField("f2", "some more text", analyzer);
    assertThat(index.search(new TermQuery(new Term("f2", "some"))), not(is(0.0f)));

    // After freezing, all mutating operations must be rejected.
    index.freeze();

    RuntimeException thrown = expectThrows(RuntimeException.class, () -> {
        index.addField("f3", "and yet more", analyzer);
    });
    assertThat(thrown.getMessage(), containsString("frozen"));

    thrown = expectThrows(RuntimeException.class, () -> {
        index.setSimilarity(new BM25Similarity(1, 1));
    });
    assertThat(thrown.getMessage(), containsString("frozen"));

    // Searching a frozen index still works.
    assertThat(index.search(new TermQuery(new Term("f1", "some"))), not(is(0.0f)));

    // reset() unfreezes and clears the index: old content is gone, new content indexes.
    index.reset();
    index.addField("f1", "wibble", analyzer);
    assertThat(index.search(new TermQuery(new Term("f1", "some"))), is(0.0f));
    assertThat(index.search(new TermQuery(new Term("f1", "wibble"))), not(is(0.0f)));

    // The similarity is mutable again after reset.
    index.setSimilarity(new ClassicSimilarity());
}
Use of org.apache.lucene.search.similarities.BM25Similarity in the Anserini project by castorini:
the main method of the DumpTweetsLtrData class.
/**
 * Entry point: replays microblog topics against a tweet index and dumps
 * learning-to-rank training data via the reranker cascade.
 *
 * <p>Fixes over the previous version: the index reader and output stream are
 * now closed via try-with-resources even when a query throws mid-run, and the
 * average-latency computation no longer divides by zero when the topic file
 * is empty.</p>
 *
 * @param argv command-line arguments parsed into {@link LtrArgs}
 * @throws Exception on any I/O or search failure
 */
public static void main(String[] argv) throws Exception {
long curTime = System.nanoTime();
LtrArgs args = new LtrArgs();
CmdLineParser parser = new CmdLineParser(args, ParserProperties.defaults().withUsageWidth(90));
try {
  parser.parseArgument(argv);
} catch (CmdLineException e) {
  System.err.println(e.getMessage());
  parser.printUsage(System.err);
  System.err.println("Example: DumpTweetsLtrData" + parser.printExample(OptionHandlerFilter.REQUIRED));
  return;
}
LOG.info("Reading index at " + args.index);
Directory dir = FSDirectory.open(Paths.get(args.index));
// try-with-resources: the reader and the output stream were previously leaked
// if any query below threw an exception.
try (IndexReader reader = DirectoryReader.open(dir);
     PrintStream out = new PrintStream(new FileOutputStream(new File(args.output)))) {
  IndexSearcher searcher = new IndexSearcher(reader);
  if (args.ql) {
    LOG.info("Using QL scoring model");
    searcher.setSimilarity(new LMDirichletSimilarity(args.mu));
  } else if (args.bm25) {
    LOG.info("Using BM25 scoring model");
    searcher.setSimilarity(new BM25Similarity(args.k1, args.b));
  } else {
    LOG.error("Error: Must specify scoring model!");
    System.exit(-1);
  }
  Qrels qrels = new Qrels(args.qrels);
  FeatureExtractors extractors = null;
  if (args.extractors != null) {
    extractors = FeatureExtractors.loadExtractor(args.extractors);
  }
  RerankerCascade cascade = new RerankerCascade();
  cascade.add(new RemoveRetweetsTemporalTiebreakReranker());
  cascade.add(new TweetsLtrDataGenerator(out, qrels, extractors));
  MicroblogTopicSet topics = MicroblogTopicSet.fromFile(new File(args.topics));
  LOG.info("Initialized complete! (elapsed time = " + (System.nanoTime() - curTime) / 1000000 + "ms)");
  long totalTime = 0;
  int cnt = 0;
  for (MicroblogTopic topic : topics) {
    long curQueryTime = System.nanoTime();
    // Temporal constraint: only tweets posted at or before the query time.
    Query filter = LongPoint.newRangeQuery(StatusField.ID.name, 0L, topic.getQueryTweetTime());
    Query query = AnalyzerUtils.buildBagOfWordsQuery(StatusField.TEXT.name, IndexTweets.ANALYZER, topic.getQuery());
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    builder.add(filter, BooleanClause.Occur.FILTER);
    builder.add(query, BooleanClause.Occur.MUST);
    Query q = builder.build();
    TopDocs rs = searcher.search(q, args.hits);
    List<String> queryTokens = AnalyzerUtils.tokenize(IndexTweets.ANALYZER, topic.getQuery());
    RerankerContext context = new RerankerContext(searcher, query, topic.getId(), topic.getQuery(), queryTokens, StatusField.TEXT.name, filter);
    cascade.run(ScoredDocuments.fromTopDocs(rs, searcher), context);
    long qtime = (System.nanoTime() - curQueryTime) / 1000000;
    LOG.info("Query " + topic.getId() + " (elapsed time = " + qtime + "ms)");
    totalTime += qtime;
    cnt++;
  }
  LOG.info("All queries completed!");
  LOG.info("Total elapsed time = " + totalTime + "ms");
  // Guard against division by zero when the topic file contains no topics.
  if (cnt > 0) {
    LOG.info("Average query latency = " + (totalTime / cnt) + "ms");
  } else {
    LOG.warn("No queries were run; cannot compute average latency.");
  }
}
}
Use of org.apache.lucene.search.similarities.BM25Similarity in the Anserini project by castorini:
the search method of the RetrieveSentences class.
/**
 * Runs each topic as a bag-of-words query against the index and collects the
 * BM25 scores of the top hits.
 *
 * <p>Cleanup: removed dead code from the previous version — the unused
 * {@code qID} and {@code hits} locals and a {@code QueryParser} that was
 * constructed and configured but never used (queries are built with
 * {@code AnalyzerUtils.buildBagOfWordsQuery} instead).</p>
 *
 * @param topics  map of topic id to query string; iterated in key order
 * @param numHits maximum number of hits to retrieve per query
 * @return document-id to score map, in retrieval order across all topics
 * @throws IOException    if the index cannot be searched
 * @throws ParseException retained for interface compatibility with callers
 */
public Map<String, Float> search(SortedMap<Integer, String> topics, int numHits) throws IOException, ParseException {
IndexSearcher searcher = new IndexSearcher(reader);
// BM25 with k1 = 0.9, b = 0.4 (the parameters hard-coded by the original).
Similarity similarity = new BM25Similarity(0.9f, 0.4f);
searcher.setSimilarity(similarity);
EnglishAnalyzer ea = new EnglishAnalyzer();
// LinkedHashMap preserves retrieval order for the returned scores.
Map<String, Float> scoredDocs = new LinkedHashMap<>();
for (String queryString : topics.values()) {
  Query query = AnalyzerUtils.buildBagOfWordsQuery(FIELD_BODY, ea, queryString);
  TopDocs rs = searcher.search(query, numHits);
  ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
  for (int i = 0; i < docs.documents.length; i++) {
    scoredDocs.put(docs.documents[i].getField(FIELD_ID).stringValue(), docs.scores[i]);
  }
}
return scoredDocs;
}
Use of org.apache.lucene.search.similarities.BM25Similarity in the Anserini project by castorini:
the buildTestIndex method of the SdmQueryTest class.
/**
 * Builds a tiny single-document test index in {@code tempDir1} using BM25
 * scoring, with positions indexed and the field stored.
 *
 * <p>Fixes over the previous version: removed a duplicate
 * {@code setOpenMode(CREATE)} call, and the writer is now closed via
 * try-with-resources so it is released even if indexing throws.</p>
 *
 * @throws IOException if the index cannot be created or written
 */
private void buildTestIndex() throws IOException {
  Directory dir = FSDirectory.open(tempDir1);
  IndexWriterConfig config = new IndexWriterConfig(analyzer);
  config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
  config.setSimilarity(new BM25Similarity());
  // Positions are required for the SDM (sequential dependence) phrase queries.
  FieldType textOptions = new FieldType();
  textOptions.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS);
  textOptions.setStored(true);
  textOptions.setTokenized(true);
  try (IndexWriter writer = new IndexWriter(dir, config)) {
    Document doc1 = new Document();
    doc1.add(new Field(field, "john fox information river chicken bush frank retrieval world", textOptions));
    writer.addDocument(doc1);
    writer.commit();
    // Merge to a single segment so scoring is deterministic across runs.
    writer.forceMerge(1);
  }
}
Aggregations