Use of org.apache.lucene.search.similarities.BM25Similarity in the Anserini project (by castorini):
class SimpleSearcher, method setBM25.
/**
 * Configures BM25 as the scoring function for subsequent searches.
 *
 * @param k1 BM25 k1 parameter
 * @param b BM25 b parameter
 */
public void setBM25(float k1, float b) {
  BM25Similarity bm25 = new BM25Similarity(k1, b);
  this.similarity = bm25;
  // A fresh IndexSearcher is required so the new similarity takes effect.
  searcher = new IndexSearcher(reader);
  searcher.setSimilarity(bm25);
}
Use of org.apache.lucene.search.similarities.BM25Similarity in the Anserini project (by castorini):
class IndexReaderUtils, method getBM25AnalyzedTermWeightWithParameters.
/**
 * Computes the BM25 weight of an analyzed term in a particular document.
 *
 * @param reader index reader
 * @param docid collection docid
 * @param term analyzed term
 * @param k1 k1 setting for BM25
 * @param b b setting for BM25
 * @return BM25 weight of the term in the specified document
 * @throws IOException if error encountered during query
 */
public static float getBM25AnalyzedTermWeightWithParameters(IndexReader reader, String docid, String term, float k1, float b) throws IOException {
  // Strategy: issue a single-term query conjoined with a constant-score filter clause that pins
  // the search to exactly the requested docid, then read the BM25 score off the lone hit.
  //
  // This is inefficient, but has the advantage of reusing Lucene's existing similarity, which
  // means we never need to copy the scoring function and keep it in sync wrt code updates.
  IndexSearcher searcher = new IndexSearcher(reader);
  searcher.setSimilarity(new BM25Similarity(k1, b));

  BooleanQuery.Builder builder = new BooleanQuery.Builder()
      .add(new ConstantScoreQuery(new TermQuery(new Term(IndexArgs.ID, docid))), BooleanClause.Occur.MUST)
      .add(new TermQuery(new Term(IndexArgs.CONTENTS, term)), BooleanClause.Occur.MUST);
  TopDocs hits = searcher.search(builder.build(), 1);

  // Zero results indicates that the term isn't found in the document. Otherwise, subtract the
  // constant 1 contributed by the ConstantScoreQuery filter clause to recover the bare weight.
  if (hits.scoreDocs.length == 0) {
    return 0;
  }
  return hits.scoreDocs[0].score - 1;
}
Use of org.apache.lucene.search.similarities.BM25Similarity in the Anserini project (by castorini):
class IndexCollection, method run.
/**
 * Runs the end-to-end indexing job: builds the Lucene IndexWriter (local indexing only),
 * dispatches one indexing task per collection segment to a fixed thread pool, waits for all
 * tasks to finish, performs the final commit, and reports final counter values.
 *
 * @return counters accumulated during indexing (indexed, unindexable, empty, skipped, errors)
 * @throws IOException if the index directory cannot be opened or the writer fails
 */
public Counters run() throws IOException {
  final long start = System.nanoTime();
  LOG.info("============ Indexing Collection ============");

  int numThreads = args.threads;
  IndexWriter writer = null;

  // Used for LocalIndexerThread; when targeting Solr/ES, indexPath is null and no writer is built.
  if (indexPath != null) {
    final Directory dir = FSDirectory.open(indexPath);

    // Select the analyzer: collection type first, then language code, then the pretokenized
    // flag, falling back to Anserini's default English analyzer. Analyzers are constructed
    // lazily here; the original eagerly instantiated all of them and used only one.
    final IndexWriterConfig config;
    if (args.collectionClass.equals("TweetCollection")) {
      config = new IndexWriterConfig(new TweetAnalyzer(args.tweetStemming));
    } else if (args.language.equals("ar")) {
      config = new IndexWriterConfig(new ArabicAnalyzer());
    } else if (args.language.equals("bn")) {
      config = new IndexWriterConfig(new BengaliAnalyzer());
    } else if (args.language.equals("da")) {
      config = new IndexWriterConfig(new DanishAnalyzer());
    } else if (args.language.equals("de")) {
      config = new IndexWriterConfig(new GermanAnalyzer());
    } else if (args.language.equals("es")) {
      config = new IndexWriterConfig(new SpanishAnalyzer());
    } else if (args.language.equals("fi")) {
      config = new IndexWriterConfig(new FinnishAnalyzer());
    } else if (args.language.equals("fr")) {
      config = new IndexWriterConfig(new FrenchAnalyzer());
    } else if (args.language.equals("hi")) {
      config = new IndexWriterConfig(new HindiAnalyzer());
    } else if (args.language.equals("hu")) {
      config = new IndexWriterConfig(new HungarianAnalyzer());
    } else if (args.language.equals("id")) {
      config = new IndexWriterConfig(new IndonesianAnalyzer());
    } else if (args.language.equals("it")) {
      config = new IndexWriterConfig(new ItalianAnalyzer());
    } else if (args.language.equals("ja")) {
      config = new IndexWriterConfig(new JapaneseAnalyzer());
    } else if (args.language.equals("nl")) {
      config = new IndexWriterConfig(new DutchAnalyzer());
    } else if (args.language.equals("no")) {
      config = new IndexWriterConfig(new NorwegianAnalyzer());
    } else if (args.language.equals("pt")) {
      config = new IndexWriterConfig(new PortugueseAnalyzer());
    } else if (args.language.equals("ru")) {
      config = new IndexWriterConfig(new RussianAnalyzer());
    } else if (args.language.equals("sv")) {
      config = new IndexWriterConfig(new SwedishAnalyzer());
    } else if (args.language.equals("th")) {
      config = new IndexWriterConfig(new ThaiAnalyzer());
    } else if (args.language.equals("tr")) {
      config = new IndexWriterConfig(new TurkishAnalyzer());
    } else if (args.language.equals("zh") || args.language.equals("ko")) {
      config = new IndexWriterConfig(new CJKAnalyzer());
    } else if (args.language.equals("sw") || args.language.equals("te")) {
      // For Mr.TyDi: sw and te do not have custom Lucene analyzers, so just use whitespace analyzer.
      config = new IndexWriterConfig(new WhitespaceAnalyzer());
    } else if (args.pretokenized) {
      config = new IndexWriterConfig(new WhitespaceAnalyzer());
    } else {
      config = new IndexWriterConfig(DefaultEnglishAnalyzer.fromArguments(args.stemmer, args.keepStopwords, args.stopwords));
    }

    // BUG FIX: the impact/BM25 selection below used to run unconditionally, overwriting the
    // AccurateBM25Similarity set for bm25Accurate and silently turning that flag into a no-op.
    // Setting it here is necessary during indexing because the norm used by BM25 is already
    // determined at index time.
    if (args.bm25Accurate) {
      config.setSimilarity(new AccurateBM25Similarity());
    } else if (args.impact) {
      config.setSimilarity(new ImpactSimilarity());
    } else {
      config.setSimilarity(new BM25Similarity());
    }

    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    config.setRAMBufferSizeMB(args.memorybufferSize);
    config.setUseCompoundFile(false);
    config.setMergeScheduler(new ConcurrentMergeScheduler());
    writer = new IndexWriter(dir, config);
  }

  final ThreadPoolExecutor executor = (ThreadPoolExecutor) Executors.newFixedThreadPool(numThreads);
  LOG.info("Thread pool with " + numThreads + " threads initialized.");
  LOG.info("Initializing collection in " + collectionPath.toString());

  List<?> segmentPaths = collection.getSegmentPaths();
  // When sharding is requested, index only this shard's slice of the segments.
  if (args.shardCount > 1) {
    segmentPaths = collection.getSegmentPaths(args.shardCount, args.shardCurrent);
  }
  final int segmentCnt = segmentPaths.size();
  LOG.info(String.format("%,d %s found", segmentCnt, (segmentCnt == 1 ? "file" : "files")));
  LOG.info("Starting to index...");

  // One task per segment; each task routes to the backend selected by the args.
  for (int i = 0; i < segmentCnt; i++) {
    if (args.solr) {
      executor.execute(new SolrIndexerThread(collection, (Path) segmentPaths.get(i)));
    } else if (args.es) {
      executor.execute(new ESIndexerThread(collection, (Path) segmentPaths.get(i)));
    } else {
      executor.execute(new LocalIndexerThread(writer, collection, (Path) segmentPaths.get(i)));
    }
  }
  executor.shutdown();

  try {
    // Wait for existing tasks to terminate, logging progress once a minute.
    while (!executor.awaitTermination(1, TimeUnit.MINUTES)) {
      if (segmentCnt == 1) {
        LOG.info(String.format("%,d documents indexed", counters.indexed.get()));
      } else {
        LOG.info(String.format("%.2f%% of files completed, %,d documents indexed",
            (double) executor.getCompletedTaskCount() / segmentCnt * 100.0d, counters.indexed.get()));
      }
    }
  } catch (InterruptedException ie) {
    // (Re-)Cancel if current thread also interrupted.
    executor.shutdownNow();
    // Preserve interrupt status.
    Thread.currentThread().interrupt();
  }

  if (segmentCnt != executor.getCompletedTaskCount()) {
    throw new RuntimeException("totalFiles = " + segmentCnt + " is not equal to completedTaskCount = " + executor.getCompletedTaskCount());
  }

  long numIndexed;
  if (args.solr || args.es) {
    numIndexed = counters.indexed.get();
  } else {
    numIndexed = writer.getDocStats().maxDoc;
  }

  // Do a final commit.
  if (args.solr) {
    try {
      SolrClient client = solrPool.borrowObject();
      client.commit(args.solrIndex);
      // Needed for orderly shutdown so the SolrClient executor does not delay main thread exit
      solrPool.returnObject(client);
      solrPool.close();
    } catch (Exception e) {
      LOG.error("Exception during final Solr commit: ", e);
    }
  }
  if (args.es) {
    esPool.close();
  }

  try {
    if (writer != null) {
      writer.commit();
      if (args.optimize) {
        writer.forceMerge(1);
      }
    }
  } finally {
    try {
      if (writer != null) {
        writer.close();
      }
    } catch (IOException e) {
      // It is possible that this happens... but nothing much we can do at this point,
      // so just log the error and move on.
      LOG.error(e);
    }
  }

  if (numIndexed != counters.indexed.get()) {
    LOG.warn("Unexpected difference between number of indexed documents and index maxDoc.");
  }

  LOG.info(String.format("Indexing Complete! %,d documents indexed", numIndexed));
  LOG.info("============ Final Counter Values ============");
  LOG.info(String.format("indexed: %,12d", counters.indexed.get()));
  LOG.info(String.format("unindexable: %,12d", counters.unindexable.get()));
  LOG.info(String.format("empty: %,12d", counters.empty.get()));
  LOG.info(String.format("skipped: %,12d", counters.skipped.get()));
  LOG.info(String.format("errors: %,12d", counters.errors.get()));

  final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
  LOG.info(String.format("Total %,d documents indexed in %s", numIndexed, DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss")));
  return counters;
}
Use of org.apache.lucene.search.similarities.BM25Similarity in the Anserini project (by castorini):
class BasicIndexOperationsTest, method testIterateThroughDocumentVectorComputeBM25.
// This test case iterates through all documents in the index and prints out the document vector:
// For each term, we print out the term frequency and the BM25 weight.
@Test
public void testIterateThroughDocumentVectorComputeBM25() throws Exception {
  // BUG FIX: the original leaked both the Directory and the IndexReader; try-with-resources
  // guarantees they are closed even if an assertion or I/O error aborts the loop.
  try (Directory dir = FSDirectory.open(tempDir1);
       IndexReader reader = DirectoryReader.open(dir)) {
    IndexSearcher searcher = new IndexSearcher(reader);
    searcher.setSimilarity(new BM25Similarity());
    int numDocs = reader.numDocs();
    // Iterate through the document vectors.
    for (int i = 0; i < numDocs; i++) {
      String docid = reader.document(i).getField("id").stringValue();
      System.out.println(reader.document(i));
      System.out.println(i + ": " + docid);
      Terms terms = reader.getTermVector(i, "contents");
      TermsEnum te = terms.iterator();
      // For this document, iterate through the terms.
      while (te.next() != null) {
        String term = new Term("contents", te.term()).bytes().utf8ToString();
        long tf = te.totalTermFreq();
        // The way to compute the BM25 score is to issue a query with the exact docid and the
        // term in question, and look at the retrieval score.
        // the docid
        Query filterQuery = new ConstantScoreQuery(new TermQuery(new Term("id", docid)));
        // the term
        Query termQuery = new TermQuery(new Term("contents", term));
        // must have both
        BooleanQuery.Builder builder = new BooleanQuery.Builder();
        builder.add(filterQuery, BooleanClause.Occur.MUST);
        builder.add(termQuery, BooleanClause.Occur.MUST);
        Query finalQuery = builder.build();
        // issue the query
        TopDocs rs = searcher.search(finalQuery, 1);
        // The BM25 weight is the hit's score minus the filter clause's constant score of 1;
        // NaN when the term/docid pair yields no hit.
        System.out.println(term + " " + tf + " " + (rs.scoreDocs.length == 0 ? Float.NaN : rs.scoreDocs[0].score - 1));
      }
    }
  }
}
Use of org.apache.lucene.search.similarities.BM25Similarity in the Elasticsearch project (by elastic):
class SimilarityTests, method testResolveSimilaritiesFromMapping_bm25.
/**
 * Verifies that a custom BM25 similarity declared in index settings is resolved from the
 * field mapping and carries the configured k1, b, and discount_overlaps values.
 */
public void testResolveSimilaritiesFromMapping_bm25() throws IOException {
  // Index settings: define "my_similarity" as BM25 with non-default parameters.
  Settings indexSettings = Settings.builder()
      .put("index.similarity.my_similarity.type", "BM25")
      .put("index.similarity.my_similarity.k1", 2.0f)
      .put("index.similarity.my_similarity.b", 0.5f)
      .put("index.similarity.my_similarity.discount_overlaps", false)
      .build();

  // Mapping: a single text field that references the custom similarity by name.
  String mapping = XContentFactory.jsonBuilder()
      .startObject()
        .startObject("type")
          .startObject("properties")
            .startObject("field1")
              .field("type", "text")
              .field("similarity", "my_similarity")
            .endObject()
          .endObject()
        .endObject()
      .endObject()
      .string();

  IndexService indexService = createIndex("foo", indexSettings);
  DocumentMapper documentMapper =
      indexService.mapperService().documentMapperParser().parse("type", new CompressedXContent(mapping));

  // The mapper must resolve the named similarity to a BM25 provider...
  assertThat(documentMapper.mappers().getMapper("field1").fieldType().similarity(), instanceOf(BM25SimilarityProvider.class));

  // ...and the underlying Lucene similarity must reflect the configured settings.
  BM25Similarity similarity = (BM25Similarity) documentMapper.mappers().getMapper("field1").fieldType().similarity().get();
  assertThat(similarity.getK1(), equalTo(2.0f));
  assertThat(similarity.getB(), equalTo(0.5f));
  assertThat(similarity.getDiscountOverlaps(), equalTo(false));
}
Aggregations