use of org.apache.lucene.search.similarities.Similarity in project Anserini by castorini.
the class PyseriniEntryPoint method search.
/**
 * Runs a single free-text query through the multi-topic {@code search} overload.
 *
 * <p>The scoring model (BM25 with k1=0.9, b=0.4) and the reranker (identity only) are fixed
 * here; no command-line arguments are consulted.
 *
 * @param query the query text; it is registered as the only topic, under id 1
 * @param numHits forwarded unchanged to the multi-topic overload (presumably the maximum
 *     number of hits to return; confirm against that overload)
 * @return whatever the multi-topic overload returns for this one topic
 * @throws IOException propagated from the underlying search
 * @throws ParseException propagated from the underlying search
 */
public Map<String, Float> search(String query, int numHits) throws IOException, ParseException {
  // for now, BM25 only - not branching on args.bm25 or args.ql
  final float k1 = 0.9f;
  final float b = 0.4f;
  final Similarity bm25 = new BM25Similarity(k1, b);

  // for now, a topics map holding just this query under the fixed id 1
  final SortedMap<Integer, String> singleTopic = new TreeMap<>();
  singleTopic.put(1, query);

  // for now, identity reranking only - not branching on args.rm3
  final RerankerCascade rerankers = new RerankerCascade();
  rerankers.add(new IdentityReranker());

  return search(singleTopic, bm25, numHits, rerankers, false, false);
}
use of org.apache.lucene.search.similarities.Similarity in project Anserini by castorini.
the class LookupTopic method search.
/**
 * Prints query results to the standard output stream.
 *
 * <p>Up to three lookups are tried in order, stopping at the first one that returns hits:
 * <ol>
 *   <li>a query on the title field,</li>
 *   <li>a query on the label field,</li>
 *   <li>a multi-field (title, label, text) OR query scored with BM25 (k1=1.5, b=0.75).</li>
 * </ol>
 * At most 20 hits are printed per lookup.
 *
 * @param queryName the entity name to search
 * @throws Exception on error
 */
public void search(String queryName) throws Exception {
  LOG.info("Querying started...");
  // Initialize index searcher
  IndexSearcher searcher = new IndexSearcher(reader);
  SimpleAnalyzer analyzer = new SimpleAnalyzer();
  int numHits = 20;

  // find exact title
  QueryParser titleParser = new QueryParser(TopicLuceneDocumentGenerator.FIELD_TITLE, analyzer);
  Query titleQuery = titleParser.parse(queryName);
  if (printHits(searcher, titleQuery, numHits, "\n\n") != 0) {
    System.out.println("Exact WIKI_TITLE found! Ending search.");
    return;
  }
  System.out.println("Exact WIKI_TITLE not found. Searching for the label...");
  System.out.println();

  // find exact label
  QueryParser labelParser = new QueryParser(TopicLuceneDocumentGenerator.FIELD_LABEL, analyzer);
  Query labelQuery = labelParser.parse(queryName);
  if (printHits(searcher, labelQuery, numHits, "\n\n") != 0) {
    System.out.println("Exact W3_LABEL found! Ending search.");
    return;
  }
  System.out.println("Exact W3_LABEL not found. Ranking the topics using BM25 according the text/title/label...");
  System.out.println();

  // Fall back to BM25 ranking over title, label and text. The similarity is only switched
  // here, so the two lookups above run with the searcher's default similarity.
  float k1 = 1.5f;
  float b = 0.75f;
  Similarity similarity = new BM25Similarity(k1, b);
  searcher.setSimilarity(similarity);
  MultiFieldQueryParser queryParser = new MultiFieldQueryParser(new String[] { TopicLuceneDocumentGenerator.FIELD_TITLE, TopicLuceneDocumentGenerator.FIELD_LABEL, TopicLuceneDocumentGenerator.FIELD_TEXT }, analyzer);
  queryParser.setDefaultOperator(QueryParser.Operator.OR);
  Query query = queryParser.parse(queryName);
  // This last listing ends each entry with a single newline instead of two.
  printHits(searcher, query, numHits, "\n");
  LOG.info("Querying completed.");
}

/**
 * Runs {@code query}, prints every hit (rank, score, topic mid, title, label) to standard
 * output and reports how many hits were printed.
 *
 * @param searcher the searcher to run the query against
 * @param query the parsed query
 * @param numHits the maximum number of hits to request
 * @param terminator text appended to each formatted entry; it is spliced into the format
 *     string, so it must not contain {@code %}
 * @return the number of hits printed
 * @throws Exception on error (declared broadly to mirror the calling method)
 */
private static int printHits(IndexSearcher searcher, Query query, int numHits, String terminator) throws Exception {
  TopDocs rs = searcher.search(query, numHits);
  ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
  String format = "%d - SCORE: %f\nTOPIC_MID: %s\nWIKI_TITLE: %s\nW3_LABEL: %s" + terminator;
  for (int i = 0; i < docs.documents.length; i++) {
    String resultDoc = String.format(format, (i + 1), docs.scores[i], docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TOPIC_MID).stringValue(), docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TITLE).stringValue(), docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_LABEL).stringValue());
    System.out.println(resultDoc);
  }
  return docs.documents.length;
}
use of org.apache.lucene.search.similarities.Similarity in project Anserini by castorini.
the class SearchWebCollection method main.
/**
 * Command-line entry point: parses {@code SearchArgs}, selects the scoring model and
 * reranker cascade, reads the topics file and runs every topic against the index.
 *
 * <p>Exits with status -1 if neither the QL nor the BM25 scoring model is selected. On a
 * command-line parsing error, usage is printed to standard error and the method returns.
 *
 * @param args command-line arguments bound to {@code SearchArgs}
 * @throws IllegalArgumentException if the topics file is missing, not a regular file, or unreadable
 * @throws Exception on any other failure while setting up or running the search
 */
public static void main(String[] args) throws Exception {
  SearchArgs searchArgs = new SearchArgs();
  CmdLineParser parser = new CmdLineParser(searchArgs, ParserProperties.defaults().withUsageWidth(90));
  try {
    parser.parseArgument(args);
  } catch (CmdLineException e) {
    System.err.println(e.getMessage());
    parser.printUsage(System.err);
    System.err.println("Example: SearchWebCollection" + parser.printExample(OptionHandlerFilter.REQUIRED));
    return;
  }
  LOG.info("Reading index at " + searchArgs.index);
  // NOTE(review): this directory is opened but never handed to the searcher below, which is
  // built from searchArgs.index directly - confirm whether the inmem option has any effect.
  Directory dir;
  if (searchArgs.inmem) {
    LOG.info("Using MMapDirectory with preload");
    dir = new MMapDirectory(Paths.get(searchArgs.index));
    ((MMapDirectory) dir).setPreload(true);
  } else {
    LOG.info("Using default FSDirectory");
    dir = FSDirectory.open(Paths.get(searchArgs.index));
  }
  // Feature dump stream; stays null unless feature dumping is requested.
  PrintStream featureOut = null;
  try {
    Similarity similarity = null;
    if (searchArgs.ql) {
      LOG.info("Using QL scoring model");
      similarity = new LMDirichletSimilarity(searchArgs.mu);
    } else if (searchArgs.bm25) {
      LOG.info("Using BM25 scoring model");
      similarity = new BM25Similarity(searchArgs.k1, searchArgs.b);
    } else {
      LOG.error("Error: Must specify scoring model!");
      System.exit(-1);
    }
    RerankerCascade cascade = new RerankerCascade();
    boolean useQueryParser = false;
    if (searchArgs.rm3) {
      cascade.add(new Rm3Reranker(new EnglishAnalyzer(), FIELD_BODY, "src/main/resources/io/anserini/rerank/rm3/rm3-stoplist.gov2.txt"));
      useQueryParser = true;
    } else {
      cascade.add(new IdentityReranker());
    }
    FeatureExtractors extractors = null;
    if (searchArgs.extractors != null) {
      extractors = FeatureExtractors.loadExtractor(searchArgs.extractors);
    }
    if (searchArgs.dumpFeatures) {
      featureOut = new PrintStream(searchArgs.featureFile);
      Qrels qrels = new Qrels(searchArgs.qrels);
      cascade.add(new WebCollectionLtrDataGenerator(featureOut, qrels, extractors));
    }
    Path topicsFile = Paths.get(searchArgs.topics);
    if (!Files.exists(topicsFile) || !Files.isRegularFile(topicsFile) || !Files.isReadable(topicsFile)) {
      throw new IllegalArgumentException("Topics file : " + topicsFile + " does not exist or is not a (readable) file.");
    }
    // The topic reader class is resolved by name from the topicReader argument.
    TopicReader tr = (TopicReader) Class.forName("io.anserini.search.query." + searchArgs.topicReader + "TopicReader").getConstructor(Path.class).newInstance(topicsFile);
    SortedMap<Integer, String> topics = tr.read();
    final long start = System.nanoTime();
    SearchWebCollection searcher = new SearchWebCollection(searchArgs.index);
    try {
      searcher.search(topics, searchArgs.output, similarity, searchArgs.hits, cascade, useQueryParser, searchArgs.keepstop);
    } finally {
      // Release the searcher even when the search itself fails.
      searcher.close();
    }
    final long durationMillis = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    LOG.info("Total " + topics.size() + " topics searched in " + DurationFormatUtils.formatDuration(durationMillis, "HH:mm:ss"));
  } finally {
    // Close the feature file (PrintStream.close() is safe to call more than once) and the
    // directory opened above, on both the success and the failure path.
    if (featureOut != null) {
      featureOut.close();
    }
    dir.close();
  }
}
use of org.apache.lucene.search.similarities.Similarity in project elasticsearch by elastic.
the class AllTermQuery method createWeight.
/**
 * Creates the {@link Weight} for this query.
 *
 * <p>When scores are not needed, the work is delegated to a plain {@link TermQuery} on the
 * same term. Otherwise the returned weight scores through the searcher's similarity and
 * produces {@code AllTermScorer} instances that read postings with payloads.
 */
@Override
public Weight createWeight(IndexSearcher searcher, boolean needsScores) throws IOException {
  if (!needsScores) {
    return new TermQuery(term).createWeight(searcher, needsScores);
  }

  final TermContext termContext = TermContext.build(searcher.getTopReaderContext(), term);
  final CollectionStatistics fieldStats = searcher.collectionStatistics(term.field());
  final TermStatistics termStats = searcher.termStatistics(term, termContext);
  final Similarity sim = searcher.getSimilarity(needsScores);
  final SimWeight simWeight = sim.computeWeight(fieldStats, termStats);

  return new Weight(this) {

    @Override
    public float getValueForNormalization() throws IOException {
      return simWeight.getValueForNormalization();
    }

    @Override
    public void normalize(float norm, float topLevelBoost) {
      simWeight.normalize(norm, topLevelBoost);
    }

    @Override
    public void extractTerms(Set<Term> terms) {
      terms.add(term);
    }

    @Override
    public Explanation explain(LeafReaderContext context, int doc) throws IOException {
      AllTermScorer termScorer = scorer(context);
      // No scorer for this segment, or the scorer does not land exactly on doc: no match.
      if (termScorer == null || termScorer.iterator().advance(doc) != doc) {
        return Explanation.noMatch("no matching term");
      }

      float score = termScorer.score();
      float freq = termScorer.freq();
      SimScorer docScorer = sim.simScorer(simWeight, context);
      Explanation freqExplanation = Explanation.match(freq, "termFreq=" + freq);
      Explanation termScoreExplanation = docScorer.explain(doc, freqExplanation);
      Explanation payloadBoostExplanation =
          Explanation.match(termScorer.payloadBoost(), "payloadBoost=" + termScorer.payloadBoost());
      String description =
          "weight(" + getQuery() + " in " + doc + ") [" + sim.getClass().getSimpleName() + "], product of:";
      return Explanation.match(score, description, termScoreExplanation, payloadBoostExplanation);
    }

    @Override
    public AllTermScorer scorer(LeafReaderContext context) throws IOException {
      final Terms fieldTerms = context.reader().terms(term.field());
      if (fieldTerms == null) {
        return null;
      }

      final TermsEnum termsEnum = fieldTerms.iterator();
      if (termsEnum == null) {
        return null;
      }

      final TermState segmentState = termContext.get(context.ord);
      if (segmentState == null) {
        // Term does not exist in this segment
        return null;
      }

      // Position the enum on the term using the state captured when the weight was built.
      termsEnum.seekExact(term.bytes(), segmentState);
      PostingsEnum postings = termsEnum.postings(null, PostingsEnum.PAYLOADS);
      assert postings != null;
      return new AllTermScorer(this, postings, sim.simScorer(simWeight, context));
    }
  };
}
use of org.apache.lucene.search.similarities.Similarity in project elasticsearch by elastic.
the class IndexModuleTests method testAddSimilarity.
/**
 * Registers a custom similarity type on an {@code IndexModule} and checks that an index
 * configured with a similarity of that type exposes it through the similarity service,
 * with the configured name and the provider-level "key" setting passed through.
 */
public void testAddSimilarity() throws IOException {
  // Index settings declaring a similarity named "my_similarity" of type "test_similarity".
  Settings indexSettings = Settings.builder()
      .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
      .put("index.similarity.my_similarity.type", "test_similarity")
      .put("index.similarity.my_similarity.key", "there is a key")
      .put(Environment.PATH_HOME_SETTING.getKey(), createTempDir().toString())
      .build();
  IndexModule module = new IndexModule(
      IndexSettingsModule.newIndexSettings("foo", indexSettings),
      new AnalysisRegistry(environment, emptyMap(), emptyMap(), emptyMap(), emptyMap(), emptyMap()));

  // Factory for the "test_similarity" type: the provider echoes the name it was given and
  // builds a TestSimilarity from the provider-level "key" setting.
  module.addSimilarity("test_similarity", (similarityName, similaritySettings, indexLevelSettings) -> new SimilarityProvider() {
    @Override
    public String name() {
      return similarityName;
    }

    @Override
    public Similarity get() {
      return new TestSimilarity(similaritySettings.get("key"));
    }
  });

  IndexService indexService = newIndexService(module);
  SimilarityService similarityService = indexService.similarityService();

  assertNotNull(similarityService.getSimilarity("my_similarity"));
  assertTrue(similarityService.getSimilarity("my_similarity").get() instanceof TestSimilarity);
  assertEquals("my_similarity", similarityService.getSimilarity("my_similarity").name());
  assertEquals("there is a key", ((TestSimilarity) similarityService.getSimilarity("my_similarity").get()).key);

  indexService.close("simon says", false);
}
Aggregations