use of org.apache.lucene.analysis.core.SimpleAnalyzer in project Anserini by castorini.
the class LookupTopic method search.
/**
 * Prints query results to the standard output stream.
 *
 * <p>The lookup runs in up to three stages and returns as soon as one stage produces hits:
 * a query on the title field, then a query on the label field, and finally a BM25-ranked
 * query across the title, label and text fields. At most 20 hits are requested per stage.
 *
 * @param queryName the entity name to search
 * @throws Exception on error
 */
public void search(String queryName) throws Exception {
  LOG.info("Querying started...");

  // The first two stages print an extra blank line after each hit; the final stage does not.
  final String stagedHitFormat = "%d - SCORE: %f\nTOPIC_MID: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n\n";
  final String rankedHitFormat = "%d - SCORE: %f\nTOPIC_MID: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n";

  // Initialize index searcher
  IndexSearcher searcher = new IndexSearcher(reader);
  SimpleAnalyzer analyzer = new SimpleAnalyzer();
  int numHits = 20;

  // Stage 1: query the title field only.
  QueryParser titleParser = new QueryParser(TopicLuceneDocumentGenerator.FIELD_TITLE, analyzer);
  Query titleQuery = titleParser.parse(queryName);
  ScoredDocuments docs = ScoredDocuments.fromTopDocs(searcher.search(titleQuery, numHits), searcher);
  printTopicHits(docs, stagedHitFormat);
  if (docs.documents.length != 0) {
    // Early exit: note that the "Querying completed." log line is only written by stage 3.
    System.out.println("Exact WIKI_TITLE found! Ending search.");
    return;
  }
  System.out.println("Exact WIKI_TITLE not found. Searching for the label...");
  System.out.println();

  // Stage 2: query the label field only.
  QueryParser labelParser = new QueryParser(TopicLuceneDocumentGenerator.FIELD_LABEL, analyzer);
  Query labelQuery = labelParser.parse(queryName);
  docs = ScoredDocuments.fromTopDocs(searcher.search(labelQuery, numHits), searcher);
  printTopicHits(docs, stagedHitFormat);
  if (docs.documents.length != 0) {
    System.out.println("Exact W3_LABEL found! Ending search.");
    return;
  }
  System.out.println("Exact W3_LABEL not found. Ranking the topics using BM25 according the text/title/label...");
  System.out.println();

  // Stage 3: BM25 ranking over title, label and text, with terms OR-ed together.
  float k1 = 1.5f;
  float b = 0.75f;
  Similarity similarity = new BM25Similarity(k1, b);
  searcher.setSimilarity(similarity);
  MultiFieldQueryParser queryParser = new MultiFieldQueryParser(
      new String[] {
          TopicLuceneDocumentGenerator.FIELD_TITLE,
          TopicLuceneDocumentGenerator.FIELD_LABEL,
          TopicLuceneDocumentGenerator.FIELD_TEXT
      },
      analyzer);
  queryParser.setDefaultOperator(QueryParser.Operator.OR);
  Query query = queryParser.parse(queryName);
  docs = ScoredDocuments.fromTopDocs(searcher.search(query, numHits), searcher);
  printTopicHits(docs, rankedHitFormat);

  LOG.info("Querying completed.");
}

/**
 * Prints every hit in {@code docs} to standard output, one {@code println} per hit.
 *
 * @param docs the scored documents to print, in rank order
 * @param hitFormat format string taking, in order: 1-based rank, score, topic mid, title, label
 */
private static void printTopicHits(ScoredDocuments docs, String hitFormat) {
  for (int i = 0; i < docs.documents.length; i++) {
    String resultDoc = String.format(
        hitFormat,
        (i + 1),
        docs.scores[i],
        docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TOPIC_MID).stringValue(),
        docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TITLE).stringValue(),
        docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_LABEL).stringValue());
    System.out.println(resultDoc);
  }
}
use of org.apache.lucene.analysis.core.SimpleAnalyzer in project Anserini by castorini.
the class EntityLinking method exactQuerySearch.
/**
 * Runs a phrase query for an entity name against the name field and returns the hits.
 *
 * <p>The name is escaped and wrapped in double quotes so that the query parser treats it as a
 * single phrase (with a slop of 3) rather than interpreting any query-syntax characters it
 * may contain.
 *
 * @param queryName the entity name to search
 * @param numHits the maximum number of hits to request from the searcher
 * @throws Exception on error
 * @return a list of top ranked entities, in the order the searcher returned them
 */
public List<RankedEntity> exactQuerySearch(String queryName, int numHits) throws Exception {
  List<RankedEntity> rankedEntities = new ArrayList<>();

  // Initialize index searcher
  IndexSearcher searcher = new IndexSearcher(reader);

  // do exact search on query name
  QueryParser queryParser = new QueryParser(IndexTopics.FIELD_NAME, new SimpleAnalyzer());
  queryParser.setAutoGeneratePhraseQueries(true);
  queryParser.setPhraseSlop(3);

  // Escape before quoting: an embedded '"' or '\' would otherwise end the phrase early or make
  // the parser throw. String.valueOf keeps the previous handling of a null name (the text "null").
  String phrase = "\"" + QueryParser.escape(String.valueOf(queryName)) + "\"";
  Query query = queryParser.parse(phrase);

  TopDocs rs = searcher.search(query, numHits);
  ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
  for (int i = 0; i < docs.documents.length; i++) {
    float score = docs.scores[i];
    String mid = docs.documents[i].getField(IndexTopics.FIELD_TOPIC_MID).stringValue();
    String shortMid = getShortMid(mid);
    String name = docs.documents[i].getField(IndexTopics.FIELD_NAME).stringValue();
    String label = docs.documents[i].getField(IndexTopics.FIELD_LABEL).stringValue();
    rankedEntities.add(new RankedEntity(shortMid, score, name, label));
  }
  return rankedEntities;
}
use of org.apache.lucene.analysis.core.SimpleAnalyzer in project Anserini by castorini.
the class IndexTopics method run.
/**
 * Builds the index at {@code indexPath} from the Freebase dump at {@code inputPath}.
 *
 * <p>The index is opened in CREATE mode. Each element streamed from {@code Freebase} is mapped
 * through a {@code TopicLuceneDocumentGenerator} and added to the writer; an {@link IOException}
 * on an individual document is logged and that document is skipped. The writer and the
 * directory are always closed, even when indexing fails part-way.
 *
 * @throws IOException if the directory or writer cannot be opened, or the commit fails
 * @throws InterruptedException declared for callers; not thrown by the code visible here
 */
public void run() throws IOException, InterruptedException {
  final long start = System.nanoTime();
  LOG.info("Starting indexer...");

  final SimpleAnalyzer analyzer = new SimpleAnalyzer();
  final IndexWriterConfig config = new IndexWriterConfig(analyzer);
  config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
  config.setCodec(new Lucene62Codec(Lucene50StoredFieldsFormat.Mode.BEST_SPEED));
  config.setUseCompoundFile(false);

  final AtomicInteger cnt = new AtomicInteger();
  int numIndexed;

  final Directory dir = FSDirectory.open(indexPath);
  try {
    final IndexWriter writer = new IndexWriter(dir, config);
    try {
      new Freebase(inputPath).stream().map(new TopicLuceneDocumentGenerator()).forEach(doc -> {
        try {
          writer.addDocument(doc);
          int cur = cnt.incrementAndGet();
          if (cur % 10000000 == 0) {
            LOG.info(cur + " nodes added.");
          }
        } catch (IOException e) {
          // Best effort: a document that fails to index is logged and skipped.
          LOG.error(e);
        }
      });
      LOG.info(cnt.get() + " nodes added.");
      numIndexed = writer.maxDoc();
      writer.commit();
    } finally {
      // Always release the writer (and its write lock), including when indexing throws.
      try {
        writer.close();
      } catch (IOException e) {
        LOG.error(e);
      }
    }
  } finally {
    // The directory was previously left open; close it once the writer is done with it.
    try {
      dir.close();
    } catch (IOException e) {
      LOG.error(e);
    }
  }

  long duration = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
  LOG.info("Total " + numIndexed + " documents indexed in " + DurationFormatUtils.formatDuration(duration, "HH:mm:ss"));
}
use of org.apache.lucene.analysis.core.SimpleAnalyzer in project Anserini by castorini.
the class LookupTopic method search.
/**
 * Searches the name, label and alias fields for the given query and prints each hit to the
 * standard output stream.
 *
 * <p>Query terms are combined with OR. For every hit the rank, score, topic mid, name, alias
 * and label are printed; the alias value appears under the {@code WIKI_TITLE} heading.
 *
 * @param queryName query topic name
 * @param numHits the maximum number of hits to request from the searcher
 * @throws Exception on error
 */
public void search(String queryName, int numHits) throws Exception {
  // Searcher over the already-opened index reader.
  final IndexSearcher indexSearcher = new IndexSearcher(reader);

  // One parser that spreads the query across all three fields.
  final String[] searchFields = { IndexTopics.FIELD_NAME, IndexTopics.FIELD_LABEL, IndexTopics.FIELD_ALIAS };
  final MultiFieldQueryParser parser = new MultiFieldQueryParser(searchFields, new SimpleAnalyzer());
  parser.setDefaultOperator(QueryParser.Operator.OR);

  final Query parsedQuery = parser.parse(queryName);
  final ScoredDocuments hits =
      ScoredDocuments.fromTopDocs(indexSearcher.search(parsedQuery, numHits), indexSearcher);

  final int hitCount = hits.documents.length;
  for (int rank = 1; rank <= hitCount; rank++) {
    final int idx = rank - 1;
    final String topicMid = hits.documents[idx].getField(IndexTopics.FIELD_TOPIC_MID).stringValue();
    final String objectName = hits.documents[idx].getField(IndexTopics.FIELD_NAME).stringValue();
    final String alias = hits.documents[idx].getField(IndexTopics.FIELD_ALIAS).stringValue();
    final String label = hits.documents[idx].getField(IndexTopics.FIELD_LABEL).stringValue();
    System.out.println(String.format(
        "%d - SCORE: %f\nTOPIC_MID: %s\nOBJECT_NAME: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n",
        rank, hits.scores[idx], topicMid, objectName, alias, label));
  }
}
use of org.apache.lucene.analysis.core.SimpleAnalyzer in project hmftools by hartwigmedical.
the class TreatmentCurator method createIndexSpellchecker.
/**
 * Builds an in-memory spell checker from the terms stored in the drug-terms field of the
 * given index.
 *
 * <p>The reader opened on {@code index} is only needed while the dictionary is being indexed,
 * so it is closed before this method returns. The returned spell checker is backed by its own
 * {@code RAMDirectory}.
 *
 * @param index directory holding the index whose terms seed the spell checker
 * @return a spell checker with its accuracy set to {@code SPELLCHECK_ACCURACY}
 * @throws IOException if the index cannot be read or the spell-check index cannot be written
 */
@NotNull
private static SpellChecker createIndexSpellchecker(@NotNull final Directory index) throws IOException {
    final Directory spellCheckerDirectory = new RAMDirectory();
    // try-with-resources: the source-index reader was previously never closed.
    try (IndexReader indexReader = DirectoryReader.open(index)) {
        final Analyzer analyzer = new SimpleAnalyzer();
        final IndexWriterConfig config = new IndexWriterConfig(analyzer);
        // Threshold 0.0f is passed as the dictionary's frequency cut-off.
        final Dictionary dictionary = new HighFrequencyDictionary(indexReader, DRUG_TERMS_FIELD, 0.0f);
        final SpellChecker spellChecker = new SpellChecker(spellCheckerDirectory);
        spellChecker.indexDictionary(dictionary, config, false);
        spellChecker.setAccuracy(SPELLCHECK_ACCURACY);
        return spellChecker;
    }
}
Aggregations