Search in sources :

Example 1 with SimpleAnalyzer

use of org.apache.lucene.analysis.core.SimpleAnalyzer in project Anserini by castorini.

the class LookupTopic method search.

/**
 * Prints query results to the standard output stream.
 *
 * <p>The lookup runs in up to three stages and stops after the first stage that returns
 * at least one hit:
 * <ol>
 *   <li>a query parsed against the title field,</li>
 *   <li>a query parsed against the label field,</li>
 *   <li>a BM25-scored query across the title, label and text fields.</li>
 * </ol>
 * NOTE(review): the console messages call stages 1 and 2 "exact" matches, but both are
 * ordinary analyzed {@link QueryParser} queries - confirm whether strict equality is intended.
 *
 * @param queryName the entity name to search
 * @throws Exception on error
 */
public void search(String queryName) throws Exception {
    LOG.info("Querying started...");
    // Initialize index searcher
    IndexSearcher searcher = new IndexSearcher(reader);
    SimpleAnalyzer analyzer = new SimpleAnalyzer();
    int numHits = 20;
    // The first two stages leave an extra blank line after each hit; the final stage does not.
    final String spacedFormat = "%d - SCORE: %f\nTOPIC_MID: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n\n";
    final String compactFormat = "%d - SCORE: %f\nTOPIC_MID: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n";

    // Stage 1: look the name up in the title field
    QueryParser titleParser = new QueryParser(TopicLuceneDocumentGenerator.FIELD_TITLE, analyzer);
    Query titleQuery = titleParser.parse(queryName);
    ScoredDocuments docs = ScoredDocuments.fromTopDocs(searcher.search(titleQuery, numHits), searcher);
    printResults(docs, spacedFormat);
    if (docs.documents.length != 0) {
        System.out.println("Exact WIKI_TITLE found! Ending search.");
        return;
    }
    System.out.println("Exact WIKI_TITLE not found. Searching for the label...");
    System.out.println();

    // Stage 2: look the name up in the label field
    QueryParser labelParser = new QueryParser(TopicLuceneDocumentGenerator.FIELD_LABEL, analyzer);
    Query labelQuery = labelParser.parse(queryName);
    docs = ScoredDocuments.fromTopDocs(searcher.search(labelQuery, numHits), searcher);
    printResults(docs, spacedFormat);
    if (docs.documents.length != 0) {
        System.out.println("Exact W3_LABEL found! Ending search.");
        return;
    }
    System.out.println("Exact W3_LABEL not found. Ranking the topics using BM25 according the text/title/label...");
    System.out.println();

    // Stage 3: rank across title, label and text with explicitly configured BM25 parameters
    float k1 = 1.5f;
    float b = 0.75f;
    Similarity similarity = new BM25Similarity(k1, b);
    searcher.setSimilarity(similarity);
    MultiFieldQueryParser queryParser = new MultiFieldQueryParser(new String[] { TopicLuceneDocumentGenerator.FIELD_TITLE, TopicLuceneDocumentGenerator.FIELD_LABEL, TopicLuceneDocumentGenerator.FIELD_TEXT }, analyzer);
    queryParser.setDefaultOperator(QueryParser.Operator.OR);
    Query query = queryParser.parse(queryName);
    docs = ScoredDocuments.fromTopDocs(searcher.search(query, numHits), searcher);
    printResults(docs, compactFormat);
    // Only reached when neither of the first two stages produced a hit.
    LOG.info("Querying completed.");
}

/**
 * Prints one entry per hit (1-based rank, score, topic mid, title and label) to standard output.
 *
 * @param docs the hits to print, in rank order
 * @param format a {@link String#format} pattern taking rank, score, mid, title and label
 */
private static void printResults(ScoredDocuments docs, String format) {
    for (int i = 0; i < docs.documents.length; i++) {
        String resultDoc = String.format(format,
            (i + 1),
            docs.scores[i],
            docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TOPIC_MID).stringValue(),
            docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_TITLE).stringValue(),
            docs.documents[i].getField(TopicLuceneDocumentGenerator.FIELD_LABEL).stringValue());
        System.out.println(resultDoc);
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) TopDocs(org.apache.lucene.search.TopDocs) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) Query(org.apache.lucene.search.Query) Similarity(org.apache.lucene.search.similarities.Similarity) BM25Similarity(org.apache.lucene.search.similarities.BM25Similarity) SimpleAnalyzer(org.apache.lucene.analysis.core.SimpleAnalyzer) ScoredDocuments(io.anserini.rerank.ScoredDocuments)

Example 2 with SimpleAnalyzer

use of org.apache.lucene.analysis.core.SimpleAnalyzer in project Anserini by castorini.

the class EntityLinking method exactQuerySearch.

/**
 * Returns a list of query results.
 *
 * <p>The entity name is searched as a quoted phrase (with a phrase slop of 3) against the
 * name field. The name is escaped before it is quoted, so query-syntax characters in it,
 * such as {@code "} or {@code \}, are treated as literal text rather than ending the phrase
 * early or causing a parse error.
 *
 * @param queryName the entity name to search
 * @param numHits the maximum number of hits to request from the searcher
 * @throws Exception on error
 * @return a list of top ranked entities
 */
public List<RankedEntity> exactQuerySearch(String queryName, int numHits) throws Exception {
    List<RankedEntity> rankedEntities = new ArrayList<>();
    // Initialize index searcher
    IndexSearcher searcher = new IndexSearcher(reader);
    // do exact search on query name
    QueryParser queryParser = new QueryParser(IndexTopics.FIELD_NAME, new SimpleAnalyzer());
    queryParser.setAutoGeneratePhraseQueries(true);
    queryParser.setPhraseSlop(3);
    // Escape first, then quote: an embedded quote or backslash must not break the phrase.
    String phrase = "\"" + QueryParser.escape(queryName) + "\"";
    Query query = queryParser.parse(phrase);
    TopDocs rs = searcher.search(query, numHits);
    ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
    for (int i = 0; i < docs.documents.length; i++) {
        float score = docs.scores[i];
        String mid = docs.documents[i].getField(IndexTopics.FIELD_TOPIC_MID).stringValue();
        String shortMid = getShortMid(mid);
        String name = docs.documents[i].getField(IndexTopics.FIELD_NAME).stringValue();
        String label = docs.documents[i].getField(IndexTopics.FIELD_LABEL).stringValue();
        rankedEntities.add(new RankedEntity(shortMid, score, name, label));
    }
    return rankedEntities;
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) TopDocs(org.apache.lucene.search.TopDocs) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) QueryParser(org.apache.lucene.queryparser.classic.QueryParser) Query(org.apache.lucene.search.Query) TermQuery(org.apache.lucene.search.TermQuery) SimpleAnalyzer(org.apache.lucene.analysis.core.SimpleAnalyzer) ArrayList(java.util.ArrayList) ScoredDocuments(io.anserini.rerank.ScoredDocuments)

Example 3 with SimpleAnalyzer

use of org.apache.lucene.analysis.core.SimpleAnalyzer in project Anserini by castorini.

the class IndexTopics method run.

/**
 * Builds the index at {@code indexPath} from the Freebase input at {@code inputPath}.
 *
 * <p>Every node produced by the input stream is converted by a
 * {@link TopicLuceneDocumentGenerator} and added to a writer opened in
 * {@code OpenMode.CREATE}. A document that fails with an {@link IOException} is logged and
 * skipped so that one bad record does not abort the whole run. The writer is closed even
 * when indexing fails part-way, so it is not left open on failure.
 *
 * @throws IOException if the index directory or writer cannot be opened, or the commit fails
 * @throws InterruptedException declared for callers; nothing in this method visibly throws it
 */
public void run() throws IOException, InterruptedException {
    final long start = System.nanoTime();
    LOG.info("Starting indexer...");
    final Directory dir = FSDirectory.open(indexPath);
    final SimpleAnalyzer analyzer = new SimpleAnalyzer();
    final IndexWriterConfig config = new IndexWriterConfig(analyzer);
    config.setOpenMode(IndexWriterConfig.OpenMode.CREATE);
    config.setCodec(new Lucene62Codec(Lucene50StoredFieldsFormat.Mode.BEST_SPEED));
    config.setUseCompoundFile(false);
    final IndexWriter writer = new IndexWriter(dir, config);
    final AtomicInteger cnt = new AtomicInteger();
    final int numIndexed;
    try {
        new Freebase(inputPath).stream().map(new TopicLuceneDocumentGenerator()).forEach(doc -> {
            try {
                writer.addDocument(doc);
                int cur = cnt.incrementAndGet();
                if (cur % 10000000 == 0) {
                    // Log the value that was just tested rather than re-reading the counter.
                    LOG.info(cur + " nodes added.");
                }
            } catch (IOException e) {
                // Best effort: record the failure and keep indexing the remaining nodes.
                LOG.error(e);
            }
        });
        LOG.info(cnt.get() + " nodes added.");
        // Must be read while the writer is still open.
        numIndexed = writer.maxDoc();
        writer.commit();
    } finally {
        try {
            writer.close();
        } catch (IOException e) {
            LOG.error(e);
        }
    }
    long duration = TimeUnit.MILLISECONDS.convert(System.nanoTime() - start, TimeUnit.NANOSECONDS);
    LOG.info("Total " + numIndexed + " documents indexed in " + DurationFormatUtils.formatDuration(duration, "HH:mm:ss"));
}
Also used : SimpleAnalyzer(org.apache.lucene.analysis.core.SimpleAnalyzer) IndexWriter(org.apache.lucene.index.IndexWriter) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) IOException(java.io.IOException) Lucene62Codec(org.apache.lucene.codecs.lucene62.Lucene62Codec) Directory(org.apache.lucene.store.Directory) FSDirectory(org.apache.lucene.store.FSDirectory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig)

Example 4 with SimpleAnalyzer

use of org.apache.lucene.analysis.core.SimpleAnalyzer in project Anserini by castorini.

the class LookupTopic method search.

/**
 * Searches the name, label and alias fields for the given query and prints every hit
 * (rank, score, topic mid and the three stored fields) to the standard output stream.
 *
 * @param queryName query text, parsed by a {@link MultiFieldQueryParser} with OR as the default operator
 * @param numHits the maximum number of hits to request from the searcher
 * @throws Exception on error
 */
public void search(String queryName, int numHits) throws Exception {
    final IndexSearcher indexSearcher = new IndexSearcher(reader);

    // One parser covering all three searchable fields; terms are OR-ed together.
    final String[] searchedFields = { IndexTopics.FIELD_NAME, IndexTopics.FIELD_LABEL, IndexTopics.FIELD_ALIAS };
    final MultiFieldQueryParser parser = new MultiFieldQueryParser(searchedFields, new SimpleAnalyzer());
    parser.setDefaultOperator(QueryParser.Operator.OR);
    final Query parsedQuery = parser.parse(queryName);

    final TopDocs topDocs = indexSearcher.search(parsedQuery, numHits);
    final ScoredDocuments hits = ScoredDocuments.fromTopDocs(topDocs, indexSearcher);

    final int hitCount = hits.documents.length;
    for (int rank = 1; rank <= hitCount; rank++) {
        final int idx = rank - 1;
        final float score = hits.scores[idx];
        final String topicMid = hits.documents[idx].getField(IndexTopics.FIELD_TOPIC_MID).stringValue();
        final String objectName = hits.documents[idx].getField(IndexTopics.FIELD_NAME).stringValue();
        final String alias = hits.documents[idx].getField(IndexTopics.FIELD_ALIAS).stringValue();
        final String label = hits.documents[idx].getField(IndexTopics.FIELD_LABEL).stringValue();
        // NOTE(review): the alias value is printed under the "WIKI_TITLE" heading - confirm the heading is intended.
        System.out.println(String.format("%d - SCORE: %f\nTOPIC_MID: %s\nOBJECT_NAME: %s\nWIKI_TITLE: %s\nW3_LABEL: %s\n", rank, score, topicMid, objectName, alias, label));
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) TopDocs(org.apache.lucene.search.TopDocs) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) Query(org.apache.lucene.search.Query) SimpleAnalyzer(org.apache.lucene.analysis.core.SimpleAnalyzer) ScoredDocuments(io.anserini.rerank.ScoredDocuments)

Example 5 with SimpleAnalyzer

use of org.apache.lucene.analysis.core.SimpleAnalyzer in project hmftools by hartwigmedical.

the class TreatmentCurator method createIndexSpellchecker.

/**
 * Builds a spell checker, backed by a fresh {@link RAMDirectory}, from the terms stored in
 * the {@code DRUG_TERMS_FIELD} field of the given index.
 *
 * <p>The reader over {@code index} is only needed while the dictionary is copied into the
 * spell checker's own directory, so it is closed before this method returns. The
 * {@code index} directory itself is left open for the caller.
 *
 * @param index the directory holding the index whose terms seed the spell checker
 * @return a spell checker with its accuracy set to {@code SPELLCHECK_ACCURACY}
 * @throws IOException if the index cannot be read or the dictionary cannot be indexed
 */
@NotNull
private static SpellChecker createIndexSpellchecker(@NotNull final Directory index) throws IOException {
    final Directory spellCheckerDirectory = new RAMDirectory();
    // try-with-resources: the reader was previously never closed.
    try (IndexReader indexReader = DirectoryReader.open(index)) {
        final Analyzer analyzer = new SimpleAnalyzer();
        final IndexWriterConfig config = new IndexWriterConfig(analyzer);
        // A threshold of 0.0f is presumably meant to admit every term regardless of frequency - confirm against the Lucene docs.
        final Dictionary dictionary = new HighFrequencyDictionary(indexReader, DRUG_TERMS_FIELD, 0.0f);
        final SpellChecker spellChecker = new SpellChecker(spellCheckerDirectory);
        spellChecker.indexDictionary(dictionary, config, false);
        spellChecker.setAccuracy(SPELLCHECK_ACCURACY);
        return spellChecker;
    }
}
Also used : Dictionary(org.apache.lucene.search.spell.Dictionary) HighFrequencyDictionary(org.apache.lucene.search.spell.HighFrequencyDictionary) SimpleAnalyzer(org.apache.lucene.analysis.core.SimpleAnalyzer) IndexReader(org.apache.lucene.index.IndexReader) SpellChecker(org.apache.lucene.search.spell.SpellChecker) Analyzer(org.apache.lucene.analysis.Analyzer) RAMDirectory(org.apache.lucene.store.RAMDirectory) Directory(org.apache.lucene.store.Directory) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) NotNull(org.jetbrains.annotations.NotNull)

Aggregations

SimpleAnalyzer (org.apache.lucene.analysis.core.SimpleAnalyzer): 15; IndexSearcher (org.apache.lucene.search.IndexSearcher): 7; Analyzer (org.apache.lucene.analysis.Analyzer): 6; Document (org.apache.lucene.document.Document): 6; TopDocs (org.apache.lucene.search.TopDocs): 6; Field (org.apache.lucene.document.Field): 5; IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 5; Query (org.apache.lucene.search.Query): 5; RAMDirectory (org.apache.lucene.store.RAMDirectory): 5; ScoredDocuments (io.anserini.rerank.ScoredDocuments): 4; ArrayList (java.util.ArrayList): 4; TextField (org.apache.lucene.document.TextField): 4; IndexWriter (org.apache.lucene.index.IndexWriter): 4; MultiFieldQueryParser (org.apache.lucene.queryparser.classic.MultiFieldQueryParser): 4; QueryParser (org.apache.lucene.queryparser.classic.QueryParser): 4; Test (org.junit.Test): 4; HashMap (java.util.HashMap): 3; WhitespaceAnalyzer (org.apache.lucene.analysis.core.WhitespaceAnalyzer): 3; TermQuery (org.apache.lucene.search.TermQuery): 3; SKOSAnalyzer (at.ac.univie.mminf.luceneSKOS.analysis.SKOSAnalyzer): 2