Search in sources :

Example 11 with SimpleAnalyzer

use of org.apache.lucene.analysis.core.SimpleAnalyzer in project camel by apache.

the class LuceneIndexAndQueryProducerTest method createRegistry.

/**
 * Builds the JNDI registry used by this test.
 * <p>
 * Three index directories ("std", "simple", "whitespace") are bound together
 * with an analyzer for each ("stdAnalyzer", "simpleAnalyzer",
 * "whitespaceAnalyzer"), plus the "load_dir" directory pointing at the test
 * source documents.
 *
 * @return the populated registry
 * @throws Exception if the underlying JNDI context cannot be created
 */
@Override
protected JndiRegistry createRegistry() throws Exception {
    final JndiRegistry jndi = new JndiRegistry(createJndiContext());

    /* standard analyzer: index directory, source directory, analyzer */
    final File stdIndexDir = new File("target/stdindexDir");
    final File sourceDir = new File("src/test/resources/sources");
    jndi.bind("std", stdIndexDir);
    jndi.bind("load_dir", sourceDir);
    jndi.bind("stdAnalyzer", new StandardAnalyzer());

    /* simple analyzer: index directory and analyzer */
    final File simpleIndexDir = new File("target/simpleindexDir");
    jndi.bind("simple", simpleIndexDir);
    jndi.bind("simpleAnalyzer", new SimpleAnalyzer());

    /* whitespace analyzer: index directory and analyzer */
    final File whitespaceIndexDir = new File("target/whitespaceindexDir");
    jndi.bind("whitespace", whitespaceIndexDir);
    jndi.bind("whitespaceAnalyzer", new WhitespaceAnalyzer());

    return jndi;
}
Also used : JndiRegistry(org.apache.camel.impl.JndiRegistry) WhitespaceAnalyzer(org.apache.lucene.analysis.core.WhitespaceAnalyzer) SimpleAnalyzer(org.apache.lucene.analysis.core.SimpleAnalyzer) StandardAnalyzer(org.apache.lucene.analysis.standard.StandardAnalyzer) File(java.io.File)

Example 12 with SimpleAnalyzer

use of org.apache.lucene.analysis.core.SimpleAnalyzer in project Anserini by castorini.

the class EntityLinking method search.

/**
 * Returns a ranked list of entities matching the given name.
 * <p>
 * The lookup runs in up to two passes. The first pass is a phrase search
 * (slop 3) on the name field. If it yields fewer than {@code numHits}
 * documents, a second pass switches the searcher to
 * {@link ClassicSimilarity} (TF-IDF) and searches the name and label fields
 * with {@code AND} as the default operator, asking only for the number of
 * hits still missing. Results of the second pass are appended after those of
 * the first.
 *
 * @param queryName the entity name to search
 * @param numHits the maximum number of entities to return
 * @throws Exception on error
 * @return a list of top ranked entities
 */
public List<RankedEntity> search(String queryName, int numHits) throws Exception {
    List<RankedEntity> rankedEntities = new ArrayList<>();
    // Initialize index searcher
    IndexSearcher searcher = new IndexSearcher(reader);
    // do exact search on query name
    QueryParser queryParser = new QueryParser(IndexTopics.FIELD_NAME, new SimpleAnalyzer());
    queryParser.setAutoGeneratePhraseQueries(true);
    queryParser.setPhraseSlop(3);
    // NOTE(review): queryName is quoted but not escaped, so a name that itself
    // contains a double quote is handed to the parser as-is - confirm whether
    // callers sanitize their input.
    queryName = "\"" + queryName + "\"";
    Query query = queryParser.parse(queryName);
    TopDocs rs = searcher.search(query, numHits);
    ScoredDocuments docs = ScoredDocuments.fromTopDocs(rs, searcher);
    appendRankedEntities(docs, rankedEntities);
    if (docs.documents.length >= numHits) {
        return rankedEntities;
    }
    int numHitsLeft = numHits - docs.documents.length;
    // do TFIDF search (the already quoted queryName is reused here)
    Similarity similarity = new ClassicSimilarity();
    searcher.setSimilarity(similarity);
    queryParser = new MultiFieldQueryParser(new String[] { IndexTopics.FIELD_NAME, IndexTopics.FIELD_LABEL }, new SimpleAnalyzer());
    queryParser.setDefaultOperator(QueryParser.Operator.AND);
    query = queryParser.parse(queryName);
    rs = searcher.search(query, numHitsLeft);
    docs = ScoredDocuments.fromTopDocs(rs, searcher);
    // NOTE(review): documents already returned by the first pass are not
    // filtered out here - confirm whether duplicates are acceptable.
    appendRankedEntities(docs, rankedEntities);
    return rankedEntities;
}

/**
 * Converts every scored document into a {@code RankedEntity} and appends it
 * to {@code target}, preserving the order of {@code docs}.
 *
 * @param docs the scored documents of one search pass
 * @param target the list receiving the converted entities
 * @throws Exception declared to mirror {@code search}, because the exceptions
 *         declared by {@code getShortMid} are not visible from here
 */
private void appendRankedEntities(ScoredDocuments docs, List<RankedEntity> target) throws Exception {
    for (int i = 0; i < docs.documents.length; i++) {
        float score = docs.scores[i];
        String mid = docs.documents[i].getField(IndexTopics.FIELD_TOPIC_MID).stringValue();
        String shortMid = getShortMid(mid);
        String name = docs.documents[i].getField(IndexTopics.FIELD_NAME).stringValue();
        String label = docs.documents[i].getField(IndexTopics.FIELD_LABEL).stringValue();
        target.add(new RankedEntity(shortMid, score, name, label));
    }
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity) Query(org.apache.lucene.search.Query) TermQuery(org.apache.lucene.search.TermQuery) Similarity(org.apache.lucene.search.similarities.Similarity) ClassicSimilarity(org.apache.lucene.search.similarities.ClassicSimilarity) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) SimpleAnalyzer(org.apache.lucene.analysis.core.SimpleAnalyzer) ArrayList(java.util.ArrayList) ScoredDocuments(io.anserini.rerank.ScoredDocuments) TopDocs(org.apache.lucene.search.TopDocs) MultiFieldQueryParser(org.apache.lucene.queryparser.classic.MultiFieldQueryParser) QueryParser(org.apache.lucene.queryparser.classic.QueryParser)

Example 13 with SimpleAnalyzer

use of org.apache.lucene.analysis.core.SimpleAnalyzer in project lucene-skos by behas.

the class URIbasedTermExpansionTest method uriBasedTermExpansion.

/**
 * Checks URI-based SKOS term expansion.
 * <p>
 * One metadata record (a Lucene document) with "title", "description" and
 * "subject" fields is indexed; its subject is a URI pointing to the SKOS
 * concept "weapons". Searching for "arms" must find the record because
 * "arms" is an alternative label (altLabel) of that concept, and searching
 * the subject for the broader concept "military equipment" must find it too.
 *
 * @throws IOException if indexing or searching fails
 */
@Test
public void uriBasedTermExpansion() throws IOException {
    /* the record under test: only the title uses the stored field type */
    String descriptionText = "Roman iron spearhead. The spearhead was attached to one end of a wooden shaft..." + "The spear was mainly a thrusting weapon, but could also be thrown. " + "It was the principal weapon of the auxiliary soldier... " + "(second - fourth century, Arbeia Roman Fort).";
    Document record = new Document();
    record.add(new Field("title", "Spearhead", TextField.TYPE_STORED));
    record.add(new Field("description", descriptionText, TextField.TYPE_NOT_STORED));
    record.add(new Field("subject", "http://www.ukat.org.uk/thesaurus/concept/859", TextField.TYPE_NOT_STORED));

    /* SKOS analyzer in URI mode (the analyzed field contains URIs), registered for "subject" only */
    Map<String, Analyzer> fieldAnalyzers = new HashMap<>();
    fieldAnalyzers.put("subject", new SKOSAnalyzer("build/", "src/test/resources/skos_samples/ukat_examples.n3", ExpansionType.URI));
    /* SimpleAnalyzer is the default for the remaining fields */
    PerFieldAnalyzerWrapper perFieldAnalyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer(), fieldAnalyzers);

    /* index the record in memory and open a searcher on the writer */
    writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(perFieldAnalyzer));
    writer.addDocument(record);
    searcher = new IndexSearcher(DirectoryReader.open(writer, false));

    /* "arms" in any of the three fields: matches through the expanded subject terms */
    BooleanQuery.Builder armsInAnyField = new BooleanQuery.Builder();
    for (String fieldName : new String[] { "title", "description", "subject" }) {
        armsInAnyField.add(new TermQuery(new Term(fieldName, "arms")), BooleanClause.Occur.SHOULD);
    }
    TopDocs armsHits = searcher.search(armsInAnyField.build(), 10);
    assertEquals(1, armsHits.totalHits);

    /* the broader concept also returns the record */
    Query broaderConcept = new TermQuery(new Term("subject", "military equipment"));
    TopDocs broaderHits = searcher.search(broaderConcept, 10);
    assertEquals(1, broaderHits.totalHits);
}
Also used : HashMap(java.util.HashMap) SimpleAnalyzer(org.apache.lucene.analysis.core.SimpleAnalyzer) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) Analyzer(org.apache.lucene.analysis.Analyzer) SimpleAnalyzer(org.apache.lucene.analysis.core.SimpleAnalyzer) SKOSAnalyzer(at.ac.univie.mminf.luceneSKOS.analysis.SKOSAnalyzer) RAMDirectory(org.apache.lucene.store.RAMDirectory) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) IndexWriter(org.apache.lucene.index.IndexWriter) SKOSAnalyzer(at.ac.univie.mminf.luceneSKOS.analysis.SKOSAnalyzer) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) Test(org.junit.Test)

Example 14 with SimpleAnalyzer

use of org.apache.lucene.analysis.core.SimpleAnalyzer in project lucene-skos by behas.

the class SKOSLabelFilterTest method testTermQuery.

/**
 * Indexes a single document whose "content" field holds
 * "I work for the united nations" and checks that the query
 * "united nations", parsed against that field by a
 * {@code StandardQueryParser} backed by a {@code SimpleAnalyzer},
 * yields exactly one hit.
 *
 * @throws IOException if indexing or searching fails
 * @throws QueryNodeException if the query text cannot be parsed
 */
@Test
public void testTermQuery() throws IOException, QueryNodeException {
    Field content = new Field("content", "I work for the united nations", TextField.TYPE_STORED);
    Document record = new Document();
    record.add(content);
    writer.addDocument(record);

    searcher = new IndexSearcher(DirectoryReader.open(writer, false));

    Query parsed = new StandardQueryParser(new SimpleAnalyzer()).parse("united nations", "content");
    assertEquals(1, searcher.search(parsed, 1).totalHits);
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) Query(org.apache.lucene.search.Query) PhraseQuery(org.apache.lucene.search.PhraseQuery) TermQuery(org.apache.lucene.search.TermQuery) SimpleAnalyzer(org.apache.lucene.analysis.core.SimpleAnalyzer) Document(org.apache.lucene.document.Document) StandardQueryParser(org.apache.lucene.queryparser.flexible.standard.StandardQueryParser) Test(org.junit.Test)

Example 15 with SimpleAnalyzer

use of org.apache.lucene.analysis.core.SimpleAnalyzer in project lucene-skos by behas.

the class LabelbasedTermExpansionTest method labelBasedTermExpansion.

/**
 * Checks label-based SKOS term expansion.
 * <p>
 * One metadata record (a Lucene document) with "title", "description" and
 * "subject" fields is indexed; its subject contains the term "weapons".
 * Searching for "arms" must find the record because "arms" is defined as an
 * alternative label for "weapons", and searching the subject for the broader
 * concept "military equipment" must find it too.
 *
 * @throws IOException if indexing or searching fails
 */
@Test
public void labelBasedTermExpansion() throws IOException {
    /* the record under test: only the title uses the stored field type */
    String descriptionText = "Roman iron spearhead. The spearhead was attached to one end of a wooden shaft..." + "The spear was mainly a thrusting weapon, but could also be thrown. " + "It was the principal weapon of the auxiliary soldier... " + "(second - fourth century, Arbeia Roman Fort).";
    Document record = new Document();
    record.add(new Field("title", "Spearhead", TextField.TYPE_STORED));
    record.add(new Field("description", descriptionText, TextField.TYPE_NOT_STORED));
    record.add(new Field("subject", "weapons", TextField.TYPE_NOT_STORED));

    /* SKOS analyzer in LABEL mode (the analyzed field contains labels), registered for "subject" only */
    Map<String, Analyzer> fieldAnalyzers = new HashMap<>();
    fieldAnalyzers.put("subject", new SKOSAnalyzer("build/", "src/test/resources/skos_samples/ukat_examples.n3", ExpansionType.LABEL));
    /* SimpleAnalyzer is the default for the remaining fields */
    PerFieldAnalyzerWrapper perFieldAnalyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer(), fieldAnalyzers);

    /* index the record in memory and open a searcher on the writer */
    writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(perFieldAnalyzer));
    writer.addDocument(record);
    searcher = new IndexSearcher(DirectoryReader.open(writer, false));

    /* "arms" in any of the three fields: matches through the expanded subject terms */
    BooleanQuery.Builder armsInAnyField = new BooleanQuery.Builder();
    for (String fieldName : new String[] { "title", "description", "subject" }) {
        armsInAnyField.add(new TermQuery(new Term(fieldName, "arms")), BooleanClause.Occur.SHOULD);
    }
    TopDocs armsHits = searcher.search(armsInAnyField.build(), 10);
    assertEquals(1, armsHits.totalHits);

    /* the broader concept also returns the record */
    Query broaderConcept = new TermQuery(new Term("subject", "military equipment"));
    TopDocs broaderHits = searcher.search(broaderConcept, 10);
    assertEquals(1, broaderHits.totalHits);
}
Also used : HashMap(java.util.HashMap) SimpleAnalyzer(org.apache.lucene.analysis.core.SimpleAnalyzer) Term(org.apache.lucene.index.Term) Document(org.apache.lucene.document.Document) Analyzer(org.apache.lucene.analysis.Analyzer) SimpleAnalyzer(org.apache.lucene.analysis.core.SimpleAnalyzer) SKOSAnalyzer(at.ac.univie.mminf.luceneSKOS.analysis.SKOSAnalyzer) RAMDirectory(org.apache.lucene.store.RAMDirectory) PerFieldAnalyzerWrapper(org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper) Field(org.apache.lucene.document.Field) TextField(org.apache.lucene.document.TextField) IndexWriter(org.apache.lucene.index.IndexWriter) SKOSAnalyzer(at.ac.univie.mminf.luceneSKOS.analysis.SKOSAnalyzer) IndexWriterConfig(org.apache.lucene.index.IndexWriterConfig) Test(org.junit.Test)

Aggregations

SimpleAnalyzer (org.apache.lucene.analysis.core.SimpleAnalyzer): 15; IndexSearcher (org.apache.lucene.search.IndexSearcher): 7; Analyzer (org.apache.lucene.analysis.Analyzer): 6; Document (org.apache.lucene.document.Document): 6; TopDocs (org.apache.lucene.search.TopDocs): 6; Field (org.apache.lucene.document.Field): 5; IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 5; Query (org.apache.lucene.search.Query): 5; RAMDirectory (org.apache.lucene.store.RAMDirectory): 5; ScoredDocuments (io.anserini.rerank.ScoredDocuments): 4; ArrayList (java.util.ArrayList): 4; TextField (org.apache.lucene.document.TextField): 4; IndexWriter (org.apache.lucene.index.IndexWriter): 4; MultiFieldQueryParser (org.apache.lucene.queryparser.classic.MultiFieldQueryParser): 4; QueryParser (org.apache.lucene.queryparser.classic.QueryParser): 4; Test (org.junit.Test): 4; HashMap (java.util.HashMap): 3; WhitespaceAnalyzer (org.apache.lucene.analysis.core.WhitespaceAnalyzer): 3; TermQuery (org.apache.lucene.search.TermQuery): 3; SKOSAnalyzer (at.ac.univie.mminf.luceneSKOS.analysis.SKOSAnalyzer): 2