
Example 6 with SimpleAnalyzer

Use of org.apache.lucene.analysis.core.SimpleAnalyzer in project lucene-skos by behas.

From class URIbasedTermExpansionTest, method uriBasedTermExpansion:

/**
     * This test indexes a sample metadata record (=lucene document) having a
     * "title", "description", and "subject" field, which is semantically
     * enriched by a URI pointing to a SKOS concept "weapons".
     * <p/>
     * A search for "arms" returns that record as a result because "arms" is
     * defined as an alternative label (altLabel) for the concept "weapons".
     *
     * @throws IOException
     */
@Test
public void uriBasedTermExpansion() throws IOException {
    /* defining the document to be indexed */
    Document doc = new Document();
    doc.add(new Field("title", "Spearhead", TextField.TYPE_STORED));
    doc.add(new Field("description", "Roman iron spearhead. The spearhead was attached to one end of a wooden shaft..." + "The spear was mainly a thrusting weapon, but could also be thrown. " + "It was the principal weapon of the auxiliary soldier... " + "(second - fourth century, Arbeia Roman Fort).", TextField.TYPE_NOT_STORED));
    doc.add(new Field("subject", "http://www.ukat.org.uk/thesaurus/concept/859", TextField.TYPE_NOT_STORED));
    /* setting up the SKOS analyzer */
    String skosFile = "src/test/resources/skos_samples/ukat_examples.n3";
    String indexPath = "build/";
    /* ExpansionType.URI->the field to be analyzed (expanded) contains URIs */
    Analyzer skosAnalyzer = new SKOSAnalyzer(indexPath, skosFile, ExpansionType.URI);
    /* Define different analyzers for different fields */
    Map<String, Analyzer> analyzerPerField = new HashMap<>();
    analyzerPerField.put("subject", skosAnalyzer);
    PerFieldAnalyzerWrapper indexAnalyzer = new PerFieldAnalyzerWrapper(new SimpleAnalyzer(), analyzerPerField);
    /* setting up a writer with a default (simple) analyzer */
    writer = new IndexWriter(new RAMDirectory(), new IndexWriterConfig(indexAnalyzer));
    /* adding the document to the index */
    writer.addDocument(doc);
    /* defining a query that searches over all fields */
    BooleanQuery.Builder builder = new BooleanQuery.Builder();
    builder.add(new TermQuery(new Term("title", "arms")), BooleanClause.Occur.SHOULD).add(new TermQuery(new Term("description", "arms")), BooleanClause.Occur.SHOULD).add(new TermQuery(new Term("subject", "arms")), BooleanClause.Occur.SHOULD);
    /* creating a new searcher */
    searcher = new IndexSearcher(DirectoryReader.open(writer, false));
    TopDocs results = searcher.search(builder.build(), 10);
    /* the document matches because "arms" is among the expanded terms */
    assertEquals(1, results.totalHits);
    /* defining a query that searches for a broader concept */
    Query query = new TermQuery(new Term("subject", "military equipment"));
    results = searcher.search(query, 10);
    /* ... also returns the document as result */
    assertEquals(1, results.totalHits);
}
Also used: HashMap (java.util.HashMap), SimpleAnalyzer (org.apache.lucene.analysis.core.SimpleAnalyzer), Term (org.apache.lucene.index.Term), Document (org.apache.lucene.document.Document), Analyzer (org.apache.lucene.analysis.Analyzer), SKOSAnalyzer (at.ac.univie.mminf.luceneSKOS.analysis.SKOSAnalyzer), RAMDirectory (org.apache.lucene.store.RAMDirectory), PerFieldAnalyzerWrapper (org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper), Field (org.apache.lucene.document.Field), TextField (org.apache.lucene.document.TextField), IndexWriter (org.apache.lucene.index.IndexWriter), IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig), Test (org.junit.Test)
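
Note that RAMDirectory, used here and in several examples below, was deprecated in later Lucene releases and removed in Lucene 9. A minimal sketch of the same writer/searcher setup against a newer Lucene, assuming Lucene 8+ where ByteBuffersDirectory is the replacement (doc and indexAnalyzer are the objects built in the test above):

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;

// Heap-resident directory; the successor to the deprecated RAMDirectory.
Directory dir = new ByteBuffersDirectory();
IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(indexAnalyzer));
writer.addDocument(doc);
// Near-real-time reader that sees the writer's uncommitted changes.
IndexSearcher searcher = new IndexSearcher(DirectoryReader.open(writer));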

Example 7 with SimpleAnalyzer

Use of org.apache.lucene.analysis.core.SimpleAnalyzer in project camel by apache.

From class LuceneIndexAndQueryProducerTest, method createRegistry:

@Override
protected JndiRegistry createRegistry() throws Exception {
    JndiRegistry registry = new JndiRegistry(createJndiContext());
    registry.bind("std", new File("target/stdindexDir"));
    registry.bind("load_dir", new File("src/test/resources/sources"));
    registry.bind("stdAnalyzer", new StandardAnalyzer());
    registry.bind("simple", new File("target/simpleindexDir"));
    registry.bind("simpleAnalyzer", new SimpleAnalyzer());
    registry.bind("whitespace", new File("target/whitespaceindexDir"));
    registry.bind("whitespaceAnalyzer", new WhitespaceAnalyzer());
    return registry;
}
Also used: JndiRegistry (org.apache.camel.impl.JndiRegistry), WhitespaceAnalyzer (org.apache.lucene.analysis.core.WhitespaceAnalyzer), SimpleAnalyzer (org.apache.lucene.analysis.core.SimpleAnalyzer), StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer), File (java.io.File)
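
These registry beans are referenced by name from the Camel Lucene endpoint URIs (lucene:searcherName:insert and lucene:searcherName:query, with options such as analyzer, indexDir, srcDir, and maxHits). The test's routes are not shown in this excerpt; a hedged sketch of what they might look like, with illustrative route and index names:

import org.apache.camel.builder.RouteBuilder;

public class LuceneRoutes extends RouteBuilder {
    @Override
    public void configure() {
        // "#simpleAnalyzer", "#simple" and "#load_dir" resolve against the registry bindings above.
        from("direct:load")
            .to("lucene:simpleIndex:insert?analyzer=#simpleAnalyzer&indexDir=#simple&srcDir=#load_dir");
        from("direct:search")
            .to("lucene:simpleIndex:query?analyzer=#simpleAnalyzer&indexDir=#simple&maxHits=20");
    }
}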

Example 8 with SimpleAnalyzer

Use of org.apache.lucene.analysis.core.SimpleAnalyzer in project lucene-solr by apache.

From class TestSort, method testSort:

public void testSort() throws Exception {
    Directory dir = new RAMDirectory();
    Field f = new StringField("f", "0", Field.Store.NO);
    Field f2 = new StringField("f2", "0", Field.Store.NO);
    for (int iterCnt = 0; iterCnt < iter; iterCnt++) {
        IndexWriter iw = new IndexWriter(dir, new IndexWriterConfig(new SimpleAnalyzer()).setOpenMode(IndexWriterConfig.OpenMode.CREATE));
        final MyDoc[] mydocs = new MyDoc[ndocs];
        int v1EmptyPercent = 50;
        int v2EmptyPercent = 50;
        int commitCountdown = commitCount;
        for (int i = 0; i < ndocs; i++) {
            MyDoc mydoc = new MyDoc();
            mydoc.doc = i;
            mydocs[i] = mydoc;
            Document document = new Document();
            if (r.nextInt(100) < v1EmptyPercent) {
                mydoc.val = Integer.toString(r.nextInt(maxval));
                f.setStringValue(mydoc.val);
                document.add(f);
            }
            if (r.nextInt(100) < v2EmptyPercent) {
                mydoc.val2 = Integer.toString(r.nextInt(maxval));
                f2.setStringValue(mydoc.val2);
                document.add(f2);
            }
            iw.addDocument(document);
            if (--commitCountdown <= 0) {
                commitCountdown = commitCount;
                iw.commit();
            }
        }
        iw.close();
        Map<String, UninvertingReader.Type> mapping = new HashMap<>();
        mapping.put("f", UninvertingReader.Type.SORTED);
        mapping.put("f2", UninvertingReader.Type.SORTED);
        DirectoryReader reader = UninvertingReader.wrap(DirectoryReader.open(dir), mapping);
        IndexSearcher searcher = new IndexSearcher(reader);
        // System.out.println("segments="+searcher.getIndexReader().getSequentialSubReaders().length);
        assertTrue(reader.leaves().size() > 1);
        for (int i = 0; i < qiter; i++) {
            Filter filt = new Filter() {

                @Override
                public DocIdSet getDocIdSet(LeafReaderContext context, Bits acceptDocs) {
                    return BitsFilteredDocIdSet.wrap(randSet(context.reader().maxDoc()), acceptDocs);
                }

                @Override
                public String toString(String field) {
                    return "TestSortFilter";
                }

                @Override
                public boolean equals(Object other) {
                    return other == this;
                }

                @Override
                public int hashCode() {
                    return System.identityHashCode(this);
                }
            };
            // number of top hits to collect: a random value in [1, ndocs/8 + 1]
            int top = r.nextInt((ndocs >> 3) + 1) + 1;
            final boolean luceneSort = r.nextBoolean();
            // when not using default lucene sort, pick exactly one of missing-first / missing-last
            final boolean sortMissingLast = !luceneSort && r.nextBoolean();
            final boolean sortMissingFirst = !luceneSort && !sortMissingLast;
            final boolean reverse = r.nextBoolean();
            List<SortField> sfields = new ArrayList<>();
            final boolean secondary = r.nextBoolean();
            final boolean luceneSort2 = r.nextBoolean();
            final boolean sortMissingLast2 = !luceneSort2 && r.nextBoolean();
            final boolean sortMissingFirst2 = !luceneSort2 && !sortMissingLast2;
            final boolean reverse2 = r.nextBoolean();
            if (r.nextBoolean())
                sfields.add(new SortField(null, SortField.Type.SCORE));
            // hit both use-cases of sort-missing-last
            sfields.add(Sorting.getStringSortField("f", reverse, sortMissingLast, sortMissingFirst));
            if (secondary) {
                sfields.add(Sorting.getStringSortField("f2", reverse2, sortMissingLast2, sortMissingFirst2));
            }
            if (r.nextBoolean())
                sfields.add(new SortField(null, SortField.Type.SCORE));
            Sort sort = new Sort(sfields.toArray(new SortField[sfields.size()]));
            // stand-in for a missing value in the verification sort below:
            // "" collates before any indexed value, "zzz" after any numeric string
            final String nullRep = (luceneSort || (sortMissingFirst && !reverse) || (sortMissingLast && reverse)) ? "" : "zzz";
            final String nullRep2 = (luceneSort2 || (sortMissingFirst2 && !reverse2) || (sortMissingLast2 && reverse2)) ? "" : "zzz";
            boolean trackScores = r.nextBoolean();
            boolean trackMaxScores = r.nextBoolean();
            boolean scoreInOrder = r.nextBoolean();
            final TopFieldCollector topCollector = TopFieldCollector.create(sort, top, true, trackScores, trackMaxScores);
            final List<MyDoc> collectedDocs = new ArrayList<>();
            // delegate and collect docs ourselves
            Collector myCollector = new FilterCollector(topCollector) {

                @Override
                public LeafCollector getLeafCollector(LeafReaderContext context) throws IOException {
                    final int docBase = context.docBase;
                    return new FilterLeafCollector(super.getLeafCollector(context)) {

                        @Override
                        public void collect(int doc) throws IOException {
                            super.collect(doc);
                            collectedDocs.add(mydocs[docBase + doc]);
                        }
                    };
                }
            };
            searcher.search(filt, myCollector);
            Collections.sort(collectedDocs, (o1, o2) -> {
                String v1 = o1.val == null ? nullRep : o1.val;
                String v2 = o2.val == null ? nullRep : o2.val;
                int cmp = v1.compareTo(v2);
                if (reverse)
                    cmp = -cmp;
                if (cmp != 0)
                    return cmp;
                if (secondary) {
                    v1 = o1.val2 == null ? nullRep2 : o1.val2;
                    v2 = o2.val2 == null ? nullRep2 : o2.val2;
                    cmp = v1.compareTo(v2);
                    if (reverse2)
                        cmp = -cmp;
                }
                cmp = cmp == 0 ? o1.doc - o2.doc : cmp;
                return cmp;
            });
            TopDocs topDocs = topCollector.topDocs();
            ScoreDoc[] sdocs = topDocs.scoreDocs;
            for (int j = 0; j < sdocs.length; j++) {
                int id = sdocs[j].doc;
                if (id != collectedDocs.get(j).doc) {
                    log.error("Error at pos " + j + "\n\tsortMissingFirst=" + sortMissingFirst + " sortMissingLast=" + sortMissingLast + " reverse=" + reverse + "\n\tEXPECTED=" + collectedDocs);
                }
                assertEquals(id, collectedDocs.get(j).doc);
            }
        }
        reader.close();
    }
    dir.close();
}
Also used: IndexSearcher (org.apache.lucene.search.IndexSearcher), SimpleAnalyzer (org.apache.lucene.analysis.core.SimpleAnalyzer), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), SortField (org.apache.lucene.search.SortField), Document (org.apache.lucene.document.Document), ScoreDoc (org.apache.lucene.search.ScoreDoc), TopDocs (org.apache.lucene.search.TopDocs), StringField (org.apache.lucene.document.StringField), SchemaField (org.apache.solr.schema.SchemaField), Field (org.apache.lucene.document.Field), LeafCollector (org.apache.lucene.search.LeafCollector), FilterLeafCollector (org.apache.lucene.search.FilterLeafCollector), FilterCollector (org.apache.lucene.search.FilterCollector), Collector (org.apache.lucene.search.Collector), TopFieldCollector (org.apache.lucene.search.TopFieldCollector), LeafReaderContext (org.apache.lucene.index.LeafReaderContext), Sort (org.apache.lucene.search.Sort), RAMDirectory (org.apache.lucene.store.RAMDirectory), Directory (org.apache.lucene.store.Directory), DirectoryReader (org.apache.lucene.index.DirectoryReader), Type (org.apache.lucene.search.SortField.Type), IndexWriter (org.apache.lucene.index.IndexWriter), Bits (org.apache.lucene.util.Bits), IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig)
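
Sorting.getStringSortField is Solr's thin helper over a plain Lucene SortField: the missing-first/missing-last flags map onto SortField.setMissingValue. A sketch of the equivalent logic (matching the Solr helper in spirit; see org.apache.solr.search.Sorting for the authoritative version):

import org.apache.lucene.search.SortField;

static SortField stringSortField(String field, boolean reverse,
                                 boolean missingLast, boolean missingFirst) {
    SortField sf = new SortField(field, SortField.Type.STRING, reverse);
    // Lucene's default treats a missing value as lowest, i.e. first on a
    // forward sort and last on a reverse sort; only the other two cases
    // need an explicit hint.
    if ((missingLast && !reverse) || (missingFirst && reverse)) {
        sf.setMissingValue(SortField.STRING_LAST);
    }
    return sf;
}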

Example 9 with SimpleAnalyzer

Use of org.apache.lucene.analysis.core.SimpleAnalyzer in project lucene-solr by apache.

From class TestPerFieldAnalyzerWrapper, method testPerField:

public void testPerField() throws Exception {
    String text = "Qwerty";
    Map<String, Analyzer> analyzerPerField = Collections.<String, Analyzer>singletonMap("special", new SimpleAnalyzer());
    Analyzer defaultAnalyzer = new WhitespaceAnalyzer();
    PerFieldAnalyzerWrapper analyzer = new PerFieldAnalyzerWrapper(defaultAnalyzer, analyzerPerField);
    try (TokenStream tokenStream = analyzer.tokenStream("field", text)) {
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        assertTrue(tokenStream.incrementToken());
        assertEquals("WhitespaceAnalyzer does not lowercase", "Qwerty", termAtt.toString());
        assertFalse(tokenStream.incrementToken());
        tokenStream.end();
    }
    try (TokenStream tokenStream = analyzer.tokenStream("special", text)) {
        CharTermAttribute termAtt = tokenStream.getAttribute(CharTermAttribute.class);
        tokenStream.reset();
        assertTrue(tokenStream.incrementToken());
        assertEquals("SimpleAnalyzer lowercases", "qwerty", termAtt.toString());
        assertFalse(tokenStream.incrementToken());
        tokenStream.end();
    }
    // TODO: fix this about PFAW, this is crazy
    analyzer.close();
    defaultAnalyzer.close();
    IOUtils.close(analyzerPerField.values());
}
Also used: WhitespaceAnalyzer (org.apache.lucene.analysis.core.WhitespaceAnalyzer), TokenStream (org.apache.lucene.analysis.TokenStream), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), SimpleAnalyzer (org.apache.lucene.analysis.core.SimpleAnalyzer), Analyzer (org.apache.lucene.analysis.Analyzer)
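
The reset()/incrementToken()/end() sequence used twice above is the standard TokenStream consumption contract. A small reusable helper in the same vein (a hypothetical utility, not part of the test):

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

static List<String> tokens(Analyzer analyzer, String field, String text) throws IOException {
    List<String> result = new ArrayList<>();
    try (TokenStream ts = analyzer.tokenStream(field, text)) {
        CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
        ts.reset();                       // required before the first incrementToken()
        while (ts.incrementToken()) {
            result.add(term.toString());
        }
        ts.end();                         // records end-of-stream state (e.g. final offset)
    }
    return result;
}

With the wrapper configured above, tokens(analyzer, "special", "Qwerty") would return ["qwerty"], while tokens(analyzer, "field", "Qwerty") would return ["Qwerty"].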

Example 10 with SimpleAnalyzer

Use of org.apache.lucene.analysis.core.SimpleAnalyzer in project lucene-solr by apache.

From class TestPerFieldAnalyzerWrapper, method testReuseWrapped:

public void testReuseWrapped() throws Exception {
    final String text = "Qwerty";
    final Analyzer specialAnalyzer = new SimpleAnalyzer();
    final Analyzer defaultAnalyzer = new WhitespaceAnalyzer();
    TokenStream ts1, ts2, ts3, ts4;
    final PerFieldAnalyzerWrapper wrapper1 = new PerFieldAnalyzerWrapper(defaultAnalyzer, Collections.<String, Analyzer>singletonMap("special", specialAnalyzer));
    // test that the PerFieldWrapper returns the same instance as original Analyzer:
    ts1 = defaultAnalyzer.tokenStream("something", text);
    ts2 = wrapper1.tokenStream("something", text);
    assertSame(ts1, ts2);
    ts1 = specialAnalyzer.tokenStream("special", text);
    ts2 = wrapper1.tokenStream("special", text);
    assertSame(ts1, ts2);
    // Wrap with another wrapper, which does *not* extend DelegatingAnalyzerWrapper:
    final AnalyzerWrapper wrapper2 = new AnalyzerWrapper(wrapper1.getReuseStrategy()) {

        @Override
        protected Analyzer getWrappedAnalyzer(String fieldName) {
            return wrapper1;
        }

        @Override
        protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
            assertNotSame(specialAnalyzer.tokenStream("special", text), components.getTokenStream());
            TokenFilter filter = new ASCIIFoldingFilter(components.getTokenStream());
            return new TokenStreamComponents(components.getTokenizer(), filter);
        }
    };
    ts3 = wrapper2.tokenStream("special", text);
    assertNotSame(ts1, ts3);
    assertTrue(ts3 instanceof ASCIIFoldingFilter);
    // check that cache did not get corrupted:
    ts2 = wrapper1.tokenStream("special", text);
    assertSame(ts1, ts2);
    // Wrap PerField with another PerField. In that case all TokenStreams returned must be the same:
    final PerFieldAnalyzerWrapper wrapper3 = new PerFieldAnalyzerWrapper(wrapper1, Collections.<String, Analyzer>singletonMap("moreSpecial", specialAnalyzer));
    ts1 = specialAnalyzer.tokenStream("special", text);
    ts2 = wrapper3.tokenStream("special", text);
    assertSame(ts1, ts2);
    ts3 = specialAnalyzer.tokenStream("moreSpecial", text);
    ts4 = wrapper3.tokenStream("moreSpecial", text);
    assertSame(ts3, ts4);
    assertSame(ts2, ts3);
    IOUtils.close(wrapper3, wrapper2, wrapper1, specialAnalyzer, defaultAnalyzer);
}
Also used: WhitespaceAnalyzer (org.apache.lucene.analysis.core.WhitespaceAnalyzer), TokenStream (org.apache.lucene.analysis.TokenStream), SimpleAnalyzer (org.apache.lucene.analysis.core.SimpleAnalyzer), AnalyzerWrapper (org.apache.lucene.analysis.AnalyzerWrapper), Analyzer (org.apache.lucene.analysis.Analyzer), TokenFilter (org.apache.lucene.analysis.TokenFilter)
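
The instance-reuse guarantees exercised here come from PerFieldAnalyzerWrapper extending DelegatingAnalyzerWrapper, which preserves the wrapped analyzers' per-field TokenStream caching, whereas the anonymous plain AnalyzerWrapper above deliberately does not. A minimal sketch of a custom delegating wrapper (the field-routing rule is purely illustrative):

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.DelegatingAnalyzerWrapper;

final class RoutingAnalyzer extends DelegatingAnalyzerWrapper {
    private final Analyzer defaultAnalyzer;
    private final Analyzer idAnalyzer;

    RoutingAnalyzer(Analyzer defaultAnalyzer, Analyzer idAnalyzer) {
        super(PER_FIELD_REUSE_STRATEGY);  // cache one TokenStream per field
        this.defaultAnalyzer = defaultAnalyzer;
        this.idAnalyzer = idAnalyzer;
    }

    @Override
    protected Analyzer getWrappedAnalyzer(String fieldName) {
        // Illustrative routing rule: *_id fields keep a different analysis chain.
        return fieldName.endsWith("_id") ? idAnalyzer : defaultAnalyzer;
    }
}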

Aggregations

SimpleAnalyzer (org.apache.lucene.analysis.core.SimpleAnalyzer): 10
Document (org.apache.lucene.document.Document): 5
Field (org.apache.lucene.document.Field): 5
Analyzer (org.apache.lucene.analysis.Analyzer): 4
TextField (org.apache.lucene.document.TextField): 4
IndexWriter (org.apache.lucene.index.IndexWriter): 4
IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 4
RAMDirectory (org.apache.lucene.store.RAMDirectory): 4
Test (org.junit.Test): 4
HashMap (java.util.HashMap): 3
WhitespaceAnalyzer (org.apache.lucene.analysis.core.WhitespaceAnalyzer): 3
IndexSearcher (org.apache.lucene.search.IndexSearcher): 3
SKOSAnalyzer (at.ac.univie.mminf.luceneSKOS.analysis.SKOSAnalyzer): 2
TokenStream (org.apache.lucene.analysis.TokenStream): 2
PerFieldAnalyzerWrapper (org.apache.lucene.analysis.miscellaneous.PerFieldAnalyzerWrapper): 2
Term (org.apache.lucene.index.Term): 2
Query (org.apache.lucene.search.Query): 2
TopDocs (org.apache.lucene.search.TopDocs): 2
ScoredDocuments (io.anserini.rerank.ScoredDocuments): 1
File (java.io.File): 1