
Example 86 with Analyzer

Use of org.apache.lucene.analysis.Analyzer in project lucene-solr by apache.

The class TestCapitalizationFilter, method testRandomString.

/** blast some random strings through the analyzer */
public void testRandomString() throws Exception {
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
            return new TokenStreamComponents(tokenizer, new CapitalizationFilter(tokenizer));
        }
    };
    checkRandomData(random(), a, 1000 * RANDOM_MULTIPLIER);
    a.close();
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) Analyzer(org.apache.lucene.analysis.Analyzer) Tokenizer(org.apache.lucene.analysis.Tokenizer) MockTokenizer(org.apache.lucene.analysis.MockTokenizer) KeywordTokenizer(org.apache.lucene.analysis.core.KeywordTokenizer) CapitalizationFilter(org.apache.lucene.analysis.miscellaneous.CapitalizationFilter)
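
The same anonymous-Analyzer pattern works outside the test framework. Below is a minimal sketch, assuming a plain WhitespaceTokenizer in place of MockTokenizer and an arbitrary field name, that runs CapitalizationFilter over a fixed string and prints the resulting tokens:

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.CapitalizationFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class CapitalizationDemo {

    public static void main(String[] args) throws IOException {
        // Same pattern as the test, with a real WhitespaceTokenizer instead of MockTokenizer.
        Analyzer a = new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new WhitespaceTokenizer();
                return new TokenStreamComponents(tokenizer, new CapitalizationFilter(tokenizer));
            }
        };
        // "field" is an arbitrary field name; tokenStream() only uses it to select per-field components.
        try (TokenStream ts = a.tokenStream("field", "hello lucene world")) {
            CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                // Expected: Hello, Lucene, World (first letter of each token upper-cased)
                System.out.println(term.toString());
            }
            ts.end();
        }
        a.close();
    }
}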

Example 87 with Analyzer

Use of org.apache.lucene.analysis.Analyzer in project lucene-solr by apache.

The class TestSynonymGraphFilter, method testRandomSyns.

public void testRandomSyns() throws Exception {
    int synCount = atLeast(10);
    double bias = random().nextDouble();
    boolean dedup = random().nextBoolean();
    boolean flatten = random().nextBoolean();
    SynonymMap.Builder b = new SynonymMap.Builder(dedup);
    List<OneSyn> syns = new ArrayList<>();
    // Makes random syns from random a / b tokens, mapping to random x / y tokens
    if (VERBOSE) {
        System.out.println("TEST: make " + synCount + " syns");
        System.out.println("  bias for a over b=" + bias);
        System.out.println("  dedup=" + dedup);
        System.out.println("  flatten=" + flatten);
    }
    int maxSynLength = 0;
    for (int i = 0; i < synCount; i++) {
        OneSyn syn = new OneSyn();
        syn.in = randomBinaryChars(1, 5, bias, 'a');
        syn.out = randomBinaryChars(1, 5, 0.5, 'x');
        syn.keepOrig = random().nextBoolean();
        syns.add(syn);
        maxSynLength = Math.max(maxSynLength, syn.in.length);
        if (VERBOSE) {
            System.out.println("  " + syn);
        }
        add(b, toTokenString(syn.in), toTokenString(syn.out), syn.keepOrig);
    }
    // Compute max allowed lookahead for flatten filter:
    int maxFlattenLookahead = 0;
    if (flatten) {
        for (int i = 0; i < synCount; i++) {
            OneSyn syn1 = syns.get(i);
            int count = syn1.out.length;
            boolean keepOrig = syn1.keepOrig;
            for (int j = 0; j < synCount; j++) {
                OneSyn syn2 = syns.get(j);
                keepOrig |= syn2.keepOrig;
                if (syn1.in.equals(syn2.in)) {
                    count += syn2.out.length;
                }
            }
            if (keepOrig) {
                count += syn1.in.length;
            }
            maxFlattenLookahead = Math.max(maxFlattenLookahead, count);
        }
    }
    // Only used w/ VERBOSE:
    Analyzer aNoFlattened;
    if (VERBOSE) {
        aNoFlattened = getAnalyzer(b, true);
    } else {
        aNoFlattened = null;
    }
    Analyzer a;
    if (flatten) {
        a = getFlattenAnalyzer(b, true);
    } else {
        a = getAnalyzer(b, true);
    }
    int iters = atLeast(20);
    for (int iter = 0; iter < iters; iter++) {
        String doc = toTokenString(randomBinaryChars(50, 100, bias, 'a'));
        if (VERBOSE) {
            System.out.println("TEST: iter=" + iter + " doc=" + doc);
        }
        Automaton expected = slowSynFilter(doc, syns, flatten);
        if (VERBOSE) {
            System.out.println("  expected:\n" + expected.toDot());
            if (flatten) {
                Automaton unflattened = toAutomaton(aNoFlattened.tokenStream("field", new StringReader(doc)));
                System.out.println("  actual unflattened:\n" + unflattened.toDot());
            }
        }
        Automaton actual = toAutomaton(a.tokenStream("field", new StringReader(doc)));
        if (VERBOSE) {
            System.out.println("  actual:\n" + actual.toDot());
        }
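        // synFilter and flattenFilter are assumed to be fields on the test class, set when
        // getAnalyzer/getFlattenAnalyzer built the chain (not shown in this snippet):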
        assertTrue("maxLookaheadUsed=" + synFilter.getMaxLookaheadUsed() + " maxSynLength=" + maxSynLength, synFilter.getMaxLookaheadUsed() <= maxSynLength);
        if (flatten) {
            assertTrue("flatten maxLookaheadUsed=" + flattenFilter.getMaxLookaheadUsed() + " maxFlattenLookahead=" + maxFlattenLookahead, flattenFilter.getMaxLookaheadUsed() <= maxFlattenLookahead);
        }
        checkAnalysisConsistency(random(), a, random().nextBoolean(), doc);
        // output token that also happens to be in the input:
        try {
            actual = Operations.determinize(actual, 50000);
        } catch (TooComplexToDeterminizeException tctde) {
            // Unfortunately the syns can easily create difficult-to-determinize graphs:
            assertTrue(approxEquals(actual, expected));
            continue;
        }
        try {
            expected = Operations.determinize(expected, 50000);
        } catch (TooComplexToDeterminizeException tctde) {
            // Unfortunately the syns can easily create difficult-to-determinize graphs:
            assertTrue(approxEquals(actual, expected));
            continue;
        }
        assertTrue(approxEquals(actual, expected));
        assertTrue(Operations.sameLanguage(actual, expected));
    }
    a.close();
}
Also used : TooComplexToDeterminizeException(org.apache.lucene.util.automaton.TooComplexToDeterminizeException) Automaton(org.apache.lucene.util.automaton.Automaton) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) ArrayList(java.util.ArrayList) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) StringReader(java.io.StringReader)
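
The toAutomaton(...) helper used above belongs to the test class and is not shown here. One way to build a comparable token-graph automaton is Lucene's TokenStreamToAutomaton; the following is a minimal sketch under that assumption (the helper name, field name, and sample comparison are illustrative):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.TokenStreamToAutomaton;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.Operations;

public class TokenGraphCompare {

    // Build a token-graph automaton from the analyzed text; TokenStreamToAutomaton
    // resets and fully consumes the stream itself.
    static Automaton toAutomaton(Analyzer a, String field, String text) throws IOException {
        try (TokenStream ts = a.tokenStream(field, text)) {
            return new TokenStreamToAutomaton().toAutomaton(ts);
        }
    }

    // Compare the graphs two analyzers produce for the same text, mirroring the
    // determinize + sameLanguage check in the test above.
    static boolean sameGraph(Analyzer a1, Analyzer a2, String text) throws IOException {
        Automaton x = Operations.determinize(toAutomaton(a1, "field", text), 50000);
        Automaton y = Operations.determinize(toAutomaton(a2, "field", text), 50000);
        return Operations.sameLanguage(x, y);
    }
}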

Example 88 with Analyzer

Use of org.apache.lucene.analysis.Analyzer in project lucene-solr by apache.

The class TestSynonymGraphFilter, method solrSynsToAnalyzer.

private Analyzer solrSynsToAnalyzer(String syns) throws IOException, ParseException {
    Analyzer analyzer = new MockAnalyzer(random());
    SolrSynonymParser parser = new SolrSynonymParser(true, true, analyzer);
    parser.parse(new StringReader(syns));
    analyzer.close();
    return getFlattenAnalyzer(parser, true);
}
Also used : MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) StringReader(java.io.StringReader) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer)
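
SolrSynonymParser reads Solr-format rules ("a, b, c" for equivalences, "a => b c" for explicit mappings) and builds a SynonymMap. The sketch below shows the same parse step wired into a SynonymGraphFilter without the test helpers; the rule strings, the WhitespaceAnalyzer/WhitespaceTokenizer choice, and the ignoreCase flag are assumptions for illustration:

import java.io.StringReader;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceAnalyzer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SolrSynonymParser;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;

public class SolrSynonymDemo {

    static Analyzer buildAnalyzer() throws Exception {
        String rules = "wtf => what the fudge\ndns, domain name service";
        // dedup=true, expand=true; the analyzer argument tokenizes the rule text itself.
        Analyzer ruleAnalyzer = new WhitespaceAnalyzer();
        SolrSynonymParser parser = new SolrSynonymParser(true, true, ruleAnalyzer);
        parser.parse(new StringReader(rules));
        SynonymMap map = parser.build();
        ruleAnalyzer.close();
        return new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new WhitespaceTokenizer();
                TokenStream ts = new SynonymGraphFilter(tokenizer, map, true);
                return new TokenStreamComponents(tokenizer, ts);
            }
        };
    }
}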

Example 89 with Analyzer

Use of org.apache.lucene.analysis.Analyzer in project lucene-solr by apache.

The class TestSynonymGraphFilter, method testBasic2.

public void testBasic2() throws Exception {
    boolean keepOrig = true;
    do {
        keepOrig = !keepOrig;
        SynonymMap.Builder b = new SynonymMap.Builder(true);
        add(b, "aaa", "aaaa1 aaaa2 aaaa3", keepOrig);
        add(b, "bbb", "bbbb1 bbbb2", keepOrig);
        Analyzer a = getFlattenAnalyzer(b, true);
        if (keepOrig) {
            assertAnalyzesTo(a, "xyzzy bbb pot of gold", new String[] { "xyzzy", "bbbb1", "bbb", "bbbb2", "pot", "of", "gold" }, new int[] { 1, 1, 0, 1, 1, 1, 1 });
            assertAnalyzesTo(a, "xyzzy aaa pot of gold", new String[] { "xyzzy", "aaaa1", "aaa", "aaaa2", "aaaa2", "pot", "of", "gold" }, new int[] { 1, 1, 0, 1, 1, 1, 1, 1 });
        } else {
            assertAnalyzesTo(a, "xyzzy bbb pot of gold", new String[] { "xyzzy", "bbbb1", "bbbb2", "pot", "of", "gold" }, new int[] { 1, 1, 1, 1, 1, 1 });
            assertAnalyzesTo(a, "xyzzy aaa pot of gold", new String[] { "xyzzy", "aaaa1", "aaaa2", "aaaa3", "pot", "of", "gold" }, new int[] { 1, 1, 1, 1, 1, 1, 1 });
        }
    } while (keepOrig);
}
Also used : CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer)
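
The add(b, input, output, keepOrig) calls above go through a test helper. In terms of the public API they are assumed to correspond roughly to SynonymMap.Builder.add, with multi-token outputs joined via SynonymMap.Builder.join; a sketch under that assumption:

import java.io.IOException;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.CharsRefBuilder;

public class SynonymMapSketch {

    // Roughly what add(b, "aaa", "aaaa1 aaaa2 aaaa3", keepOrig) is assumed to do under the hood:
    static SynonymMap build(boolean keepOrig) throws IOException {
        // true = dedup duplicate rules
        SynonymMap.Builder b = new SynonymMap.Builder(true);
        CharsRefBuilder scratch = new CharsRefBuilder();
        CharsRef output = SynonymMap.Builder.join(new String[] { "aaaa1", "aaaa2", "aaaa3" }, scratch);
        // keepOrig = true emits the original token ("aaa") alongside the synonym tokens
        b.add(new CharsRef("aaa"), output, keepOrig);
        return b.build();
    }
}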

Example 90 with Analyzer

Use of org.apache.lucene.analysis.Analyzer in project lucene-solr by apache.

The class TestSynonymGraphFilter, method testFlattenedGraph.

/** If we expand synonyms during indexing, it's a bit better than
   *  SynonymFilter is today, but still necessarily has false
   *  positive and negative PhraseQuery matches because we do not  
   *  index posLength, so we lose information. */
public void testFlattenedGraph() throws Exception {
    SynonymMap.Builder b = new SynonymMap.Builder();
    add(b, "wtf", "what the fudge", true);
    Analyzer a = getFlattenAnalyzer(b, true);
    assertAnalyzesTo(a, "wtf happened", new String[] { "what", "wtf", "the", "fudge", "happened" }, new int[] { 0, 0, 0, 0, 4 }, new int[] { 3, 3, 3, 3, 12 }, null, new int[] { 1, 0, 1, 1, 1 }, new int[] { 1, 3, 1, 1, 1 }, true);
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir, a);
    Document doc = new Document();
    doc.add(newTextField("field", "wtf happened", Field.Store.NO));
    w.addDocument(doc);
    IndexReader r = w.getReader();
    w.close();
    IndexSearcher s = newSearcher(r);
    // Good (this should not match, and doesn't):
    assertEquals(0, s.count(new PhraseQuery("field", "what", "happened")));
    // Bad (this should match, but doesn't):
    assertEquals(0, s.count(new PhraseQuery("field", "wtf", "happened")));
    // Good (this should match, and does):
    assertEquals(1, s.count(new PhraseQuery("field", "what", "the", "fudge", "happened")));
    // Bad (this should not match, but does):
    assertEquals(1, s.count(new PhraseQuery("field", "wtf", "the")));
    IOUtils.close(r, dir);
}
Also used : IndexSearcher(org.apache.lucene.search.IndexSearcher) PhraseQuery(org.apache.lucene.search.PhraseQuery) CharsRefBuilder(org.apache.lucene.util.CharsRefBuilder) IntsRefBuilder(org.apache.lucene.util.IntsRefBuilder) IndexReader(org.apache.lucene.index.IndexReader) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) Document(org.apache.lucene.document.Document) RandomIndexWriter(org.apache.lucene.index.RandomIndexWriter) Directory(org.apache.lucene.store.Directory)
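
getFlattenAnalyzer(b, true) is also a test helper. It is assumed to chain a SynonymGraphFilter with a FlattenGraphFilter, which is what makes the graph indexable at the cost of the posLength information discussed in the Javadoc above. A sketch under that assumption (the tokenizer choice is illustrative):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.FlattenGraphFilter;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymGraphFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;

public class FlattenAnalyzerSketch {

    static Analyzer flattenAnalyzer(SynonymMap.Builder b, boolean ignoreCase) throws IOException {
        SynonymMap map = b.build();
        return new Analyzer() {

            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new WhitespaceTokenizer();
                TokenStream ts = new SynonymGraphFilter(tokenizer, map, ignoreCase);
                // FlattenGraphFilter squashes posLength back to 1 so the stream can be indexed.
                ts = new FlattenGraphFilter(ts);
                return new TokenStreamComponents(tokenizer, ts);
            }
        };
    }
}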

Aggregations

Analyzer (org.apache.lucene.analysis.Analyzer): 1020 usages
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 396 usages
Tokenizer (org.apache.lucene.analysis.Tokenizer): 265 usages
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 228 usages
Document (org.apache.lucene.document.Document): 207 usages
Directory (org.apache.lucene.store.Directory): 192 usages
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 176 usages
BytesRef (org.apache.lucene.util.BytesRef): 122 usages
Test (org.junit.Test): 119 usages
TokenStream (org.apache.lucene.analysis.TokenStream): 107 usages
RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter): 92 usages
Term (org.apache.lucene.index.Term): 92 usages
IndexReader (org.apache.lucene.index.IndexReader): 67 usages
InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator): 65 usages
StandardAnalyzer (org.apache.lucene.analysis.standard.StandardAnalyzer): 64 usages
Input (org.apache.lucene.search.suggest.Input): 63 usages
CharArraySet (org.apache.lucene.analysis.CharArraySet): 58 usages
ArrayList (java.util.ArrayList): 57 usages
IndexWriterConfig (org.apache.lucene.index.IndexWriterConfig): 57 usages
TextField (org.apache.lucene.document.TextField): 55 usages