Example 36 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project sukija by ahomansikka.

The class AppTest, method test:

private boolean test(String input, String expectedOutput) throws IOException {
    Reader r = new StringReader(input);
    TokenStream t = new HVTokenizer();
    ((Tokenizer) t).setReader(r);
    t = new VoikkoFilter(t, voikko);
    VoikkoAttribute sukijaAtt = t.addAttribute(VoikkoAttribute.class);
    CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
    t.reset();
    while (t.incrementToken()) {
        System.out.println("AppTest " + termAtt.toString());
        // Print the base form of every Voikko analysis of this token.
        for (int i = 0; i < sukijaAtt.getAnalysis().size(); i++) {
            System.out.println(sukijaAtt.getAnalysis(i).get("BASEFORM"));
            // VoikkoUtils.printAnalysisResult(sukijaAtt.getAnalysis(i), System.out);
        }
        System.out.println("");
    }
    t.end();
    t.close();
    // Note: expectedOutput is never compared in this excerpt; the method
    // only prints the analyses and always returns true.
    return true;
}
Also used: HVTokenizer (peltomaa.sukija.finnish.HVTokenizer), TokenStream (org.apache.lucene.analysis.TokenStream), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), VoikkoFilter (peltomaa.sukija.voikko.VoikkoFilter), StringReader (java.io.StringReader), Reader (java.io.Reader), VoikkoAttribute (peltomaa.sukija.attributes.VoikkoAttribute), Tokenizer (org.apache.lucene.analysis.Tokenizer)
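
For comparison, here is a minimal sketch (assuming the same HVTokenizer/VoikkoFilter chain and voikko handle as in the excerpt) of the standard Lucene consume cycle written with try-with-resources, which runs end() and close() even if analysis throws:

Tokenizer tok = new HVTokenizer();
tok.setReader(new StringReader(input));
try (TokenStream ts = new VoikkoFilter(tok, voikko)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(termAtt.toString());
    }
    ts.end();  // records final state after the last token
}  // close() releases the underlying Reader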

Example 37 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.

The class AnalyzingSuggesterTest, method testDupSurfaceFormsMissingResults2:

public void testDupSurfaceFormsMissingResults2() throws Exception {
    Analyzer a = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            return new TokenStreamComponents(tokenizer) {

                int count;

                @Override
                public TokenStream getTokenStream() {
                    if (count == 0) {
                        // First call: "p" then "q", with "r" and "s"
                        // stacked at the same position as "q" (posInc 0).
                        count++;
                        return new CannedTokenStream(new Token[] { token("p", 1, 1), token("q", 1, 1), token("r", 0, 1), token("s", 0, 1) });
                    } else {
                        // Later calls: only "p".
                        return new CannedTokenStream(new Token[] { token("p", 1, 1) });
                    }
                }

                @Override
                protected void setReader(final Reader reader) {
                }
            };
        }
    };
    Directory tempDir = getDirectory();
    AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", a, a, 0, 256, -1, true);
    suggester.build(new InputArrayIterator(new Input[] { new Input("a", 6), new Input("b", 5) }));
    List<LookupResult> results = suggester.lookup("a", false, 2);
    assertEquals(2, results.size());
    assertEquals("a", results.get(0).key);
    assertEquals(6, results.get(0).value);
    assertEquals("b", results.get(1).key);
    assertEquals(5, results.get(1).value);
    // Try again after save/load:
    Path tmpDir = createTempDir("AnalyzingSuggesterTest");
    Path path = tmpDir.resolve("suggester");
    OutputStream os = Files.newOutputStream(path);
    suggester.store(os);
    os.close();
    InputStream is = Files.newInputStream(path);
    suggester.load(is);
    is.close();
    results = suggester.lookup("a", false, 2);
    assertEquals(2, results.size());
    assertEquals("a", results.get(0).key);
    assertEquals(6, results.get(0).value);
    assertEquals("b", results.get(1).key);
    assertEquals(5, results.get(1).value);
    IOUtils.close(a, tempDir);
}
Also used: Path (java.nio.file.Path), InputStream (java.io.InputStream), OutputStream (java.io.OutputStream), Reader (java.io.Reader), Analyzer (org.apache.lucene.analysis.Analyzer), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), MockTokenizer (org.apache.lucene.analysis.MockTokenizer), Input (org.apache.lucene.search.suggest.Input), InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator), LookupResult (org.apache.lucene.search.suggest.Lookup.LookupResult), CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), Tokenizer (org.apache.lucene.analysis.Tokenizer), Directory (org.apache.lucene.store.Directory)
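
The token(...) helper used in this and the following tests is not shown in the excerpt. In Lucene's suggester tests it is a small factory along these lines (a sketch; the three arguments are assumed to be term, position increment, and position length):

private static Token token(String term, int posInc, int posLength) {
    final Token t = new Token(term, 0, 0);  // offsets unused here, so 0/0
    t.setPositionIncrement(posInc);         // 0 stacks the token on the previous position
    t.setPositionLength(posLength);         // how many positions the token spans
    return t;
}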

Example 38 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.

The class FuzzySuggesterTest, method testInputPathRequired:

public void testInputPathRequired() throws Exception {
    //  SynonymMap.Builder b = new SynonymMap.Builder(false);
    //  b.add(new CharsRef("ab"), new CharsRef("ba"), true);
    //  final SynonymMap map = b.build();
    //  The Analyzer below mimics the functionality of the SynonymAnalyzer
    //  using the above map, so that the suggest module does not need a dependency on the 
    //  synonym module 
    final Analyzer analyzer = new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            return new TokenStreamComponents(tokenizer) {

                int tokenStreamCounter = 0;

                final TokenStream[] tokenStreams = new TokenStream[] {
                    new CannedTokenStream(new Token[] { token("ab", 1, 1), token("ba", 0, 1), token("xc", 1, 1) }),
                    new CannedTokenStream(new Token[] { token("ba", 1, 1), token("xd", 1, 1) }),
                    new CannedTokenStream(new Token[] { token("ab", 1, 1), token("ba", 0, 1), token("x", 1, 1) })
                };

                @Override
                public TokenStream getTokenStream() {
                    TokenStream result = tokenStreams[tokenStreamCounter];
                    tokenStreamCounter++;
                    return result;
                }

                @Override
                protected void setReader(final Reader reader) {
                }
            };
        }
    };
    Input[] keys = new Input[] { new Input("ab xc", 50), new Input("ba xd", 50) };
    Directory tempDir = getDirectory();
    FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy", analyzer);
    suggester.build(new InputArrayIterator(keys));
    List<LookupResult> results = suggester.lookup("ab x", false, 1);
    assertEquals(1, results.size());
    IOUtils.close(analyzer, tempDir);
}
Also used: CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), TokenStream (org.apache.lucene.analysis.TokenStream), Reader (java.io.Reader), Token (org.apache.lucene.analysis.Token), Analyzer (org.apache.lucene.analysis.Analyzer), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), MockTokenizer (org.apache.lucene.analysis.MockTokenizer), Input (org.apache.lucene.search.suggest.Input), InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator), LookupResult (org.apache.lucene.search.suggest.Lookup.LookupResult), Tokenizer (org.apache.lucene.analysis.Tokenizer), Directory (org.apache.lucene.store.Directory)
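
The commented-out lines at the top of the test show the synonym setup this canned analyzer stands in for. A hedged reconstruction using the analyzers-common synonym module (not part of the test; SynonymGraphFilter is one possible filter choice, with SynonymMap and CharsRef from org.apache.lucene.analysis.synonym and org.apache.lucene.util):

static Analyzer synonymAnalyzer() throws IOException {
    // ab -> ba with keepOrig=true, so both tokens are emitted at the same position.
    SynonymMap.Builder b = new SynonymMap.Builder(false);
    b.add(new CharsRef("ab"), new CharsRef("ba"), true);
    final SynonymMap map = b.build();
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new MockTokenizer(MockTokenizer.SIMPLE, true);
            return new TokenStreamComponents(tokenizer, new SynonymGraphFilter(tokenizer, map, true));
        }
    };
}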

Example 39 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.

The class TestFreeTextSuggester, method testEndingHole:

// With one ending hole, ShingleFilter produces "of _" and
// we should properly predict from that:
public void testEndingHole() throws Exception {
    // Just deletes "of"
    Analyzer a = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String field) {
            Tokenizer tokenizer = new MockTokenizer();
            CharArraySet stopSet = StopFilter.makeStopSet("of");
            return new TokenStreamComponents(tokenizer, new StopFilter(tokenizer, stopSet));
        }
    };
    Iterable<Input> keys = AnalyzingSuggesterTest.shuffle(new Input("wizard of oz", 50));
    FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte) 0x20);
    sug.build(new InputArrayIterator(keys));
    assertEquals("wizard _ oz/1.00", toString(sug.lookup("wizard of", 10)));
    // Falls back to the unigram model, with backoff 0.4 times
    // prob 0.5 (0.4 * 0.5 = 0.20):
    assertEquals("oz/0.20", toString(sug.lookup("wizard o", 10)));
    a.close();
}
Also used: MockTokenizer (org.apache.lucene.analysis.MockTokenizer), CharArraySet (org.apache.lucene.analysis.CharArraySet), Input (org.apache.lucene.search.suggest.Input), InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator), StopFilter (org.apache.lucene.analysis.StopFilter), Analyzer (org.apache.lucene.analysis.Analyzer), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), Tokenizer (org.apache.lucene.analysis.Tokenizer)
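
To see the "ending hole" the comments describe, one can inspect position increments directly. A minimal sketch (not from the test; PositionIncrementAttribute is the standard Lucene attribute for this):

Tokenizer tok = new MockTokenizer();
tok.setReader(new StringReader("wizard of oz"));
try (TokenStream ts = new StopFilter(tok, StopFilter.makeStopSet("of"))) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posInc = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        // Prints "wizard +1", then "oz +2": the increment of 2 is the
        // hole left where "of" was removed, which ShingleFilter renders as "_".
        System.out.println(term + " +" + posInc.getPositionIncrement());
    }
    ts.end();
}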

Example 40 with Tokenizer

Use of org.apache.lucene.analysis.Tokenizer in project lucene-solr by apache.

The class TestFreeTextSuggester, method testTwoEndingHoles:

// If the number of ending holes exceeds the n-grams window
// then there are no predictions, because ShingleFilter
// does not produce e.g. a hole-only "_ _" token:
public void testTwoEndingHoles() throws Exception {
    // Just deletes "of"
    Analyzer a = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String field) {
            Tokenizer tokenizer = new MockTokenizer();
            CharArraySet stopSet = StopFilter.makeStopSet("of");
            return new TokenStreamComponents(tokenizer, new StopFilter(tokenizer, stopSet));
        }
    };
    Iterable<Input> keys = AnalyzingSuggesterTest.shuffle(new Input("wizard of of oz", 50));
    FreeTextSuggester sug = new FreeTextSuggester(a, a, 3, (byte) 0x20);
    sug.build(new InputArrayIterator(keys));
    assertEquals("", toString(sug.lookup("wizard of of", 10)));
    a.close();
}
Also used: MockTokenizer (org.apache.lucene.analysis.MockTokenizer), CharArraySet (org.apache.lucene.analysis.CharArraySet), Input (org.apache.lucene.search.suggest.Input), InputArrayIterator (org.apache.lucene.search.suggest.InputArrayIterator), StopFilter (org.apache.lucene.analysis.StopFilter), Analyzer (org.apache.lucene.analysis.Analyzer), MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer), Tokenizer (org.apache.lucene.analysis.Tokenizer)
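
The shingling FreeTextSuggester performs internally can be approximated with a plain ShingleFilter (org.apache.lucene.analysis.shingle). A sketch, assuming the default "_" filler token, that illustrates why trailing holes yield no prediction here:

Tokenizer tok = new MockTokenizer();
tok.setReader(new StringReader("wizard of of oz"));
TokenStream stopped = new StopFilter(tok, StopFilter.makeStopSet("of"));
try (TokenStream shingles = new ShingleFilter(stopped, 2, 3)) {  // bigrams and trigrams
    CharTermAttribute term = shingles.addAttribute(CharTermAttribute.class);
    shingles.reset();
    while (shingles.incrementToken()) {
        // Holes appear as the filler "_" inside shingles such as
        // "wizard _", but per the comment above, a filler-only shingle
        // like "_ _" is never emitted, hence the empty lookup result.
        System.out.println(term.toString());
    }
    shingles.end();
}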

Aggregations

Tokenizer (org.apache.lucene.analysis.Tokenizer): 611
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 288
Analyzer (org.apache.lucene.analysis.Analyzer): 269
StringReader (java.io.StringReader): 264
TokenStream (org.apache.lucene.analysis.TokenStream): 245
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 216
Reader (java.io.Reader): 91
WhitespaceTokenizer (org.apache.lucene.analysis.core.WhitespaceTokenizer): 77
StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 73
StopFilter (org.apache.lucene.analysis.StopFilter): 56
SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter): 55
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 51
MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer): 47
CharArraySet (org.apache.lucene.analysis.CharArraySet): 44
StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 37
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 35
ESTestCase (org.elasticsearch.test.ESTestCase): 30
HashMap (java.util.HashMap): 24
TokenFilter (org.apache.lucene.analysis.TokenFilter): 24
Random (java.util.Random): 20