Example 21 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project lucene-solr-analysis-turkish by iorixxx.

the class TestTurkishDeASCIIfyFilter method testDeAscii2.

public void testDeAscii2() throws Exception {
    TokenStream stream = whitespaceMockTokenizer("tatlises akgunduz sakip cernobil baslattigi dayanikliklarini");
    stream = new TurkishDeASCIIfyFilter(stream, false);
    assertTokenStreamContents(stream, new String[] { "tatlıses", "akgündüz", "sakıp", "çernobil", "başlattığı", "dayanıklıklarını" });
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) TurkishDeASCIIfyFilter(org.apache.lucene.analysis.tr.TurkishDeASCIIfyFilter)
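
Outside of a test, the same filter would typically be wrapped in a custom Analyzer. The sketch below is illustrative only: it assumes nothing beyond the two-argument TurkishDeASCIIfyFilter constructor exercised above (a TokenStream plus a boolean flag) and otherwise uses standard Lucene classes (Analyzer, WhitespaceTokenizer, Tokenizer).

protected Analyzer makeTurkishDeAsciiAnalyzer() {
    return new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            // Split on whitespace, then restore Turkish diacritics from ASCII-folded text.
            Tokenizer tokenizer = new WhitespaceTokenizer();
            // The boolean mirrors the 'false' flag passed in the test above.
            TokenStream filtered = new TurkishDeASCIIfyFilter(tokenizer, false);
            return new TokenStreamComponents(tokenizer, filtered);
        }
    };
}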

Example 22 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project languagetool by languagetool-org.

the class LanguageToolAnalyzer method createComponents.

@Override
protected TokenStreamComponents createComponents(String s) {
    Tokenizer tokenizer = new AnyCharTokenizer();
    TokenStream result = new LanguageToolFilter(tokenizer, languageTool, toLowerCase);
    return new TokenStreamComponents(tokenizer, result);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) Tokenizer(org.apache.lucene.analysis.Tokenizer)
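
Whatever filters createComponents chains together, callers never invoke it directly; tokens are obtained through Analyzer.tokenStream and the standard TokenStream contract. A minimal consumption sketch, assuming an already-constructed analyzer; the field name "field" is a placeholder, and the helper name tokensOf is hypothetical.

static List<String> tokensOf(Analyzer analyzer, String text) throws IOException {
    List<String> tokens = new ArrayList<>();
    // Standard TokenStream lifecycle: reset -> incrementToken* -> end -> close,
    // with close handled here by try-with-resources.
    try (TokenStream stream = analyzer.tokenStream("field", text)) {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            tokens.add(term.toString());
        }
        stream.end(); // records final state (e.g. end offset) before close
    }
    return tokens;
}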

Example 23 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project che by eclipse.

the class LuceneSearcher method makeAnalyzer.

protected Analyzer makeAnalyzer() {
    return new Analyzer() {

        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer tokenizer = new WhitespaceTokenizer();
            TokenStream filter = new LowerCaseFilter(tokenizer);
            return new TokenStreamComponents(tokenizer, filter);
        }
    };
}
Also used : WhitespaceTokenizer(org.apache.lucene.analysis.core.WhitespaceTokenizer) TokenStream(org.apache.lucene.analysis.TokenStream) Analyzer(org.apache.lucene.analysis.Analyzer) Tokenizer(org.apache.lucene.analysis.Tokenizer) LowerCaseFilter(org.apache.lucene.analysis.core.LowerCaseFilter)
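
An analyzer built this way is normally handed to the index writer, which calls createComponents behind the scenes for each indexed field. A minimal indexing sketch under that assumption; the index path, field name, and sample text are placeholders.

void indexSample() throws IOException {
    Analyzer analyzer = makeAnalyzer();
    try (Directory dir = FSDirectory.open(Paths.get("/tmp/example-index"));
         IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(analyzer))) {
        Document doc = new Document();
        // Tokens are split on whitespace and lowercased at index time by the analyzer above.
        doc.add(new TextField("content", "Some Example TEXT", Field.Store.YES));
        writer.addDocument(doc);
    }
}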

Example 24 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project textdb by TextDB.

the class DataflowUtils method tokenizeQuery.

/**
     * Tokenizes the query string using the given analyzer.
     * 
     * @param luceneAnalyzer the Lucene analyzer used to build the token stream
     * @param query the query string to tokenize
     * @return ArrayList<String> list of resulting tokens
     */
public static ArrayList<String> tokenizeQuery(Analyzer luceneAnalyzer, String query) {
    ArrayList<String> result = new ArrayList<String>();
    TokenStream tokenStream = luceneAnalyzer.tokenStream(null, new StringReader(query));
    CharTermAttribute term = tokenStream.addAttribute(CharTermAttribute.class);
    try {
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            result.add(term.toString());
        }
        tokenStream.close();
    } catch (Exception e) {
        e.printStackTrace();
    }
    return result;
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) ArrayList(java.util.ArrayList) StringReader(java.io.StringReader) DataFlowException(edu.uci.ics.textdb.api.exception.DataFlowException) IOException(java.io.IOException)
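
A hypothetical call site for the helper above, using Lucene's stock StandardAnalyzer. Note that the helper only calls close() inside the try block; stricter code would call end() and close the stream in a finally block or via try-with-resources.

try (Analyzer analyzer = new StandardAnalyzer()) {
    ArrayList<String> tokens = DataflowUtils.tokenizeQuery(analyzer, "New York city hotels");
    // With StandardAnalyzer this typically yields [new, york, city, hotels].
}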

Example 25 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

the class TestSuggestField method testTokenStream.

@Test
public void testTokenStream() throws Exception {
    Analyzer analyzer = new MockAnalyzer(random());
    SuggestField suggestField = new SuggestField("field", "input", 1);
    BytesRef surfaceForm = new BytesRef("input");
    ByteArrayOutputStream byteArrayOutputStream = new ByteArrayOutputStream();
    try (OutputStreamDataOutput output = new OutputStreamDataOutput(byteArrayOutputStream)) {
        output.writeVInt(surfaceForm.length);
        output.writeBytes(surfaceForm.bytes, surfaceForm.offset, surfaceForm.length);
        output.writeVInt(1 + 1);
        output.writeByte(SuggestField.TYPE);
    }
    BytesRef payload = new BytesRef(byteArrayOutputStream.toByteArray());
    TokenStream stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(suggestField.tokenStream(analyzer, null));
    assertTokenStreamContents(stream, new String[] { "input" }, null, null, new String[] { payload.utf8ToString() }, new int[] { 1 }, null, null);
    CompletionAnalyzer completionAnalyzer = new CompletionAnalyzer(analyzer);
    stream = new CompletionTokenStreamTest.PayloadAttrToTypeAttrFilter(suggestField.tokenStream(completionAnalyzer, null));
    assertTokenStreamContents(stream, new String[] { "input" }, null, null, new String[] { payload.utf8ToString() }, new int[] { 1 }, null, null);
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) OutputStreamDataOutput(org.apache.lucene.store.OutputStreamDataOutput) ByteArrayOutputStream(java.io.ByteArrayOutputStream) Analyzer(org.apache.lucene.analysis.Analyzer) MockAnalyzer(org.apache.lucene.analysis.MockAnalyzer) BytesRef(org.apache.lucene.util.BytesRef) Test(org.junit.Test)

Aggregations

TokenStream (org.apache.lucene.analysis.TokenStream): 848
StringReader (java.io.StringReader): 336
Tokenizer (org.apache.lucene.analysis.Tokenizer): 244
Reader (java.io.Reader): 175
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 140
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 128
Analyzer (org.apache.lucene.analysis.Analyzer): 121
CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream): 94
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 88
IOException (java.io.IOException): 85
StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 73
Term (org.apache.lucene.index.Term): 66
Document (org.apache.lucene.document.Document): 64
StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 59
ArrayList (java.util.ArrayList): 58
StopFilter (org.apache.lucene.analysis.StopFilter): 58
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 57
SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter): 53
Test (org.junit.Test): 53
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 46