Example 76 with TokenStream

Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

In class TestTermAutomatonQuery, method testAnyFromTokenStream:

public void testAnyFromTokenStream() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    doc.add(newTextField("field", "here comes the sun", Field.Store.NO));
    w.addDocument(doc);
    doc = new Document();
    doc.add(newTextField("field", "here comes the moon", Field.Store.NO));
    w.addDocument(doc);
    doc = new Document();
    doc.add(newTextField("field", "here comes sun", Field.Store.NO));
    w.addDocument(doc);
    // Should not match:
    doc = new Document();
    doc.add(newTextField("field", "here comes the other sun", Field.Store.NO));
    w.addDocument(doc);
    IndexReader r = w.getReader();
    IndexSearcher s = newSearcher(r);
    TokenStream ts = new CannedTokenStream(new Token[] {
        token("comes", 1, 1),
        // posInc=0 stacks this "comes" on the same position; posLength=2 lets it span two positions
        token("comes", 0, 2),
        // "*" maps to an any-term transition (this is what "AnyFromTokenStream" tests)
        token("*", 1, 1),
        token("sun", 1, 1),
        // "moon" is stacked on the same position as "sun", i.e. an OR
        token("moon", 0, 1)
    });
    TermAutomatonQuery q = new TokenStreamToTermAutomatonQuery().toQuery("field", ts);
    // System.out.println("DOT: " + q.toDot());
    assertEquals(3, s.search(q, 1).totalHits);
    w.close();
    r.close();
    dir.close();
}
Also used: CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), TokenStream (org.apache.lucene.analysis.TokenStream), IndexReader (org.apache.lucene.index.IndexReader), Document (org.apache.lucene.document.Document), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter), Directory (org.apache.lucene.store.Directory)
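
The tests in this class build their streams with a token(...) helper that the excerpt does not show. A plausible sketch, inferred from the call sites (the arguments are term, positionIncrement, positionLength):

private static Token token(String term, int posInc, int posLength) {
    // Offsets simply cover the whole term text.
    Token t = new Token(term, 0, term.length());
    // posInc == 0 stacks this token on the previous position ("moon" above);
    // posLength > 1 lets it span several positions (the second "comes" above).
    t.setPositionIncrement(posInc);
    t.setPositionLength(posLength);
    return t;
}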

Example 77 with TokenStream

Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

In class TestTermAutomatonQuery, method testTermDoesNotExist:

public void testTermDoesNotExist() throws Exception {
    Directory dir = newDirectory();
    RandomIndexWriter w = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    doc.add(newTextField("field", "x y z", Field.Store.NO));
    w.addDocument(doc);
    IndexReader r = w.getReader();
    IndexSearcher s = newSearcher(r);
    TokenStream ts = new CannedTokenStream(new Token[] { token("a", 1, 1) });
    TermAutomatonQuery q = new TokenStreamToTermAutomatonQuery().toQuery("field", ts);
    // System.out.println("DOT: " + q.toDot());
    assertEquals(0, s.search(q, 1).totalHits);
    w.close();
    r.close();
    dir.close();
}
Also used: CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), TokenStream (org.apache.lucene.analysis.TokenStream), IndexReader (org.apache.lucene.index.IndexReader), Document (org.apache.lucene.document.Document), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter), Directory (org.apache.lucene.store.Directory)
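
For comparison, a hedged sketch (not part of the test) that builds an equivalent single-term automaton directly on TermAutomatonQuery, without going through a TokenStream:

TermAutomatonQuery q = new TermAutomatonQuery("field");
int init = q.createState();
int accept = q.createState();
q.setAccept(accept, true);
// One transition labeled with the term "a".
q.addTransition(init, accept, "a");
// finish() compiles the automaton; it must be called before searching.
q.finish();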

Example 78 with TokenStream

Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

In class ICUCollationField, method getCollationKey:

/**
   * Analyze the text with the analyzer instead of the collator.
   * Because ICU collators are not thread safe, this keeps things
   * simple (we already have a thread-local clone in the reused TS).
   */
private BytesRef getCollationKey(String field, String text) {
    try (TokenStream source = analyzer.tokenStream(field, text)) {
        source.reset();
        TermToBytesRefAttribute termAtt = source.getAttribute(TermToBytesRefAttribute.class);
        // we control the analyzer here: most errors are impossible
        if (!source.incrementToken())
            throw new IllegalArgumentException("analyzer returned no terms for text: " + text);
        BytesRef bytes = BytesRef.deepCopyOf(termAtt.getBytesRef());
        assert !source.incrementToken();
        source.end();
        return bytes;
    } catch (IOException e) {
        throw new RuntimeException("Unable to analyze text: " + text, e);
    }
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), TermToBytesRefAttribute (org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute), IOException (java.io.IOException), BytesRef (org.apache.lucene.util.BytesRef)
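
The analyzer consumed here is the field type's collation analyzer. A minimal sketch of how such an analyzer can be constructed, assuming the stock ICUCollationKeyAnalyzer from the lucene analysis/icu module (the exact Solr wiring differs):

import com.ibm.icu.text.Collator;
import com.ibm.icu.util.ULocale;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.collation.ICUCollationKeyAnalyzer;

Collator collator = Collator.getInstance(ULocale.GERMAN);
// ICUCollationKeyAnalyzer emits each value as a single token whose bytes
// are the ICU collation key, which getCollationKey then deep-copies out.
Analyzer analyzer = new ICUCollationKeyAnalyzer(collator);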

Example 79 with TokenStream

Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

In class SpellingQueryConverter, method analyze:

protected void analyze(Collection<Token> result, String text, int offset, int flagsAttValue) throws IOException {
    TokenStream stream = analyzer.tokenStream("", text);
    // TODO: support custom attributes
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    TypeAttribute typeAtt = stream.addAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = stream.addAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        Token token = new Token();
        token.copyBuffer(termAtt.buffer(), 0, termAtt.length());
        token.setOffset(offset + offsetAtt.startOffset(), offset + offsetAtt.endOffset());
        // overwriting any flags already set...
        token.setFlags(flagsAttValue);
        token.setType(typeAtt.type());
        token.setPayload(payloadAtt.getPayload());
        token.setPositionIncrement(posIncAtt.getPositionIncrement());
        result.add(token);
    }
    stream.end();
    stream.close();
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), Token (org.apache.lucene.analysis.Token), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)
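
One caveat: if incrementToken() throws, the stream above is never closed. A minimal sketch of the same consume loop written with try-with-resources (names assumed from the method above; this is not the Solr code):

try (TokenStream stream = analyzer.tokenStream("", text)) {
    CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);
    stream.reset();
    while (stream.incrementToken()) {
        // Read the attributes here, as the loop above does.
        System.out.println(termAtt.toString());
    }
    // end() must still be called to finish offset/position bookkeeping.
    stream.end();
}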

Example 80 with TokenStream

Use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

In class TestGraphTokenStreamFiniteStrings, method testEmpty:

public void testEmpty() throws Exception {
    TokenStream ts = new CannedTokenStream();
    GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(ts);
    Iterator<TokenStream> it = graph.getFiniteStrings();
    assertFalse(it.hasNext());
    assertArrayEquals(new int[0], graph.articulationPoints());
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream)
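
For contrast, a hedged sketch (not part of the test) of the non-empty case: a single-token stream yields exactly one finite string:

Token t = new Token("fast", 0, 4);
t.setPositionIncrement(1);
t.setPositionLength(1);
TokenStream single = new CannedTokenStream(t);
GraphTokenStreamFiniteStrings g = new GraphTokenStreamFiniteStrings(single);
Iterator<TokenStream> strings = g.getFiniteStrings();
assertTrue(strings.hasNext());
strings.next();
assertFalse(strings.hasNext());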

Aggregations

TokenStream (org.apache.lucene.analysis.TokenStream): 849
StringReader (java.io.StringReader): 337
Tokenizer (org.apache.lucene.analysis.Tokenizer): 244
Reader (java.io.Reader): 175
CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 141
MockTokenizer (org.apache.lucene.analysis.MockTokenizer): 128
Analyzer (org.apache.lucene.analysis.Analyzer): 121
CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream): 94
LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter): 88
IOException (java.io.IOException): 86
StandardFilter (org.apache.lucene.analysis.standard.StandardFilter): 73
Term (org.apache.lucene.index.Term): 66
Document (org.apache.lucene.document.Document): 64
ArrayList (java.util.ArrayList): 59
StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer): 59
StopFilter (org.apache.lucene.analysis.StopFilter): 58
KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer): 57
SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter): 53
Test (org.junit.Test): 53
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 47