
Example 61 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

The class TestSimplePatternSplitTokenizer, method testBigLookahead.

public void testBigLookahead() throws Exception {
    // Build the split pattern: 100 'a' characters followed by a single 'b'.
    StringBuilder b = new StringBuilder();
    for (int i = 0; i < 100; i++) {
        b.append('a');
    }
    b.append('b');
    Tokenizer t = new SimplePatternSplitTokenizer(b.toString());
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    // Input: 200 'a' characters. The pattern never completes a match, so no
    // split happens and the whole input must come back as a single token.
    b = new StringBuilder();
    for (int i = 0; i < 200; i++) {
        b.append('a');
    }
    t.setReader(new StringReader(b.toString()));
    t.reset();
    assertTrue(t.incrementToken());
    assertEquals(b.toString(), termAtt.toString());
    assertFalse(t.incrementToken());
}
Also used : CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader) Tokenizer(org.apache.lucene.analysis.Tokenizer)

Example 62 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

The class TestSimplePatternSplitTokenizer, method testEndOffset.

public void testEndOffset() throws Exception {
    // Runs of 'a' are the split pattern, so only "bbb" is emitted as a token.
    Tokenizer t = new SimplePatternSplitTokenizer("a+");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = t.getAttribute(OffsetAttribute.class);
    t.setReader(new StringReader("aaabbb"));
    t.reset();
    assertTrue(t.incrementToken());
    assertEquals("bbb", termAtt.toString());
    assertFalse(t.incrementToken());
    // After end(), the offset attribute reports the end of the full input (6).
    t.end();
    assertEquals(6, offsetAtt.endOffset());
}
Also used : CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) StringReader(java.io.StringReader) Tokenizer(org.apache.lucene.analysis.Tokenizer)

Example 63 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

The class TestSimplePatternSplitTokenizer, method tsToString.

/** 
   * TODO: rewrite tests not to use string comparison.
   */
private static String tsToString(TokenStream in) throws IOException {
    StringBuilder out = new StringBuilder();
    CharTermAttribute termAtt = in.addAttribute(CharTermAttribute.class);
    // extra safety: clear the attributes and assign bogus values to ensure the
    // stream does not rely on state being preserved between calls
    in.clearAttributes();
    termAtt.setEmpty().append("bogusTerm");
    in.reset();
    while (in.incrementToken()) {
        if (out.length() > 0) {
            out.append(' ');
        }
        out.append(termAtt.toString());
        in.clearAttributes();
        termAtt.setEmpty().append("bogusTerm");
    }
    in.close();
    return out.toString();
}
Also used : CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute)
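
For context, a hypothetical call site for this helper could look like the sketch below; the input string and expected result are illustrative and not taken from the original test class:

Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
t.setReader(new StringReader("a \tb   c"));
// tsToString consumes the stream and joins the emitted terms with single spaces.
assertEquals("a b c", tsToString(t));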

Example 64 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

The class TestSimplePatternSplitTokenizer, method testSplitMultiCharWhitespace.

public void testSplitMultiCharWhitespace() throws Exception {
    // Split on runs of whitespace characters (space, tab, CR, LF).
    Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    t.setReader(new StringReader("a \tb   c"));
    // assertTokenStreamContents consumes the stream and verifies the terms
    // together with their start and end offsets.
    assertTokenStreamContents(t, new String[] { "a", "b", "c" }, new int[] { 0, 3, 7 }, new int[] { 1, 4, 8 });
}
Also used : CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader) Tokenizer(org.apache.lucene.analysis.Tokenizer)

Example 65 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

The class TestSimplePatternSplitTokenizer, method testLeadingNonToken.

public void testLeadingNonToken() throws Exception {
    Tokenizer t = new SimplePatternSplitTokenizer("[ \t\r\n]*");
    CharTermAttribute termAtt = t.getAttribute(CharTermAttribute.class);
    // Leading whitespace is consumed by the split pattern, so the first token
    // ("a") starts at offset 4.
    t.setReader(new StringReader("    a c"));
    assertTokenStreamContents(t, new String[] { "a", "c" }, new int[] { 4, 6 }, new int[] { 5, 7 });
}
Also used : CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StringReader(java.io.StringReader) Tokenizer(org.apache.lucene.analysis.Tokenizer)
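
All of the tests above follow the same TokenStream consumption contract: obtain the attributes, setReader, reset, loop over incrementToken, then end and close. The stand-alone sketch below illustrates that pattern; the class and method names (TokenDumpSketch, printTokens) are illustrative and not part of lucene-solr:

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.pattern.SimplePatternSplitTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;

public class TokenDumpSketch {

    // Illustrative helper: consume any Tokenizer via the standard
    // reset / incrementToken / end / close lifecycle and print each term
    // together with its start and end offsets.
    static void printTokens(Tokenizer t, String input) throws IOException {
        CharTermAttribute termAtt = t.addAttribute(CharTermAttribute.class);
        OffsetAttribute offsetAtt = t.addAttribute(OffsetAttribute.class);
        t.setReader(new StringReader(input));
        t.reset();
        while (t.incrementToken()) {
            System.out.println(termAtt.toString() + " [" + offsetAtt.startOffset() + ", " + offsetAtt.endOffset() + ")");
        }
        // end() finalizes offsets, e.g. the final endOffset checked in testEndOffset.
        t.end();
        t.close();
    }

    public static void main(String[] args) throws IOException {
        // Same pattern and input as testSplitMultiCharWhitespace above.
        printTokens(new SimplePatternSplitTokenizer("[ \t\r\n]*"), "a \tb   c");
    }
}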

Aggregations

CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 151
TokenStream (org.apache.lucene.analysis.TokenStream): 95
StringReader (java.io.StringReader): 46
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 35
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 34
IOException (java.io.IOException): 27
ArrayList (java.util.ArrayList): 27
Tokenizer (org.apache.lucene.analysis.Tokenizer): 25
Analyzer (org.apache.lucene.analysis.Analyzer): 20
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 16
BytesRef (org.apache.lucene.util.BytesRef): 15
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 13
LinkedList (java.util.LinkedList): 11
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 10
Term (org.apache.lucene.index.Term): 10
HashMap (java.util.HashMap): 9
Token (org.apache.lucene.analysis.Token): 8
Document (org.apache.lucene.document.Document): 8
List (java.util.List): 7
HashSet (java.util.HashSet): 6