Search in sources :

Example 41 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in the project lucene-solr by Apache.

From the class PreAnalyzedFieldTest, method testInvalidJson.

/**
 * Verifies that the pre-analyzed field analyzer rejects each malformed JSON input
 * (an exception must be raised no later than {@code reset()}), and that it still
 * tokenizes well-formed input afterwards without producing zero-length tokens.
 */
public void testInvalidJson() throws Exception {
    PreAnalyzedField paf = new PreAnalyzedField();
    paf.init(h.getCore().getLatestSchema(), Collections.emptyMap());
    Analyzer preAnalyzer = paf.getIndexAnalyzer();
    for (String s : invalidJson) {
        // try-with-resources closes the stream even when reset() throws,
        // matching the resource-handling style used elsewhere in these tests.
        try (TokenStream stream = preAnalyzer.tokenStream("dummy", s)) {
            // exception should be triggered here.
            stream.reset();
            // fail() throws AssertionError (an Error), so the catch below
            // cannot accidentally swallow a missing-exception failure.
            fail("should fail: '" + s + "'");
        } catch (Exception e) {
        // expected: malformed pre-analyzed JSON must be rejected
        }
    }
    // make sure the analyzer can now handle properly formatted input;
    // try-with-resources also closes this stream if an assertion fails mid-loop
    try (TokenStream stream = preAnalyzer.tokenStream("dummy", validJson)) {
        CharTermAttribute termAttr = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            assertFalse("zero-length token", termAttr.length() == 0);
        }
        stream.end();
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) Analyzer(org.apache.lucene.analysis.Analyzer)

Example 42 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in the project lucene-solr by Apache.

From the class TestStopAnalyzer, method testStopList.

/**
 * Verifies that a {@link StopAnalyzer} built from a custom stop set never emits
 * a token contained in that set.
 */
public void testStopList() throws IOException {
    CharArraySet stopWordsSet = new CharArraySet(asSet("good", "test", "analyzer"), false);
    // Both the analyzer and its stream are AutoCloseable; the original closed the
    // analyzer outside the try, leaking it whenever an assertion fired inside.
    try (StopAnalyzer newStop = new StopAnalyzer(stopWordsSet);
         TokenStream stream = newStop.tokenStream("test", "This is a good test of the english stop analyzer")) {
        assertNotNull(stream);
        CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            String text = termAtt.toString();
            // no emitted token may be one of the configured stop words
            assertFalse(stopWordsSet.contains(text));
        }
        stream.end();
    }
}
Also used : CharArraySet(org.apache.lucene.analysis.CharArraySet) TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute)

Example 43 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in the project lucene-solr by Apache.

From the class TestTypeTokenFilter, method testPositons.

/**
 * Consumes the given filter and checks that every surviving token carries a
 * position increment of 3 (the value expected when position increments are
 * enabled on the filter under test).
 *
 * @param stpf the filter to drain; it is reset, consumed, ended and closed here
 */
private void testPositons(TypeTokenFilter stpf) throws IOException {
    TypeAttribute typeAtt = stpf.getAttribute(TypeAttribute.class);
    CharTermAttribute termAttribute = stpf.getAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncrAtt = stpf.getAttribute(PositionIncrementAttribute.class);
    stpf.reset();
    while (stpf.incrementToken()) {
        log("Token: " + termAttribute.toString() + ": " + typeAtt.type() + " - " + posIncrAtt.getPositionIncrement());
        // JUnit's assertEquals takes (message, expected, actual); the original had
        // expected and actual swapped, which yields a misleading failure message.
        assertEquals("if position increment is enabled the positionIncrementAttribute value should be 3, otherwise 1", 3, posIncrAtt.getPositionIncrement());
    }
    stpf.end();
    stpf.close();
}
Also used : CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 44 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in the project lucene-solr by Apache.

From the class TestTeeSinkTokenFilter, method performance.

/**
   * Not an explicit test, just useful to print out some info on performance
   */
@SuppressWarnings("resource")
public void performance() throws Exception {
    // Token counts per outer run and the modulo divisors to benchmark against.
    int[] tokCount = { 100, 500, 1000, 2000, 5000, 10000 };
    int[] modCounts = { 1, 2, 5, 10, 20, 50, 100, 200, 500 };
    for (int k = 0; k < tokCount.length; k++) {
        StringBuilder buffer = new StringBuilder();
        System.out.println("-----Tokens: " + tokCount[k] + "-----");
        // Build a synthetic document of tokCount[k] upper-cased English number words.
        for (int i = 0; i < tokCount[k]; i++) {
            buffer.append(English.intToEnglish(i).toUpperCase(Locale.ROOT)).append(' ');
        }
        //make sure we produce the same tokens
        // Sanity pass: a tee'd sink (consumed via consumeAllTokens) must yield the
        // same token sequence as tokenizing the buffer directly. Note the sink
        // must be created BEFORE consumeAllTokens() so it captures the tokens.
        TeeSinkTokenFilter teeStream = new TeeSinkTokenFilter(new StandardFilter(standardTokenizer(buffer)));
        TokenStream sink = new ModuloTokenFilter(teeStream.newSinkTokenStream(), 100);
        teeStream.consumeAllTokens();
        TokenStream stream = new ModuloTokenFilter(new StandardFilter(standardTokenizer(buffer)), 100);
        CharTermAttribute tfTok = stream.addAttribute(CharTermAttribute.class);
        CharTermAttribute sinkTok = sink.addAttribute(CharTermAttribute.class);
        for (int i = 0; stream.incrementToken(); i++) {
            assertTrue(sink.incrementToken());
            // CharTermAttribute.equals compares term text; both must match per position
            assertTrue(tfTok + " is not equal to " + sinkTok + " at token: " + i, tfTok.equals(sinkTok) == true);
        }
        //simulate two fields, each being analyzed once, for 20 documents
        for (int j = 0; j < modCounts.length; j++) {
            int tfPos = 0;
            long start = System.currentTimeMillis();
            // Baseline: tokenize the buffer twice per "document" (full + modulo'd),
            // accumulating position increments so the work can't be optimized away.
            for (int i = 0; i < 20; i++) {
                stream = new StandardFilter(standardTokenizer(buffer));
                PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
                while (stream.incrementToken()) {
                    tfPos += posIncrAtt.getPositionIncrement();
                }
                stream = new ModuloTokenFilter(new StandardFilter(standardTokenizer(buffer)), modCounts[j]);
                posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
                while (stream.incrementToken()) {
                    tfPos += posIncrAtt.getPositionIncrement();
                }
            }
            long finish = System.currentTimeMillis();
            System.out.println("ModCount: " + modCounts[j] + " Two fields took " + (finish - start) + " ms");
            int sinkPos = 0;
            //simulate one field with one sink
            // Tee variant: tokenize once, replay the captured tokens through the sink.
            start = System.currentTimeMillis();
            for (int i = 0; i < 20; i++) {
                teeStream = new TeeSinkTokenFilter(new StandardFilter(standardTokenizer(buffer)));
                sink = new ModuloTokenFilter(teeStream.newSinkTokenStream(), modCounts[j]);
                PositionIncrementAttribute posIncrAtt = teeStream.getAttribute(PositionIncrementAttribute.class);
                while (teeStream.incrementToken()) {
                    sinkPos += posIncrAtt.getPositionIncrement();
                }
                //System.out.println("Modulo--------");
                posIncrAtt = sink.getAttribute(PositionIncrementAttribute.class);
                while (sink.incrementToken()) {
                    sinkPos += posIncrAtt.getPositionIncrement();
                }
            }
            finish = System.currentTimeMillis();
            System.out.println("ModCount: " + modCounts[j] + " Tee fields took " + (finish - start) + " ms");
            // Both strategies must have consumed the same total position increments.
            assertTrue(sinkPos + " does not equal: " + tfPos, sinkPos == tfPos);
        }
        System.out.println("- End Tokens: " + tokCount[k] + "-----");
    }
}
Also used : TokenStream(org.apache.lucene.analysis.TokenStream) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) StandardFilter(org.apache.lucene.analysis.standard.StandardFilter) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Example 45 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in the project lucene-solr by Apache.

From the class TestSnowball, method testFilterTokens.

/**
 * Verifies that {@link SnowballFilter} stems the term text ("accents" -&gt; "accent")
 * while passing every other token attribute (offsets, type, payload, position
 * increment, flags) through unchanged from the wrapped {@code TestTokenStream}.
 */
public void testFilterTokens() throws Exception {
    SnowballFilter filter = new SnowballFilter(new TestTokenStream(), "English");
    CharTermAttribute termAtt = filter.getAttribute(CharTermAttribute.class);
    OffsetAttribute offsetAtt = filter.getAttribute(OffsetAttribute.class);
    TypeAttribute typeAtt = filter.getAttribute(TypeAttribute.class);
    PayloadAttribute payloadAtt = filter.getAttribute(PayloadAttribute.class);
    PositionIncrementAttribute posIncAtt = filter.getAttribute(PositionIncrementAttribute.class);
    FlagsAttribute flagsAtt = filter.getAttribute(FlagsAttribute.class);
    // The original ignored this return value; if the stream were unexpectedly
    // empty, the assertions below would read stale attribute state.
    assertTrue(filter.incrementToken());
    assertEquals("accent", termAtt.toString());
    assertEquals(2, offsetAtt.startOffset());
    assertEquals(7, offsetAtt.endOffset());
    assertEquals("wrd", typeAtt.type());
    assertEquals(3, posIncAtt.getPositionIncrement());
    assertEquals(77, flagsAtt.getFlags());
    assertEquals(new BytesRef(new byte[] { 0, 1, 2, 3 }), payloadAtt.getPayload());
}
Also used : PayloadAttribute(org.apache.lucene.analysis.tokenattributes.PayloadAttribute) FlagsAttribute(org.apache.lucene.analysis.tokenattributes.FlagsAttribute) CharTermAttribute(org.apache.lucene.analysis.tokenattributes.CharTermAttribute) TypeAttribute(org.apache.lucene.analysis.tokenattributes.TypeAttribute) OffsetAttribute(org.apache.lucene.analysis.tokenattributes.OffsetAttribute) BytesRef(org.apache.lucene.util.BytesRef) PositionIncrementAttribute(org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)

Aggregations

CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)151 TokenStream (org.apache.lucene.analysis.TokenStream)95 StringReader (java.io.StringReader)46 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)35 PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute)34 IOException (java.io.IOException)27 ArrayList (java.util.ArrayList)27 Tokenizer (org.apache.lucene.analysis.Tokenizer)25 Analyzer (org.apache.lucene.analysis.Analyzer)20 PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute)16 BytesRef (org.apache.lucene.util.BytesRef)15 TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute)13 LinkedList (java.util.LinkedList)11 FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute)10 Term (org.apache.lucene.index.Term)10 HashMap (java.util.HashMap)9 Token (org.apache.lucene.analysis.Token)8 Document (org.apache.lucene.document.Document)8 List (java.util.List)7 HashSet (java.util.HashSet)6