Example 31 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

From the class TestJapaneseTokenizer, method testSurrogates2.

/** Random test ensuring we never split supplementary (surrogate-pair) characters. */
public void testSurrogates2() throws IOException {
    int numIterations = atLeast(10000);
    for (int i = 0; i < numIterations; i++) {
        if (VERBOSE) {
            System.out.println("\nTEST: iter=" + i);
        }
        String s = TestUtil.randomUnicodeString(random(), 100);
        try (TokenStream ts = analyzer.tokenStream("foo", s)) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();
            while (ts.incrementToken()) {
                assertTrue(UnicodeUtil.validUTF16String(termAtt));
            }
            ts.end();
        }
    }
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)
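Both Japanese tokenizer examples follow the same consumption contract: reset the stream before the first incrementToken(), call end() after the last token, and close the stream when done. A minimal self-contained sketch of that loop; the StandardAnalyzer, field name, and input text are assumptions chosen purely for illustration (the tests above use JapaneseTokenizer-based analyzers):

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ConsumeTokenStream {
    public static void main(String[] args) throws IOException {
        // Assumption: any Analyzer works here; StandardAnalyzer is illustrative.
        try (Analyzer analyzer = new StandardAnalyzer();
             TokenStream ts = analyzer.tokenStream("body", "some sample text")) {
            CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
            ts.reset();                   // 1) mandatory before the first incrementToken()
            while (ts.incrementToken()) { // 2) advance token by token
                System.out.println(termAtt.toString());
            }
            ts.end();                     // 3) record end-of-stream state (e.g. final offset)
        }                                 // 4) try-with-resources closes stream, then analyzer
    }
}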

Example 32 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

From the class TestJapaneseTokenizer, method makeTokenList.

private ArrayList<String> makeTokenList(Analyzer a, String in) throws Exception {
    ArrayList<String> list = new ArrayList<>();
    TokenStream ts = a.tokenStream("dummy", in);
    CharTermAttribute termAtt = ts.getAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        list.add(termAtt.toString());
    }
    ts.end();
    ts.close();
    return list;
}
Also used: TokenStream (org.apache.lucene.analysis.TokenStream), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), ArrayList (java.util.ArrayList)
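A hypothetical call site for this helper; the JapaneseAnalyzer and the input string are illustrative assumptions, not taken from the test:

// Hypothetical usage (analyzer and input are assumptions for illustration).
Analyzer a = new JapaneseAnalyzer();
ArrayList<String> tokens = makeTokenList(a, "東京都に住む");
// tokens now holds the emitted surface forms, in stream order.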

Example 33 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

From the class TestCachingTokenFilter, method checkTokens.

private void checkTokens(TokenStream stream) throws IOException {
    int count = 0;
    CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);
    while (stream.incrementToken()) {
        assertTrue(count < tokens.length);
        assertEquals(tokens[count], termAtt.toString());
        count++;
    }
    assertEquals(tokens.length, count);
}
Also used: CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)
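One detail worth noting: getAttribute(CharTermAttribute.class) throws IllegalArgumentException if the stream never registered that attribute, so this helper assumes the producer added it. When that is not guaranteed, addAttribute is the safer lookup; a one-line sketch:

// addAttribute is idempotent: it returns the existing instance when the
// attribute is already registered, and creates one otherwise.
CharTermAttribute termAtt = stream.addAttribute(CharTermAttribute.class);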

Example 34 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

From the class TestCachingTokenFilter, method testCaching.

public void testCaching() throws IOException {
    Directory dir = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), dir);
    Document doc = new Document();
    AtomicInteger resetCount = new AtomicInteger(0);
    TokenStream stream = new TokenStream() {

        private int index = 0;

        private CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

        private OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

        @Override
        public void reset() throws IOException {
            super.reset();
            resetCount.incrementAndGet();
        }

        @Override
        public boolean incrementToken() {
            if (index == tokens.length) {
                return false;
            } else {
                clearAttributes();
                termAtt.append(tokens[index++]);
                offsetAtt.setOffset(0, 0);
                return true;
            }
        }
    };
    stream = new CachingTokenFilter(stream);
    doc.add(new TextField("preanalyzed", stream));
    // 1) we consume all tokens twice before we add the doc to the index
    assertFalse(((CachingTokenFilter) stream).isCached());
    stream.reset();
    assertFalse(((CachingTokenFilter) stream).isCached());
    checkTokens(stream);
    stream.reset();
    checkTokens(stream);
    assertTrue(((CachingTokenFilter) stream).isCached());
    // 2) now add the document to the index and verify if all tokens are indexed
    //    don't reset the stream here, the DocumentWriter should do that implicitly
    writer.addDocument(doc);
    IndexReader reader = writer.getReader();
    PostingsEnum termPositions = MultiFields.getTermPositionsEnum(reader, "preanalyzed", new BytesRef("term1"));
    assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(1, termPositions.freq());
    assertEquals(0, termPositions.nextPosition());
    termPositions = MultiFields.getTermPositionsEnum(reader, "preanalyzed", new BytesRef("term2"));
    assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(2, termPositions.freq());
    assertEquals(1, termPositions.nextPosition());
    assertEquals(3, termPositions.nextPosition());
    termPositions = MultiFields.getTermPositionsEnum(reader, "preanalyzed", new BytesRef("term3"));
    assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(1, termPositions.freq());
    assertEquals(2, termPositions.nextPosition());
    reader.close();
    writer.close();
    // 3) reset stream and consume tokens again
    stream.reset();
    checkTokens(stream);
    assertEquals(1, resetCount.get());
    dir.close();
}
Also used: CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), AtomicInteger (java.util.concurrent.atomic.AtomicInteger), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), IndexReader (org.apache.lucene.index.IndexReader), TextField (org.apache.lucene.document.TextField), Document (org.apache.lucene.document.Document), PostingsEnum (org.apache.lucene.index.PostingsEnum), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter), BytesRef (org.apache.lucene.util.BytesRef), Directory (org.apache.lucene.store.Directory)
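The pattern the test verifies, stripped to its essentials: the wrapped stream is consumed and reset exactly once (hence resetCount == 1 above), and every later reset() merely rewinds the cached token list. A minimal sketch of that reuse, assuming an Analyzer named analyzer is in scope; the field name and text are illustrative:

// Replay a stream twice via CachingTokenFilter; the underlying stream is
// only consumed on the first pass (field/text are illustrative assumptions).
try (TokenStream raw = analyzer.tokenStream("f", "one two three");
     CachingTokenFilter cached = new CachingTokenFilter(raw)) {
    CharTermAttribute term = cached.addAttribute(CharTermAttribute.class);
    for (int pass = 0; pass < 2; pass++) {
        cached.reset();                    // second pass replays the cache
        while (cached.incrementToken()) {
            System.out.println(pass + ": " + term);
        }
        cached.end();
    }
}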

Example 35 with CharTermAttribute

Use of org.apache.lucene.analysis.tokenattributes.CharTermAttribute in project lucene-solr by apache.

From the class TestPositionIncrement, method testSetPosition.

public void testSetPosition() throws Exception {
    Analyzer analyzer = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            return new TokenStreamComponents(new Tokenizer() {

                // TODO: use CannedTokenStream
                private final String[] TOKENS = { "1", "2", "3", "4", "5" };

                private final int[] INCREMENTS = { 1, 2, 1, 0, 1 };

                private int i = 0;

                PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);

                CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);

                OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

                @Override
                public boolean incrementToken() {
                    if (i == TOKENS.length)
                        return false;
                    clearAttributes();
                    termAtt.append(TOKENS[i]);
                    offsetAtt.setOffset(i, i);
                    posIncrAtt.setPositionIncrement(INCREMENTS[i]);
                    i++;
                    return true;
                }

                @Override
                public void reset() throws IOException {
                    super.reset();
                    this.i = 0;
                }
            });
        }
    };
    Directory store = newDirectory();
    RandomIndexWriter writer = new RandomIndexWriter(random(), store, analyzer);
    Document d = new Document();
    d.add(newTextField("field", "bogus", Field.Store.YES));
    writer.addDocument(d);
    IndexReader reader = writer.getReader();
    writer.close();
    IndexSearcher searcher = newSearcher(reader);
    PostingsEnum pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(), "field", new BytesRef("1"));
    pos.nextDoc();
    // first token should be at position 0
    assertEquals(0, pos.nextPosition());
    pos = MultiFields.getTermPositionsEnum(searcher.getIndexReader(), "field", new BytesRef("2"));
    pos.nextDoc();
    // second token should be at position 2
    assertEquals(2, pos.nextPosition());
    PhraseQuery q;
    ScoreDoc[] hits;
    q = new PhraseQuery("field", "1", "2");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    // same as previous, using the builder with implicit positions
    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "1"));
    builder.add(new Term("field", "2"));
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    // same as previous, just specifying positions explicitly.
    builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "1"), 0);
    builder.add(new Term("field", "2"), 1);
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    // specifying correct positions should find the phrase.
    builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "1"), 0);
    builder.add(new Term("field", "2"), 2);
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "2", "3");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "3", "4");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    // a phrase query finds it when the correct positions are specified
    // (increment 0 stacked "3" and "4" at the same position).
    builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "3"), 0);
    builder.add(new Term("field", "4"), 0);
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    // a phrase query should fail for a term that does not exist in the index,
    // even if another queried term matches at the same position.
    builder = new PhraseQuery.Builder();
    builder.add(new Term("field", "3"), 0);
    builder.add(new Term("field", "9"), 0);
    q = builder.build();
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    // a multi-phrase query should succeed despite the non-existent term,
    // because another queried term matches at the same position.
    MultiPhraseQuery.Builder mqb = new MultiPhraseQuery.Builder();
    mqb.add(new Term[] { new Term("field", "3"), new Term("field", "9") }, 0);
    hits = searcher.search(mqb.build(), 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "2", "4");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "3", "5");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "4", "5");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(1, hits.length);
    q = new PhraseQuery("field", "2", "5");
    hits = searcher.search(q, 1000).scoreDocs;
    assertEquals(0, hits.length);
    reader.close();
    store.close();
}
Also used: Analyzer (org.apache.lucene.analysis.Analyzer), MockPayloadAnalyzer (org.apache.lucene.analysis.MockPayloadAnalyzer), Document (org.apache.lucene.document.Document), PostingsEnum (org.apache.lucene.index.PostingsEnum), Tokenizer (org.apache.lucene.analysis.Tokenizer), BytesRef (org.apache.lucene.util.BytesRef), Directory (org.apache.lucene.store.Directory), IOException (java.io.IOException), Term (org.apache.lucene.index.Term), PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute), CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute), OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute), IndexReader (org.apache.lucene.index.IndexReader), RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)
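For contrast with the explicit-position builders above, a sketch of the slop-based alternative: rather than pinning exact positions, a sloppy phrase query tolerates the one-position gap that the increment of 2 introduced between "1" and "2". The field name and searcher reuse the test's; the slop value is an assumption:

// Sloppy phrase as an alternative to explicit positions: with slop 1 the
// default consecutive-position query can bridge the one-position gap.
PhraseQuery.Builder b = new PhraseQuery.Builder();
b.add(new Term("field", "1"));
b.add(new Term("field", "2"));
b.setSlop(1);
ScoreDoc[] slopHits = searcher.search(b.build(), 1000).scoreDocs;
// given the increments above, this should match the single indexed document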

Aggregations

CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute): 144
TokenStream (org.apache.lucene.analysis.TokenStream): 88
StringReader (java.io.StringReader): 42
OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute): 33
PositionIncrementAttribute (org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute): 33
ArrayList (java.util.ArrayList): 26
Tokenizer (org.apache.lucene.analysis.Tokenizer): 25
IOException (java.io.IOException): 22
Analyzer (org.apache.lucene.analysis.Analyzer): 18
PayloadAttribute (org.apache.lucene.analysis.tokenattributes.PayloadAttribute): 16
BytesRef (org.apache.lucene.util.BytesRef): 15
TypeAttribute (org.apache.lucene.analysis.tokenattributes.TypeAttribute): 13
LinkedList (java.util.LinkedList): 11
FlagsAttribute (org.apache.lucene.analysis.tokenattributes.FlagsAttribute): 10
Term (org.apache.lucene.index.Term): 10
HashMap (java.util.HashMap): 8
Token (org.apache.lucene.analysis.Token): 8
Document (org.apache.lucene.document.Document): 8
List (java.util.List): 7
HashSet (java.util.HashSet): 6