Search in sources:

Example 36 with CannedTokenStream

use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.

From the class TestGraphTokenStreamFiniteStrings, the method testSingleGraphWithGap:

/**
 * Verifies finite-string expansion of a token graph that contains both a
 * position gap (a removed stop word) and a multi-position synonym.
 *
 * <p>Input models "hey the fast wifi network" with "the" removed: "fast"
 * carries posInc=2 (the hole), and "wifi" (posInc=0, posLength=2) spans the
 * two-token side path "wi"/"fi".
 */
public void testSingleGraphWithGap() throws Exception {
    // "hey the fast wifi network", where "the" removed
    TokenStream ts = new CannedTokenStream(token("hey", 1, 1), token("fast", 2, 1), token("wi", 1, 1), token("wifi", 0, 2), token("fi", 1, 1), token("network", 1, 1));
    GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(ts);
    // Full expansion: two paths, one per branch of the wi/fi vs wifi side path.
    // The hole before "fast" is preserved as posInc=2 in every path.
    Iterator<TokenStream> it = graph.getFiniteStrings();
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "hey", "fast", "wi", "fi", "network" }, new int[] { 1, 2, 1, 1, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "hey", "fast", "wifi", "network" }, new int[] { 1, 2, 1, 1 });
    assertFalse(it.hasNext());
    // Articulation points are the graph states every path must pass through.
    // JUnit convention is assertArrayEquals(expected, actual).
    int[] points = graph.articulationPoints();
    assertArrayEquals(new int[] { 1, 2, 4 }, points);
    // State 0 -> 1 ("hey"): single path, single term.
    assertFalse(graph.hasSidePath(0));
    it = graph.getFiniteStrings(0, 1);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "hey" }, new int[] { 1 });
    assertFalse(it.hasNext());
    Term[] terms = graph.getTerms("field", 0);
    assertArrayEquals(new Term[] { new Term("field", "hey") }, terms);
    // State 1 -> 2 ("fast"): single path; the hole shows up as posInc=2.
    assertFalse(graph.hasSidePath(1));
    it = graph.getFiniteStrings(1, 2);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast" }, new int[] { 2 });
    assertFalse(it.hasNext());
    terms = graph.getTerms("field", 1);
    assertArrayEquals(new Term[] { new Term("field", "fast") }, terms);
    // State 2 -> 4: the wi/fi vs wifi side path — two finite strings.
    assertTrue(graph.hasSidePath(2));
    it = graph.getFiniteStrings(2, 4);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "wi", "fi" }, new int[] { 1, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "wifi" }, new int[] { 1 });
    assertFalse(it.hasNext());
    // State 4 to the end ("network"): single path again.
    assertFalse(graph.hasSidePath(4));
    it = graph.getFiniteStrings(4, -1);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "network" }, new int[] { 1 });
    assertFalse(it.hasNext());
    terms = graph.getTerms("field", 4);
    assertArrayEquals(new Term[] { new Term("field", "network") }, terms);
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) Term(org.apache.lucene.index.Term) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream)

Example 37 with CannedTokenStream

use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.

From the class TestGraphTokenStreamFiniteStrings, the method testStackedGraphWithGap:

/**
 * Verifies finite-string expansion when a position gap is immediately followed
 * by stacked synonyms: "wi" carries posInc=2 (a removed word before it), and
 * both "wifi" and "wireless" stack on top of it (posInc=0, posLength=2),
 * producing three parallel side paths between the same two states.
 */
public void testStackedGraphWithGap() throws Exception {
    TokenStream ts = new CannedTokenStream(token("fast", 1, 1), token("wi", 2, 1), token("wifi", 0, 2), token("wireless", 0, 2), token("fi", 1, 1), token("network", 1, 1));
    GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(ts);
    // Full expansion: three paths, one per stacked alternative. The hole is
    // preserved as posInc=2 on the token that follows it in every path.
    Iterator<TokenStream> it = graph.getFiniteStrings();
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast", "wi", "fi", "network" }, new int[] { 1, 2, 1, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast", "wifi", "network" }, new int[] { 1, 2, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast", "wireless", "network" }, new int[] { 1, 2, 1 });
    assertFalse(it.hasNext());
    // JUnit convention is assertArrayEquals(expected, actual).
    int[] points = graph.articulationPoints();
    assertArrayEquals(new int[] { 1, 3 }, points);
    // State 0 -> 1 ("fast"): single path, single term.
    assertFalse(graph.hasSidePath(0));
    it = graph.getFiniteStrings(0, 1);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast" }, new int[] { 1 });
    assertFalse(it.hasNext());
    Term[] terms = graph.getTerms("field", 0);
    assertArrayEquals(new Term[] { new Term("field", "fast") }, terms);
    // State 1 -> 3: the three stacked alternatives — wi/fi, wifi, wireless.
    assertTrue(graph.hasSidePath(1));
    it = graph.getFiniteStrings(1, 3);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "wi", "fi" }, new int[] { 2, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "wifi" }, new int[] { 2 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "wireless" }, new int[] { 2 });
    assertFalse(it.hasNext());
    // State 3 to the end ("network"): single path again.
    assertFalse(graph.hasSidePath(3));
    it = graph.getFiniteStrings(3, -1);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "network" }, new int[] { 1 });
    assertFalse(it.hasNext());
    terms = graph.getTerms("field", 3);
    assertArrayEquals(new Term[] { new Term("field", "network") }, terms);
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) Term(org.apache.lucene.index.Term) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream)

Example 38 with CannedTokenStream

use of org.apache.lucene.analysis.CannedTokenStream in project lucene-solr by apache.

From the class TestGraphTokenStreamFiniteStrings, the method testGraphWithRegularSynonym:

/**
 * Verifies finite-string expansion when a single-position synonym
 * ("fast"/"speedy", posInc=0, posLength=1) is combined with a multi-position
 * synonym ("wifi" spanning "wi"/"fi"). The single-position synonym does NOT
 * create a side path — both terms sit between the same adjacent states — so
 * the cross product yields four full paths but only one true side path.
 */
public void testGraphWithRegularSynonym() throws Exception {
    TokenStream ts = new CannedTokenStream(token("fast", 1, 1), token("speedy", 0, 1), token("wi", 1, 1), token("wifi", 0, 2), token("fi", 1, 1), token("network", 1, 1));
    GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(ts);
    // Full expansion: 2 (fast|speedy) x 2 (wi fi|wifi) = 4 paths.
    Iterator<TokenStream> it = graph.getFiniteStrings();
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast", "wi", "fi", "network" }, new int[] { 1, 1, 1, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast", "wifi", "network" }, new int[] { 1, 1, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "speedy", "wi", "fi", "network" }, new int[] { 1, 1, 1, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "speedy", "wifi", "network" }, new int[] { 1, 1, 1 });
    assertFalse(it.hasNext());
    // JUnit convention is assertArrayEquals(expected, actual).
    int[] points = graph.articulationPoints();
    assertArrayEquals(new int[] { 1, 3 }, points);
    // State 0 -> 1: two terms but no side path — a plain synonym.
    assertFalse(graph.hasSidePath(0));
    it = graph.getFiniteStrings(0, 1);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast" }, new int[] { 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "speedy" }, new int[] { 1 });
    assertFalse(it.hasNext());
    Term[] terms = graph.getTerms("field", 0);
    assertArrayEquals(new Term[] { new Term("field", "fast"), new Term("field", "speedy") }, terms);
    // State 1 -> 3: the wi/fi vs wifi side path.
    assertTrue(graph.hasSidePath(1));
    it = graph.getFiniteStrings(1, 3);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "wi", "fi" }, new int[] { 1, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "wifi" }, new int[] { 1 });
    assertFalse(it.hasNext());
    // State 3 to the end ("network"): single path again.
    assertFalse(graph.hasSidePath(3));
    it = graph.getFiniteStrings(3, -1);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "network" }, new int[] { 1 });
    assertFalse(it.hasNext());
    terms = graph.getTerms("field", 3);
    assertArrayEquals(new Term[] { new Term("field", "network") }, terms);
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) Term(org.apache.lucene.index.Term) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream)

Example 39 with CannedTokenStream

use of org.apache.lucene.analysis.CannedTokenStream in project elasticsearch by elastic.

From the class FlattenGraphTokenFilterFactoryTests, the method testBasic:

/**
 * Checks that the FlattenGraphTokenFilterFactory squashes a token graph with
 * parallel side paths into a flat, index-safe stream: every token keeps a
 * valid start offset and the posLength of multi-position tokens is collapsed.
 */
public void testBasic() throws IOException {
    Index index = new Index("test", "_na_");
    String name = "ngr";
    Settings indexSettings = newAnalysisSettingsBuilder().build();
    IndexSettings indexProperties = IndexSettingsModule.newIndexSettings(index, indexSettings);
    // The filter itself takes no configuration here — defaults throughout.
    Settings settings = newAnalysisSettingsBuilder().build();
    // "wow that's funny" and "what the fudge" are separate side paths, in parallel with "wtf", on input:
    TokenStream in = new CannedTokenStream(0, 12, new Token[] { token("wtf", 1, 5, 0, 3), token("what", 0, 1, 0, 3), token("wow", 0, 3, 0, 3), token("the", 1, 1, 0, 3), token("fudge", 1, 3, 0, 3), token("that's", 1, 1, 0, 3), token("funny", 1, 1, 0, 3), token("happened", 1, 1, 4, 12) });
    TokenStream tokens = new FlattenGraphTokenFilterFactory(indexProperties, null, name, settings).create(in);
    // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
    // Arrays below: terms, start offsets, end offsets, posIncs, posLengths; 12 is the final offset.
    assertTokenStreamContents(tokens, new String[] { "wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened" }, new int[] { 0, 0, 0, 0, 0, 0, 0, 4 }, new int[] { 3, 3, 3, 3, 3, 3, 3, 12 }, new int[] { 1, 0, 0, 1, 0, 1, 0, 1 }, new int[] { 3, 1, 1, 1, 1, 1, 1, 1 }, 12);
}
Also used : CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) IndexSettings(org.elasticsearch.index.IndexSettings) Index(org.elasticsearch.index.Index) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) Settings(org.elasticsearch.common.settings.Settings) IndexSettings(org.elasticsearch.index.IndexSettings)

Example 40 with CannedTokenStream

use of org.apache.lucene.analysis.CannedTokenStream in project elasticsearch by elastic.

From the class TokenCountFieldMapperTests, the method testCountPositions:

/**
 * Checks that {@code TokenCountFieldMapper.countPositions} sums position
 * increments (plus the stream's final increment) rather than counting tokens,
 * so zero-increment tokens (synonyms) are not counted and holes count extra.
 */
public void testCountPositions() throws IOException {
    // We're looking to make sure that we:
    // Don't count tokens without an increment
    Token t1 = new Token();
    t1.setPositionIncrement(0);
    Token t2 = new Token();
    // Count normal tokens with one increment
    t2.setPositionIncrement(1);
    Token t3 = new Token();
    // Count funny tokens with more than one increment
    // BUG FIX: this previously re-set t2 (leaving t3 at its default increment
    // of 1); the expected total of 7 was unchanged only by coincidence
    // (0 + 2 + 1 + 4 == 0 + 1 + 2 + 4).
    t3.setPositionIncrement(2);
    // Count the final token increment on the rare token streams that have them
    int finalTokenIncrement = 4;
    Token[] tokens = new Token[] { t1, t2, t3 };
    // Shuffle so the result provably does not depend on token order.
    Collections.shuffle(Arrays.asList(tokens), random());
    final TokenStream tokenStream = new CannedTokenStream(finalTokenIncrement, 0, tokens);
    // TODO: we have no CannedAnalyzer?
    Analyzer analyzer = new Analyzer() {

        @Override
        public TokenStreamComponents createComponents(String fieldName) {
            return new TokenStreamComponents(new MockTokenizer(), tokenStream);
        }
    };
    // Expected: 0 + 1 + 2 + 4 = 7.
    assertThat(TokenCountFieldMapper.countPositions(analyzer, "", ""), equalTo(7));
}
Also used : MockTokenizer(org.apache.lucene.analysis.MockTokenizer) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) TokenStream(org.apache.lucene.analysis.TokenStream) Token(org.apache.lucene.analysis.Token) Matchers.containsString(org.hamcrest.Matchers.containsString) CannedTokenStream(org.apache.lucene.analysis.CannedTokenStream) Analyzer(org.apache.lucene.analysis.Analyzer)

Aggregations

CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream)78 TokenStream (org.apache.lucene.analysis.TokenStream)43 Token (org.apache.lucene.analysis.Token)37 Directory (org.apache.lucene.store.Directory)33 Document (org.apache.lucene.document.Document)26 TextField (org.apache.lucene.document.TextField)22 Field (org.apache.lucene.document.Field)15 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)14 BytesRef (org.apache.lucene.util.BytesRef)14 FieldType (org.apache.lucene.document.FieldType)13 Term (org.apache.lucene.index.Term)13 MockAnalyzer (org.apache.lucene.analysis.MockAnalyzer)11 Tokenizer (org.apache.lucene.analysis.Tokenizer)11 IndexReader (org.apache.lucene.index.IndexReader)10 Analyzer (org.apache.lucene.analysis.Analyzer)9 StringField (org.apache.lucene.document.StringField)8 RandomIndexWriter (org.apache.lucene.index.RandomIndexWriter)8 Reader (java.io.Reader)7 StringReader (java.io.StringReader)7 Input (org.apache.lucene.search.suggest.Input)7