Search in sources :

Example 81 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

the class TestGraphTokenStreamFiniteStrings method testGraphAndGapSameToken.

/**
 * Checks the finite strings, articulation points, side paths and terms that
 * {@code GraphTokenStreamFiniteStrings} reports for a canned stream in which the
 * token "wi" is given a second argument of 2 and is followed by a stacked "wifi"
 * token.
 *
 * <p>NOTE(review): {@code token(...)} is defined outside this method; its arguments
 * are presumably (term, positionIncrement, positionLength) — confirm against the helper.
 */
public void testGraphAndGapSameToken() throws Exception {
    TokenStream ts = new CannedTokenStream(
        token("fast", 1, 1),
        token("wi", 2, 1),
        token("wifi", 0, 2),
        token("fi", 1, 1),
        token("network", 1, 1));
    GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(ts);

    // Whole graph: two complete paths are expected, in this order.
    Iterator<TokenStream> it = graph.getFiniteStrings();
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast", "wi", "fi", "network" }, new int[] { 1, 2, 1, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast", "wifi", "network" }, new int[] { 1, 2, 1 });
    assertFalse(it.hasNext());

    // JUnit's contract is (expected, actual); keeping that order makes failure messages accurate.
    int[] points = graph.articulationPoints();
    assertArrayEquals(new int[] { 1, 3 }, points);

    // Segment [0, 1): single token, no side path.
    assertFalse(graph.hasSidePath(0));
    it = graph.getFiniteStrings(0, 1);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast" }, new int[] { 1 });
    assertFalse(it.hasNext());
    Term[] terms = graph.getTerms("field", 0);
    assertArrayEquals(new Term[] { new Term("field", "fast") }, terms);

    // Segment [1, 3): "wi fi" and "wifi" are alternative paths.
    assertTrue(graph.hasSidePath(1));
    it = graph.getFiniteStrings(1, 3);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "wi", "fi" }, new int[] { 2, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "wifi" }, new int[] { 2 });
    assertFalse(it.hasNext());

    // Segment starting at 3, requested with an end state of -1.
    assertFalse(graph.hasSidePath(3));
    it = graph.getFiniteStrings(3, -1);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "network" }, new int[] { 1 });
    assertFalse(it.hasNext());
    terms = graph.getTerms("field", 3);
    assertArrayEquals(new Term[] { new Term("field", "network") }, terms);
}
Also used: CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), TokenStream (org.apache.lucene.analysis.TokenStream), Term (org.apache.lucene.index.Term)

Example 82 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

the class TestGraphTokenStreamFiniteStrings method testMultiGraph.

/**
 * Checks the finite strings, articulation points, side paths and terms that
 * {@code GraphTokenStreamFiniteStrings} reports for a canned stream containing two
 * consecutive groups of alternatives: "turbo charged" / "fast" followed by
 * "wi fi" / "wifi".
 *
 * <p>NOTE(review): {@code token(...)} is defined outside this method; its arguments
 * are presumably (term, positionIncrement, positionLength) — confirm against the helper.
 */
public void testMultiGraph() throws Exception {
    TokenStream ts = new CannedTokenStream(
        token("turbo", 1, 1),
        token("fast", 0, 2),
        token("charged", 1, 1),
        token("wi", 1, 1),
        token("wifi", 0, 2),
        token("fi", 1, 1),
        token("network", 1, 1));
    GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(ts);

    // Whole graph: four complete paths are expected, in this order.
    Iterator<TokenStream> it = graph.getFiniteStrings();
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "turbo", "charged", "wi", "fi", "network" }, new int[] { 1, 1, 1, 1, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "turbo", "charged", "wifi", "network" }, new int[] { 1, 1, 1, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast", "wi", "fi", "network" }, new int[] { 1, 1, 1, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast", "wifi", "network" }, new int[] { 1, 1, 1 });
    assertFalse(it.hasNext());

    // JUnit's contract is (expected, actual); keeping that order makes failure messages accurate.
    int[] points = graph.articulationPoints();
    assertArrayEquals(new int[] { 2, 4 }, points);

    // Segment [0, 2): "turbo charged" and "fast" are alternative paths.
    assertTrue(graph.hasSidePath(0));
    it = graph.getFiniteStrings(0, 2);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "turbo", "charged" }, new int[] { 1, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast" }, new int[] { 1 });
    assertFalse(it.hasNext());

    // Segment [2, 4): "wi fi" and "wifi" are alternative paths.
    assertTrue(graph.hasSidePath(2));
    it = graph.getFiniteStrings(2, 4);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "wi", "fi" }, new int[] { 1, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "wifi" }, new int[] { 1 });
    assertFalse(it.hasNext());

    // Segment starting at 4, requested with an end state of -1.
    assertFalse(graph.hasSidePath(4));
    it = graph.getFiniteStrings(4, -1);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "network" }, new int[] { 1 });
    assertFalse(it.hasNext());
    Term[] terms = graph.getTerms("field", 4);
    assertArrayEquals(new Term[] { new Term("field", "network") }, terms);
}
Also used: CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), TokenStream (org.apache.lucene.analysis.TokenStream), Term (org.apache.lucene.index.Term)

Example 83 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

the class TestGraphTokenStreamFiniteStrings method testSingleGraphWithGap.

/**
 * Checks the finite strings, articulation points, side paths and terms that
 * {@code GraphTokenStreamFiniteStrings} reports for "hey the fast wifi network"
 * with "the" removed, where "wi fi" / "wifi" form the only group of alternatives.
 *
 * <p>NOTE(review): {@code token(...)} is defined outside this method; its arguments
 * are presumably (term, positionIncrement, positionLength) — confirm against the helper.
 */
public void testSingleGraphWithGap() throws Exception {
    // "hey the fast wifi network", where "the" removed
    TokenStream ts = new CannedTokenStream(
        token("hey", 1, 1),
        token("fast", 2, 1),
        token("wi", 1, 1),
        token("wifi", 0, 2),
        token("fi", 1, 1),
        token("network", 1, 1));
    GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(ts);

    // Whole graph: two complete paths are expected, in this order.
    Iterator<TokenStream> it = graph.getFiniteStrings();
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "hey", "fast", "wi", "fi", "network" }, new int[] { 1, 2, 1, 1, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "hey", "fast", "wifi", "network" }, new int[] { 1, 2, 1, 1 });
    assertFalse(it.hasNext());

    // JUnit's contract is (expected, actual); keeping that order makes failure messages accurate.
    int[] points = graph.articulationPoints();
    assertArrayEquals(new int[] { 1, 2, 4 }, points);

    // Segment [0, 1): single token, no side path.
    assertFalse(graph.hasSidePath(0));
    it = graph.getFiniteStrings(0, 1);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "hey" }, new int[] { 1 });
    assertFalse(it.hasNext());
    Term[] terms = graph.getTerms("field", 0);
    assertArrayEquals(new Term[] { new Term("field", "hey") }, terms);

    // Segment [1, 2): single token, expected with a value of 2 in the int array.
    assertFalse(graph.hasSidePath(1));
    it = graph.getFiniteStrings(1, 2);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast" }, new int[] { 2 });
    assertFalse(it.hasNext());
    terms = graph.getTerms("field", 1);
    assertArrayEquals(new Term[] { new Term("field", "fast") }, terms);

    // Segment [2, 4): "wi fi" and "wifi" are alternative paths.
    assertTrue(graph.hasSidePath(2));
    it = graph.getFiniteStrings(2, 4);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "wi", "fi" }, new int[] { 1, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "wifi" }, new int[] { 1 });
    assertFalse(it.hasNext());

    // Segment starting at 4, requested with an end state of -1.
    assertFalse(graph.hasSidePath(4));
    it = graph.getFiniteStrings(4, -1);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "network" }, new int[] { 1 });
    assertFalse(it.hasNext());
    terms = graph.getTerms("field", 4);
    assertArrayEquals(new Term[] { new Term("field", "network") }, terms);
}
Also used: CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), TokenStream (org.apache.lucene.analysis.TokenStream), Term (org.apache.lucene.index.Term)

Example 84 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

the class TestGraphTokenStreamFiniteStrings method testStackedGraphWithGap.

/**
 * Checks the finite strings, articulation points, side paths and terms that
 * {@code GraphTokenStreamFiniteStrings} reports for a canned stream in which "wi"
 * is given a second argument of 2 and is followed by two stacked tokens,
 * "wifi" and "wireless".
 *
 * <p>NOTE(review): {@code token(...)} is defined outside this method; its arguments
 * are presumably (term, positionIncrement, positionLength) — confirm against the helper.
 */
public void testStackedGraphWithGap() throws Exception {
    TokenStream ts = new CannedTokenStream(
        token("fast", 1, 1),
        token("wi", 2, 1),
        token("wifi", 0, 2),
        token("wireless", 0, 2),
        token("fi", 1, 1),
        token("network", 1, 1));
    GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(ts);

    // Whole graph: three complete paths are expected, in this order.
    Iterator<TokenStream> it = graph.getFiniteStrings();
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast", "wi", "fi", "network" }, new int[] { 1, 2, 1, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast", "wifi", "network" }, new int[] { 1, 2, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast", "wireless", "network" }, new int[] { 1, 2, 1 });
    assertFalse(it.hasNext());

    // JUnit's contract is (expected, actual); keeping that order makes failure messages accurate.
    int[] points = graph.articulationPoints();
    assertArrayEquals(new int[] { 1, 3 }, points);

    // Segment [0, 1): single token, no side path.
    assertFalse(graph.hasSidePath(0));
    it = graph.getFiniteStrings(0, 1);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast" }, new int[] { 1 });
    assertFalse(it.hasNext());
    Term[] terms = graph.getTerms("field", 0);
    assertArrayEquals(new Term[] { new Term("field", "fast") }, terms);

    // Segment [1, 3): "wi fi", "wifi" and "wireless" are alternative paths.
    assertTrue(graph.hasSidePath(1));
    it = graph.getFiniteStrings(1, 3);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "wi", "fi" }, new int[] { 2, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "wifi" }, new int[] { 2 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "wireless" }, new int[] { 2 });
    assertFalse(it.hasNext());

    // Segment starting at 3, requested with an end state of -1.
    assertFalse(graph.hasSidePath(3));
    it = graph.getFiniteStrings(3, -1);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "network" }, new int[] { 1 });
    assertFalse(it.hasNext());
    terms = graph.getTerms("field", 3);
    assertArrayEquals(new Term[] { new Term("field", "network") }, terms);
}
Also used: CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), TokenStream (org.apache.lucene.analysis.TokenStream), Term (org.apache.lucene.index.Term)

Example 85 with TokenStream

use of org.apache.lucene.analysis.TokenStream in project lucene-solr by apache.

the class TestGraphTokenStreamFiniteStrings method testGraphWithRegularSynonym.

/**
 * Checks the finite strings, articulation points, side paths and terms that
 * {@code GraphTokenStreamFiniteStrings} reports for a canned stream that combines
 * a stacked single-token alternative ("fast" / "speedy") with the "wi fi" / "wifi"
 * alternatives.
 *
 * <p>NOTE(review): {@code token(...)} is defined outside this method; its arguments
 * are presumably (term, positionIncrement, positionLength) — confirm against the helper.
 */
public void testGraphWithRegularSynonym() throws Exception {
    TokenStream ts = new CannedTokenStream(
        token("fast", 1, 1),
        token("speedy", 0, 1),
        token("wi", 1, 1),
        token("wifi", 0, 2),
        token("fi", 1, 1),
        token("network", 1, 1));
    GraphTokenStreamFiniteStrings graph = new GraphTokenStreamFiniteStrings(ts);

    // Whole graph: four complete paths are expected, in this order.
    Iterator<TokenStream> it = graph.getFiniteStrings();
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast", "wi", "fi", "network" }, new int[] { 1, 1, 1, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast", "wifi", "network" }, new int[] { 1, 1, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "speedy", "wi", "fi", "network" }, new int[] { 1, 1, 1, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "speedy", "wifi", "network" }, new int[] { 1, 1, 1 });
    assertFalse(it.hasNext());

    // JUnit's contract is (expected, actual); keeping that order makes failure messages accurate.
    int[] points = graph.articulationPoints();
    assertArrayEquals(new int[] { 1, 3 }, points);

    // Segment [0, 1): two single-token alternatives, which the test expects not to count as a side path.
    assertFalse(graph.hasSidePath(0));
    it = graph.getFiniteStrings(0, 1);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "fast" }, new int[] { 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "speedy" }, new int[] { 1 });
    assertFalse(it.hasNext());
    Term[] terms = graph.getTerms("field", 0);
    assertArrayEquals(new Term[] { new Term("field", "fast"), new Term("field", "speedy") }, terms);

    // Segment [1, 3): "wi fi" and "wifi" are alternative paths.
    assertTrue(graph.hasSidePath(1));
    it = graph.getFiniteStrings(1, 3);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "wi", "fi" }, new int[] { 1, 1 });
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "wifi" }, new int[] { 1 });
    assertFalse(it.hasNext());

    // Segment starting at 3, requested with an end state of -1.
    assertFalse(graph.hasSidePath(3));
    it = graph.getFiniteStrings(3, -1);
    assertTrue(it.hasNext());
    assertTokenStream(it.next(), new String[] { "network" }, new int[] { 1 });
    assertFalse(it.hasNext());
    terms = graph.getTerms("field", 3);
    assertArrayEquals(new Term[] { new Term("field", "network") }, terms);
}
Also used: CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream), TokenStream (org.apache.lucene.analysis.TokenStream), Term (org.apache.lucene.index.Term)

Aggregations

TokenStream (org.apache.lucene.analysis.TokenStream)849 StringReader (java.io.StringReader)337 Tokenizer (org.apache.lucene.analysis.Tokenizer)244 Reader (java.io.Reader)175 CharTermAttribute (org.apache.lucene.analysis.tokenattributes.CharTermAttribute)141 MockTokenizer (org.apache.lucene.analysis.MockTokenizer)128 Analyzer (org.apache.lucene.analysis.Analyzer)121 CannedTokenStream (org.apache.lucene.analysis.CannedTokenStream)94 LowerCaseFilter (org.apache.lucene.analysis.LowerCaseFilter)88 IOException (java.io.IOException)86 StandardFilter (org.apache.lucene.analysis.standard.StandardFilter)73 Term (org.apache.lucene.index.Term)66 Document (org.apache.lucene.document.Document)64 ArrayList (java.util.ArrayList)59 StandardTokenizer (org.apache.lucene.analysis.standard.StandardTokenizer)59 StopFilter (org.apache.lucene.analysis.StopFilter)58 KeywordTokenizer (org.apache.lucene.analysis.core.KeywordTokenizer)57 SetKeywordMarkerFilter (org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter)53 Test (org.junit.Test)53 OffsetAttribute (org.apache.lucene.analysis.tokenattributes.OffsetAttribute)47